From 2928fbb35d5c7a7a0a0101f628ff13c4177a01b4 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Thu, 24 Aug 2023 05:20:31 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 75499 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 75894 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..b2b5840d --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-08-16T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.15780v2","updated":"2023-08-16T17:59:07Z","published":"2023-07-24T18:47:38Z","title":"LLM-Rec: Personalized Recommendation via Prompting Large Language Models","summary":" We investigate various prompting strategies for enhancing personalized\nrecommendation performance with large language models (LLMs) through input\naugmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct\nprompting strategies: (1) basic prompting, (2) recommendation-driven prompting,\n(3) engagement-guided prompting, and (4) recommendation-driven +\nengagement-guided prompting. Our empirical experiments show that incorporating\nthe augmented input text generated by LLM leads to improved recommendation\nperformance. Recommendation-driven and engagement-guided prompting strategies\nare found to elicit LLM's understanding of global and local item\ncharacteristics. This finding highlights the importance of leveraging diverse\nprompts and input augmentation techniques to enhance the recommendation\ncapabilities with LLMs.\n","authors":["Hanjia Lyu","Song Jiang","Hanqing Zeng","Qifan Wang","Si Zhang","Ren Chen","Chris Leung","Jiajie Tang","Yinglong Xia","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2307.15780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04306v2","updated":"2023-08-16T17:44:59Z","published":"2023-06-07T10:11:09Z","title":"Allophant: Cross-lingual Phoneme Recognition with Articulatory\n Attributes","summary":" This paper proposes Allophant, a multilingual phoneme recognizer. It requires\nonly a phoneme inventory for cross-lingual transfer to a target language,\nallowing for low-resource recognition. The architecture combines a\ncompositional phone embedding approach with individually supervised phonetic\nattribute classifiers in a multi-task architecture. We also introduce\nAllophoible, an extension of the PHOIBLE database. When combined with a\ndistance based mapping approach for grapheme-to-phoneme outputs, it allows us\nto train on PHOIBLE inventories directly. By training and evaluating on 34\nlanguages, we found that the addition of multi-task learning improves the\nmodel's capability of being applied to unseen phonemes and phoneme inventories.\nOn supervised languages we achieve phoneme error rate improvements of 11\npercentage points (pp.) compared to a baseline without multi-task learning.\nEvaluation of zero-shot transfer on 84 languages yielded a decrease in PER of\n2.63 pp. 
over the baseline.\n","authors":["Kevin Glocker","Aaricia Herygers","Munir Georges"],"pdf_url":"https://arxiv.org/pdf/2306.04306v2.pdf","comment":"5 pages, 2 figures, 2 tables, accepted to INTERSPEECH 2023; published\n version"},{"id":"http://arxiv.org/abs/2308.08493v1","updated":"2023-08-16T16:48:57Z","published":"2023-08-16T16:48:57Z","title":"Time Travel in LLMs: Tracing Data Contamination in Large Language Models","summary":" Data contamination, i.e., the presence of test data from downstream tasks in\nthe training data of large language models (LLMs), is a potential major issue\nin understanding LLMs' effectiveness on other tasks. We propose a\nstraightforward yet effective method for identifying data contamination within\nLLMs. At its core, our approach starts by identifying potential contamination\nin individual instances that are drawn from a small random sample; using this\ninformation, our approach then assesses if an entire dataset partition is\ncontaminated. To estimate contamination of individual instances, we employ\n\"guided instruction:\" a prompt consisting of the dataset name, partition type,\nand the initial segment of a reference instance, asking the LLM to complete it.\nAn instance is flagged as contaminated if the LLM's output either exactly or\nclosely matches the latter segment of the reference. To understand if an entire\npartition is contaminated, we propose two ideas. The first idea marks a dataset\npartition as contaminated if the average overlap score with the reference\ninstances (as measured by ROUGE or BLEURT) is statistically significantly\nbetter with the guided instruction vs. a general instruction that does not\ninclude the dataset and partition name. The second idea marks a dataset as\ncontaminated if a classifier based on GPT-4 with in-context learning prompting\nmarks multiple instances as contaminated. Our best method achieves an accuracy\nbetween 92% and 100% in detecting if an LLM is contaminated with seven\ndatasets, containing train and test/validation partitions, when contrasted with\nmanual evaluation by human expert. Further, our findings indicate that GPT-4 is\ncontaminated with AG News, WNLI, and XSum datasets.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2308.08493v1.pdf","comment":"v1 preprint"},{"id":"http://arxiv.org/abs/2304.01752v2","updated":"2023-08-16T15:54:54Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaption\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. 
To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.01102v2","updated":"2023-08-16T15:49:48Z","published":"2023-06-01T19:33:21Z","title":"LLMatic: Neural Architecture Search via Large Language Models and\n Quality-Diversity Optimization","summary":" Large Language Models (LLMs) have emerged as powerful tools capable of\naccomplishing a broad spectrum of tasks. Their abilities span numerous areas,\nand one area where they have made a significant impact is in the domain of code\ngeneration. In this context, we view LLMs as mutation and crossover tools.\nMeanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and\nrobust solutions. By merging the code-generating abilities of LLMs with the\ndiversity and robustness of QD solutions, we introduce LLMatic, a Neural\nArchitecture Search (NAS) algorithm. While LLMs struggle to conduct NAS\ndirectly through prompts, LLMatic uses a procedural approach, leveraging QD for\nprompts and network architecture to create diverse and highly performant\nnetworks. We test LLMatic on the CIFAR-10 image classification benchmark,\ndemonstrating that it can produce competitive networks with just $2,000$\nsearches, even without prior knowledge of the benchmark domain or exposure to\nany previous top-performing models for the benchmark.\n","authors":["Muhammad U. Nasir","Sam Earle","Julian Togelius","Steven James","Christopher Cleghorn"],"pdf_url":"https://arxiv.org/pdf/2306.01102v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08442v1","updated":"2023-08-16T15:49:36Z","published":"2023-08-16T15:49:36Z","title":"Mitigating the Exposure Bias in Sentence-Level Grapheme-to-Phoneme (G2P)\n Transduction","summary":" Text-to-Text Transfer Transformer (T5) has recently been considered for the\nGrapheme-to-Phoneme (G2P) transduction. As a follow-up, a tokenizer-free\nbyte-level model based on T5 referred to as ByT5, recently gave promising\nresults on word-level G2P conversion by representing each input character with\nits corresponding UTF-8 encoding. Although it is generally understood that\nsentence-level or paragraph-level G2P can improve usability in real-world\napplications as it is better suited to perform on heteronyms and linking sounds\nbetween words, we find that using ByT5 for these scenarios is nontrivial. Since\nByT5 operates on the character level, it requires longer decoding steps, which\ndeteriorates the performance due to the exposure bias commonly observed in\nauto-regressive generation models. 
This paper shows that the performance of\nsentence-level and paragraph-level G2P can be improved by mitigating such\nexposure bias using our proposed loss-based sampling method.\n","authors":["Eunseop Yoon","Hee Suk Yoon","Dhananjaya Gowda","SooHwan Eom","Daehyeok Kim","John Harvill","Heting Gao","Mark Hasegawa-Johnson","Chanwoo Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2308.08442v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.08413v1","updated":"2023-08-16T14:58:12Z","published":"2023-08-16T14:58:12Z","title":"Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value\n Extraction","summary":" Existing attribute-value extraction (AVE) models require large quantities of\nlabeled data for training. However, new products with new attribute-value pairs\nenter the market every day in real-world e-Commerce. Thus, we formulate AVE in\nmulti-label few-shot learning (FSL), aiming to extract unseen attribute value\npairs based on a small number of training examples. We propose a\nKnowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks,\nleveraging the generated label description and category information to learn\nmore discriminative prototypes. Besides, KEAF integrates with hybrid attention\nto reduce noise and capture more informative semantics for each class by\ncalculating the label-relevant and query-related weights. To achieve\nmulti-label inference, KEAF further learns a dynamic threshold by integrating\nthe semantic information from both the support set and the query set. Extensive\nexperiments with ablation studies conducted on two datasets demonstrate that\nKEAF outperforms other SOTA models for information extraction in FSL. The code\ncan be found at: https://github.com/gjiaying/KEAF\n","authors":["Jiaying Gong","Wei-Te Chen","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2308.08413v1.pdf","comment":"6 pages, 2 figures, published in CIKM 2023"},{"id":"http://arxiv.org/abs/2307.07889v2","updated":"2023-08-16T14:55:35Z","published":"2023-07-15T22:02:12Z","title":"LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise\n Comparisons using Large Language Models","summary":" Current developments in large language models (LLMs) have enabled impressive\nzero-shot capabilities across various natural language tasks. An interesting\napplication of these systems is in the automated assessment of natural language\ngeneration (NLG), a highly challenging area with great practical benefit. In\nthis paper, we explore two options for exploiting the emergent abilities of\nLLMs for zero-shot NLG assessment: absolute score prediction, and comparative\nassessment which uses relative comparisons between pairs of candidates. Though\ncomparative assessment has not been extensively studied in NLG assessment, we\nnote that humans often find it more intuitive to compare two options rather\nthan scoring each one independently. This work examines comparative assessment\nfrom multiple perspectives: performance compared to absolute grading;\npositional biases in the prompt; and efficient ranking in terms of the number\nof comparisons. We illustrate that LLM comparative assessment is a simple,\ngeneral and effective approach for NLG assessment. For moderate-sized\nopen-source LLMs, such as FlanT5 and Llama2-chat, comparative assessment is\nsuperior to prompt scoring, and in many cases can achieve performance\ncompetitive with state-of-the-art methods. 
Additionally, we demonstrate that\nLLMs often exhibit strong positional biases when making pairwise comparisons,\nand we propose debiasing methods that can further improve performance.\n","authors":["Adian Liusie","Potsawee Manakul","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2307.07889v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2307.11787v2","updated":"2023-08-16T14:03:03Z","published":"2023-07-20T16:22:36Z","title":"LLM Cognitive Judgements Differ From Human","summary":" Large Language Models (LLMs) have lately been on the spotlight of\nresearchers, businesses, and consumers alike. While the linguistic capabilities\nof such models have been studied extensively, there is growing interest in\ninvestigating them as cognitive subjects. In the present work I examine GPT-3\nand ChatGPT capabilities on an limited-data inductive reasoning task from the\ncognitive science literature. The results suggest that these models' cognitive\njudgements are not human-like.\n","authors":["Sotiris Lamprinidis"],"pdf_url":"https://arxiv.org/pdf/2307.11787v2.pdf","comment":"7 pages, 1 figure. License changed to CC BY-NC-SA"},{"id":"http://arxiv.org/abs/2308.08378v1","updated":"2023-08-16T14:01:25Z","published":"2023-08-16T14:01:25Z","title":"Advancing continual lifelong learning in neural information retrieval:\n definition, dataset, framework, and empirical evaluation","summary":" Continual learning refers to the capability of a machine learning model to\nlearn and adapt to new information, without compromising its performance on\npreviously learned tasks. Although several studies have investigated continual\nlearning methods for information retrieval tasks, a well-defined task\nformulation is still lacking, and it is unclear how typical learning strategies\nperform in this context. To address this challenge, a systematic task\nformulation of continual neural information retrieval is presented, along with\na multiple-topic dataset that simulates continuous information retrieval. A\ncomprehensive continual neural information retrieval framework consisting of\ntypical retrieval models and continual learning strategies is then proposed.\nEmpirical evaluations illustrate that the proposed framework can successfully\nprevent catastrophic forgetting in neural information retrieval and enhance\nperformance on previously learned tasks. The results indicate that\nembedding-based retrieval models experience a decline in their continual\nlearning performance as the topic shift distance and dataset volume of new\ntasks increase. In contrast, pretraining-based models do not show any such\ncorrelation. Adopting suitable learning strategies can mitigate the effects of\ntopic shift and data augmentation.\n","authors":["Jingrui Hou","Georgina Cosma","Axel Finke"],"pdf_url":"https://arxiv.org/pdf/2308.08378v1.pdf","comment":"Submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2308.08363v1","updated":"2023-08-16T13:39:06Z","published":"2023-08-16T13:39:06Z","title":"SummHelper: Collaborative Human-Computer Summarization","summary":" Current approaches for text summarization are predominantly automatic, with\nrather limited space for human intervention and control over the process. In\nthis paper, we introduce SummHelper, a 2-phase summarization assistant designed\nto foster human-machine collaboration. The initial phase involves content\nselection, where the system recommends potential content, allowing users to\naccept, modify, or introduce additional selections. 
The subsequent phase,\ncontent consolidation, involves SummHelper generating a coherent summary from\nthese selections, which users can then refine using visual mappings between the\nsummary and the source text. Small-scale user studies reveal the effectiveness\nof our application, with participants being especially appreciative of the\nbalance between automated guidance and opportunities for personal input.\n","authors":["Aviv Slobodkin","Niv Nachum","Shmuel Amar","Ori Shapira","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2308.08363v1.pdf","comment":"Demo paper"},{"id":"http://arxiv.org/abs/2305.09781v2","updated":"2023-08-16T13:33:06Z","published":"2023-05-16T20:12:59Z","title":"SpecInfer: Accelerating Generative Large Language Model Serving with\n Speculative Inference and Token Tree Verification","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them quickly and cheaply. This paper\nintroduces SpecInfer, an LLM serving system that accelerates generative LLM\ninference with speculative inference and token tree verification. A key insight\nbehind Specinfer is to combine various collectively boost-tuned small language\nmodels to jointly predict the LLM's outputs; the predictions are organized as a\ntoken tree, whose nodes each represent a candidate token sequence. The\ncorrectness of all candidate token sequences represented by a token tree is\nverified against the LLM in parallel using a novel tree-based parallel decoding\nmechanism. SpecInfer uses an LLM as a token tree verifier instead of an\nincremental decoder, which significantly reduces the end-to-end latency and\ncomputational requirement for serving generative LLMs while provably preserving\nmodel quality. Our evaluation shows that SpecInfer outperforms existing LLM\nserving systems by 1.3-2.4x for distributed LLM inference and by 2.6-3.5x for\noffloading-based LLM inference, while preserving the same generative\nperformance. SpecInfer is publicly available at\nhttps://github.com/flexflow/FlexFlow/tree/inference.\n","authors":["Xupeng Miao","Gabriele Oliaro","Zhihao Zhang","Xinhao Cheng","Zeyu Wang","Rae Ying Yee Wong","Alan Zhu","Lijie Yang","Xiaoxiang Shi","Chunan Shi","Zhuoming Chen","Daiyaan Arfeen","Reyna Abhyankar","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2305.09781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03453v3","updated":"2023-08-16T12:50:46Z","published":"2023-05-05T11:56:30Z","title":"T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large\n Language Model Signals for Science Question Answering","summary":" Large Language Models (LLMs) have recently demonstrated exceptional\nperformance in various Natural Language Processing (NLP) tasks. They have also\nshown the ability to perform chain-of-thought (CoT) reasoning to solve complex\nproblems. Recent studies have explored CoT reasoning in complex multimodal\nscenarios, such as the science question answering task, by fine-tuning\nmultimodal models with high-quality human-annotated CoT rationales. However,\ncollecting high-quality COT rationales is usually time-consuming and costly.\nBesides, the annotated rationales are hardly accurate due to the external\nessential information missed. To address these issues, we propose a novel\nmethod termed \\emph{T-SciQ} that aims at teaching science question answering\nwith LLM signals. 
The T-SciQ approach generates high-quality CoT rationales as\nteaching signals and is advanced to train much smaller models to perform CoT\nreasoning in complex modalities. Additionally, we introduce a novel data mixing\nstrategy to produce more effective teaching data samples by policy for simple\nand complex science question answer problems. Extensive experimental results\nshow that our T-SciQ method achieves a new state-of-the-art performance on the\nScienceQA benchmark, with an accuracy of 96.18\\%. Moreover, our approach\noutperforms the most powerful fine-tuned baseline by 4.5\\%.\n","authors":["Lei Wang","Yi Hu","Jiabang He","Xing Xu","Ning Liu","Hui Liu","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2305.03453v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14036v2","updated":"2023-08-16T12:03:47Z","published":"2023-02-27T18:47:55Z","title":"Text-only domain adaptation for end-to-end ASR using integrated\n text-to-mel-spectrogram generator","summary":" We propose an end-to-end Automatic Speech Recognition (ASR) system that can\nbe trained on transcribed speech data, text-only data, or a mixture of both.\nThe proposed model uses an integrated auxiliary block for text-based training.\nThis block combines a non-autoregressive multi-speaker text-to-mel-spectrogram\ngenerator with a GAN-based enhancer to improve the spectrogram quality. The\nproposed system can generate a mel-spectrogram dynamically during training. It\ncan be used to adapt the ASR model to a new domain by using text-only data from\nthis domain. We demonstrate that the proposed training method significantly\nimproves ASR accuracy compared to the system trained on transcribed speech\nonly. It also surpasses cascade TTS systems with the vocoder in the adaptation\nquality and training speed.\n","authors":["Vladimir Bataev","Roman Korostik","Evgeny Shabalin","Vitaly Lavrukhin","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2302.14036v2.pdf","comment":"Accepted to INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2304.01622v2","updated":"2023-08-16T11:58:38Z","published":"2023-04-04T08:25:12Z","title":"An interpretability framework for Similar case matching","summary":" Similar Case Matching (SCM) plays a pivotal role in the legal system by\nfacilitating the efficient identification of similar cases for legal\nprofessionals. While previous research has primarily concentrated on enhancing\nthe performance of SCM models, the aspect of interpretability has been\nneglected. To bridge the gap, this study proposes an integrated pipeline\nframework for interpretable SCM. The framework comprises four modules: judicial\nfeature sentence identification, case matching, feature sentence alignment, and\nconflict resolution. In contrast to current SCM methods, our framework first\nextracts feature sentences within a legal case that contain essential\ninformation. Then it conducts case matching based on these extracted features.\nSubsequently, our framework aligns the corresponding sentences in two legal\ncases to provide evidence of similarity. In instances where the results of case\nmatching and feature sentence alignment exhibit conflicts, the conflict\nresolution module resolves these inconsistencies. 
The experimental results show\nthe effectiveness of our proposed framework, establishing a new benchmark for\ninterpretable SCM.\n","authors":["Nankai Lin","Haonan Liu","Jiajun Fang","Dong Zhou","Aimin Yang"],"pdf_url":"https://arxiv.org/pdf/2304.01622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08295v1","updated":"2023-08-16T11:50:38Z","published":"2023-08-16T11:50:38Z","title":"Detoxify Language Model Step-by-Step","summary":" Detoxification for LLMs is challenging since it requires models to avoid\ngenerating harmful content while maintaining the generation capability. To\nensure the safety of generations, previous detoxification methods detoxify the\nmodels by changing the data distributions or constraining the generations from\ndifferent aspects in a single-step manner. However, these approaches will\ndramatically affect the generation quality of LLMs, e.g., discourse coherence\nand semantic consistency, since language models tend to generate along the\ntoxic prompt while detoxification methods work in the opposite direction. To\nhandle such a conflict, we decompose the detoxification process into different\nsub-steps, where the detoxification is concentrated in the input stage and the\nsubsequent continual generation is based on the non-toxic prompt. Besides, we\nalso calibrate the strong reasoning ability of LLMs by designing a Detox-Chain\nto connect the above sub-steps in an orderly manner, which allows LLMs to\ndetoxify the text step-by-step. Automatic and human evaluation on two\nbenchmarks reveals that by training with Detox-Chain, six LLMs scaling from 1B\nto 33B can obtain significant detoxification and generation improvement. Our\ncode and data are available at https://github.com/CODINNLG/Detox-CoT. Warning:\nexamples in the paper may contain uncensored offensive content.\n","authors":["Zecheng Tang","Keyan Zhou","Pinzheng Wang","Yuyang Ding","Juntao Li"," Minzhang"],"pdf_url":"https://arxiv.org/pdf/2308.08295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08285v1","updated":"2023-08-16T11:10:43Z","published":"2023-08-16T11:10:43Z","title":"Pre-training with Large Language Model-based Document Expansion for\n Dense Passage Retrieval","summary":" In this paper, we systematically study the potential of pre-training with\nLarge Language Model(LLM)-based document expansion for dense passage retrieval.\nConcretely, we leverage the capabilities of LLMs for document expansion, i.e.\nquery generation, and effectively transfer expanded knowledge to retrievers\nusing pre-training strategies tailored for passage retrieval. These strategies\ninclude contrastive learning and bottlenecked query generation. Furthermore, we\nincorporate a curriculum learning strategy to reduce the reliance on LLM\ninferences. Experimental results demonstrate that pre-training with LLM-based\ndocument expansion significantly boosts the retrieval performance on\nlarge-scale web-search tasks. 
Our work shows strong zero-shot and out-of-domain\nretrieval abilities, making it more widely applicable for retrieval when\ninitializing with no human-labeled data.\n","authors":["Guangyuan Ma","Xing Wu","Peng Wang","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08285v1.pdf","comment":"10 pages, 3 tables, 4 figures, under review"},{"id":"http://arxiv.org/abs/2301.10405v5","updated":"2023-08-16T10:57:58Z","published":"2023-01-25T04:45:06Z","title":"Editing Language Model-based Knowledge Graph Embeddings","summary":" Recently decades have witnessed the empirical success of framing Knowledge\nGraph (KG) embeddings via language models. However, language model-based KG\nembeddings are usually deployed as static artifacts, making them difficult to\nmodify post-deployment without re-training after deployment. To address this\nissue, we propose a new task of editing language model-based KG embeddings in\nthis paper. This task is designed to facilitate rapid, data-efficient updates\nto KG embeddings without compromising the performance of other aspects. We\nbuild four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and\nevaluate several knowledge editing baselines demonstrating the limited ability\nof previous models to handle the proposed challenging task. We further propose\na simple yet strong baseline dubbed KGEditor, which utilizes additional\nparametric layers of the hyper network to edit/add facts. Our comprehensive\nexperimental results reveal that KGEditor excels in updating specific facts\nwithout impacting the overall performance, even when faced with limited\ntraining resources. Code and datasets are available in\nhttps://github.com/zjunlp/PromptKG/tree/main/deltaKG.\n","authors":["Siyuan Cheng","Ningyu Zhang","Bozhong Tian","Xi Chen","Qingbing Liu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.10405v5.pdf","comment":"Work in progress and the project website is\n https://zjunlp.github.io/project/KGE_Editing/"},{"id":"http://arxiv.org/abs/2304.08862v2","updated":"2023-08-16T10:48:12Z","published":"2023-04-18T09:52:11Z","title":"Approximate Nearest Neighbour Phrase Mining for Contextual Speech\n Recognition","summary":" This paper presents an extension to train end-to-end Context-Aware\nTransformer Transducer ( CATT ) models by using a simple, yet efficient method\nof mining hard negative phrases from the latent space of the context encoder.\nDuring training, given a reference query, we mine a number of similar phrases\nusing approximate nearest neighbour search. These sampled phrases are then used\nas negative examples in the context list alongside random and ground truth\ncontextual information. By including approximate nearest neighbour phrases\n(ANN-P) in the context list, we encourage the learned representation to\ndisambiguate between similar, but not identical, biasing phrases. This improves\nbiasing accuracy when there are several similar phrases in the biasing\ninventory. We carry out experiments in a large-scale data regime obtaining up\nto 7% relative word error rate reductions for the contextual portion of test\ndata. We also extend and evaluate CATT approach in streaming applications.\n","authors":["Maurits Bleeker","Pawel Swietojanski","Stefan Braun","Xiaodan Zhuang"],"pdf_url":"https://arxiv.org/pdf/2304.08862v2.pdf","comment":"Accepted to Interspeech 2023. 
5 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.08253v1","updated":"2023-08-16T09:45:06Z","published":"2023-08-16T09:45:06Z","title":"Benchmarking Neural Network Generalization for Grammar Induction","summary":" How well do neural networks generalize? Even for grammar induction tasks,\nwhere the target generalization is fully known, previous works have left the\nquestion open, testing very limited ranges beyond the training set and using\ndifferent success criteria. We provide a measure of neural network\ngeneralization based on fully specified formal languages. Given a model and a\nformal grammar, the method assigns a generalization score representing how well\na model generalizes to unseen samples in inverse relation to the amount of data\nit was trained on. The benchmark includes languages such as $a^nb^n$,\n$a^nb^nc^n$, $a^nb^mc^{n+m}$, and Dyck-1 and 2. We evaluate selected\narchitectures using the benchmark and find that networks trained with a Minimum\nDescription Length objective (MDL) generalize better and using less data than\nnetworks trained using standard loss functions. The benchmark is available at\nhttps://github.com/taucompling/bliss.\n","authors":["Nur Lan","Emmanuel Chemla","Roni Katzir"],"pdf_url":"https://arxiv.org/pdf/2308.08253v1.pdf","comment":"10 pages, 4 figures, 2 tables. Conference: Learning with Small Data\n 2023"},{"id":"http://arxiv.org/abs/2112.08637v3","updated":"2023-08-16T09:20:12Z","published":"2021-12-16T05:36:08Z","title":"Analyzing the Limits of Self-Supervision in Handling Bias in Language","summary":" Prompting inputs with natural language task descriptions has emerged as a\npopular mechanism to elicit reasonably accurate outputs from large-scale\ngenerative language models with little to no in-context supervision. This also\nhelps gain insight into how well language models capture the semantics of a\nwide range of downstream tasks purely from self-supervised pre-training on\nmassive corpora of unlabeled text. Such models have naturally also been exposed\nto a lot of undesirable content like racist and sexist language and there is\nlimited work on awareness of models along these dimensions. In this paper, we\ndefine and comprehensively evaluate how well such language models capture the\nsemantics of four tasks for bias: diagnosis, identification, extraction and\nrephrasing. We define three broad classes of task descriptions for these tasks:\nstatement, question, and completion, with numerous lexical variants within each\nclass. We study the efficacy of prompting for each task using these classes and\nthe null task description across several decoding methods and few-shot\nexamples. Our analyses indicate that language models are capable of performing\nthese tasks to widely varying degrees across different bias dimensions, such as\ngender and political affiliation. 
We believe our work is an important step\ntowards unbiased language models by quantifying the limits of current\nself-supervision objectives at accomplishing such sociologically challenging\ntasks.\n","authors":["Lisa Bauer","Karthik Gopalakrishnan","Spandana Gella","Yang Liu","Mohit Bansal","Dilek Hakkani-Tur"],"pdf_url":"https://arxiv.org/pdf/2112.08637v3.pdf","comment":"Accepted at Findings of the Conference on Empirical Methods in\n Natural Language Processing (EMNLP) 2022"},{"id":"http://arxiv.org/abs/2308.08241v1","updated":"2023-08-16T09:16:02Z","published":"2023-08-16T09:16:02Z","title":"TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for\n Time Series","summary":" This work summarizes two strategies for completing time-series (TS) tasks\nusing today's language model (LLM): LLM-for-TS, design and train a fundamental\nlarge model for TS data; TS-for-LLM, enable the pre-trained LLM to handle TS\ndata. Considering the insufficient data accumulation, limited resources, and\nsemantic context requirements, this work focuses on TS-for-LLM methods, where\nwe aim to activate LLM's ability for TS data by designing a TS embedding method\nsuitable for LLM. The proposed method is named TEST. It first tokenizes TS,\nbuilds an encoder to embed them by instance-wise, feature-wise, and\ntext-prototype-aligned contrast, and then creates prompts to make LLM more open\nto embeddings, and finally implements TS tasks. Experiments are carried out on\nTS classification and forecasting tasks using 8 LLMs with different structures\nand sizes. Although its results cannot significantly outperform the current\nSOTA models customized for TS tasks, by treating LLM as the pattern machine, it\ncan endow LLM's ability to process TS data without compromising the language\nability. This paper is intended to serve as a foundational work that will\ninspire further research.\n","authors":["Chenxi Sun","Yaliang Li","Hongyan Li","Shenda Hong"],"pdf_url":"https://arxiv.org/pdf/2308.08241v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.08239v1","updated":"2023-08-16T09:15:18Z","published":"2023-08-16T09:15:18Z","title":"MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain\n Conversation","summary":" We propose MemoChat, a pipeline for refining instructions that enables large\nlanguage models (LLMs) to effectively employ self-composed memos for\nmaintaining consistent long-range open-domain conversations. We demonstrate a\nlong-range open-domain conversation through iterative\n\"memorization-retrieval-response\" cycles. This requires us to carefully design\ntailored tuning instructions for each distinct stage. The instructions are\nreconstructed from a collection of public datasets to teach the LLMs to\nmemorize and retrieve past dialogues with structured memos, leading to enhanced\nconsistency when participating in future conversations. We invite experts to\nmanually annotate a test set designed to evaluate the consistency of long-range\nconversations questions. 
Experiments on three testing scenarios involving both\nopen-source and API-accessible chatbots at scale verify the efficacy of\nMemoChat, which outperforms strong baselines.\n","authors":["Junru Lu","Siyu An","Mingbao Lin","Gabriele Pergola","Yulan He","Di Yin","Xing Sun","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08239v1.pdf","comment":"Codes, data and models will be available soon"},{"id":"http://arxiv.org/abs/2308.08234v1","updated":"2023-08-16T09:11:00Z","published":"2023-08-16T09:11:00Z","title":"Challenges and Opportunities of Using Transformer-Based Multi-Task\n Learning in NLP Through ML Lifecycle: A Survey","summary":" The increasing adoption of natural language processing (NLP) models across\nindustries has led to practitioners' need for machine learning systems to\nhandle these models efficiently, from training to serving them in production.\nHowever, training, deploying, and updating multiple models can be complex,\ncostly, and time-consuming, mainly when using transformer-based pre-trained\nlanguage models. Multi-Task Learning (MTL) has emerged as a promising approach\nto improve efficiency and performance through joint training, rather than\ntraining separate models. Motivated by this, we first provide an overview of\ntransformer-based MTL approaches in NLP. Then, we discuss the challenges and\nopportunities of using MTL approaches throughout typical ML lifecycle phases,\nspecifically focusing on the challenges related to data engineering, model\ndevelopment, deployment, and monitoring phases. This survey focuses on\ntransformer-based MTL architectures and, to the best of our knowledge, is novel\nin that it systematically analyses how transformer-based MTL in NLP fits into\nML lifecycle phases. Furthermore, we motivate research on the connection\nbetween MTL and continual learning (CL), as this area remains unexplored. We\nbelieve it would be practical to have a model that can handle both MTL and CL,\nas this would make it easier to periodically re-train the model, update it due\nto distribution shifts, and add new capabilities to meet real-world\nrequirements.\n","authors":["Lovre Torbarina","Tin Ferkovic","Lukasz Roguski","Velimir Mihelcic","Bruno Sarlija","Zeljko Kraljevic"],"pdf_url":"https://arxiv.org/pdf/2308.08234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09095v2","updated":"2023-08-16T09:09:53Z","published":"2022-12-18T14:36:07Z","title":"Rethinking the Role of Scale for In-Context Learning: An\n Interpretability-based Case Study at 66 Billion Scale","summary":" Language models have been shown to perform better with an increase in scale\non a wide variety of tasks via the in-context learning paradigm. In this paper,\nwe investigate the hypothesis that the ability of a large language model to\nin-context learn-perform a task is not uniformly spread across all of its\nunderlying components. Using a 66 billion parameter language model (OPT-66B)\nacross a diverse set of 14 downstream tasks, we find this is indeed the case:\n$\\sim$70% of attention heads and $\\sim$20% of feed forward networks can be\nremoved with minimal decline in task performance. We find substantial overlap\nin the set of attention heads (un)important for in-context learning across\ntasks and number of in-context examples. 
We also address our hypothesis through\na task-agnostic lens, finding that a small set of attention heads in OPT-66B\nscore highly on their ability to perform primitive induction operations\nassociated with in-context learning, namely, prefix matching and copying. These\ninduction heads overlap with task-specific important heads, reinforcing\narguments by Olsson et al. (arXiv:2209.11895) regarding induction head\ngenerality to more sophisticated behaviors associated with in-context learning.\nOverall, our study provides several insights that indicate large language\nmodels may be under-trained for in-context learning and opens up questions on\nhow to pre-train language models to more effectively perform in-context\nlearning.\n","authors":["Hritik Bansal","Karthik Gopalakrishnan","Saket Dingliwal","Sravan Bodapati","Katrin Kirchhoff","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2212.09095v2.pdf","comment":"Accepted at Annual Meeting of the Association for Computational\n Linguistics (ACL) 2023, Main Proceedings"},{"id":"http://arxiv.org/abs/2307.16082v2","updated":"2023-08-16T09:00:25Z","published":"2023-07-29T21:37:55Z","title":"EnrichEvent: Enriching Social Data with Contextual Information for\n Emerging Event Extraction","summary":" Social platforms have emerged as crucial platforms for disseminating\ninformation and discussing real-life social events, which offers an excellent\nopportunity for researchers to design and implement novel event detection\nframeworks. However, most existing approaches merely exploit keyword burstiness\nor network structures to detect unspecified events. Thus, they often fail to\nidentify unspecified events regarding the challenging nature of events and\nsocial data. Social data, e.g., tweets, is characterized by misspellings,\nincompleteness, word sense ambiguation, and irregular language, as well as\nvariation in aspects of opinions. Moreover, extracting discriminative features\nand patterns for evolving events by exploiting the limited structural knowledge\nis almost infeasible. To address these challenges, in this thesis, we propose a\nnovel framework, namely EnrichEvent, that leverages the lexical and contextual\nrepresentations of streaming social data. In particular, we leverage contextual\nknowledge, as well as lexical knowledge, to detect semantically related tweets\nand enhance the effectiveness of the event detection approaches. Eventually,\nour proposed framework produces cluster chains for each event to show the\nevolving variation of the event through time. We conducted extensive\nexperiments to evaluate our framework, validating its high performance and\neffectiveness in detecting and distinguishing unspecified social events.\n","authors":["Mohammadali Sefidi Esfahani","Mohammad Akbari"],"pdf_url":"https://arxiv.org/pdf/2307.16082v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17558v2","updated":"2023-08-16T08:46:52Z","published":"2023-06-30T11:21:40Z","title":"Towards the extraction of robust sign embeddings for low resource sign\n language recognition","summary":" Isolated Sign Language Recognition (SLR) has mostly been applied on datasets\ncontaining signs executed slowly and clearly by a limited group of signers. In\nreal-world scenarios, however, we are met with challenging visual conditions,\ncoarticulated signing, small datasets, and the need for signer independent\nmodels. To tackle this difficult problem, we require a robust feature extractor\nto process the sign language videos. 
One could expect human pose estimators to\nbe ideal candidates. However, due to a domain mismatch with their training sets\nand challenging poses in sign language, they lack robustness on sign language\ndata and image-based models often still outperform keypoint-based models.\nFurthermore, whereas the common practice of transfer learning with image-based\nmodels yields even higher accuracy, keypoint-based models are typically trained\nfrom scratch on every SLR dataset. These factors limit their usefulness for\nSLR. From the existing literature, it is also not clear which, if any, pose\nestimator performs best for SLR. We compare the three most popular pose\nestimators for SLR: OpenPose, MMPose and MediaPipe. We show that through\nkeypoint normalization, missing keypoint imputation, and learning a pose\nembedding, we can obtain significantly better results and enable transfer\nlearning. We show that keypoint-based embeddings contain cross-lingual\nfeatures: they can transfer between sign languages and achieve competitive\nperformance even when fine-tuning only the classifier layer of an SLR model on\na target sign language. We furthermore achieve better performance using\nfine-tuned transferred embeddings than models trained only on the target sign\nlanguage. The embeddings can also be learned in a multilingual fashion. The\napplication of these embeddings could prove particularly useful for low\nresource sign languages in the future.\n","authors":["Mathieu De Coster","Ellen Rushe","Ruth Holmes","Anthony Ventresque","Joni Dambre"],"pdf_url":"https://arxiv.org/pdf/2306.17558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12267v4","updated":"2023-08-16T08:44:08Z","published":"2023-07-23T08:47:51Z","title":"Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid\n Essay in Education","summary":" The recent large language models (LLMs), e.g., ChatGPT, have been able to\ngenerate human-like and fluent responses when provided with specific\ninstructions. While admitting the convenience brought by technological\nadvancement, educators also have concerns that students might leverage LLMs to\ncomplete their writing assignments and pass them off as their original work.\nAlthough many AI content detection studies have been conducted as a result of\nsuch concerns, most of these prior studies modeled AI content detection as a\nclassification problem, assuming that a text is either entirely human-written\nor entirely AI-generated. In this study, we investigated AI content detection\nin a rarely explored yet realistic setting where the text to be detected is\ncollaboratively written by human and generative LLMs (i.e., hybrid text). We\nfirst formalized the detection task as identifying the transition points\nbetween human-written content and AI-generated content from a given hybrid text\n(boundary detection). Then we proposed a two-step approach where we (1)\nseparated AI-generated content from human-written content during the encoder\ntraining process; and (2) calculated the distances between every two adjacent\nprototypes and assumed that the boundaries exist between the two adjacent\nprototypes that have the furthest distance from each other. 
Through extensive\nexperiments, we observed the following main findings: (1) the proposed approach\nconsistently outperformed the baseline methods across different experiment\nsettings; (2) the encoder training process can significantly boost the\nperformance of the proposed approach; (3) when detecting boundaries for\nsingle-boundary hybrid essays, the proposed approach could be enhanced by\nadopting a relatively large prototype size, leading to a 22% improvement in the\nIn-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation.\n","authors":["Zijie Zeng","Lele Sha","Yuheng Li","Kaixun Yang","Dragan Gašević","Guanliang Chen"],"pdf_url":"https://arxiv.org/pdf/2307.12267v4.pdf","comment":"9 pages including references, 2 figures"},{"id":"http://arxiv.org/abs/2303.09713v2","updated":"2023-08-16T08:17:02Z","published":"2023-03-17T01:10:33Z","title":"CHAMPAGNE: Learning Real-world Conversation from Large-Scale Web Videos","summary":" Visual information is central to conversation: body gestures and physical\nbehaviour, for example, contribute to meaning that transcends words alone. To\ndate, however, most neural conversational models are limited to just text. We\nintroduce CHAMPAGNE, a generative model of conversations that can account for\nvisual contexts. To train CHAMPAGNE, we collect and release YTD-18M, a\nlarge-scale corpus of 18M video-based dialogues. YTD-18M is constructed from\nweb videos: crucial to our data collection pipeline is a pretrained language\nmodel that converts error-prone automatic transcripts to a cleaner dialogue\nformat while maintaining meaning. Human evaluation reveals that YTD-18M is more\nsensible and specific than prior resources (MMDialog, 1M dialogues), while\nmaintaining visual-groundedness. Experiments demonstrate that 1) CHAMPAGNE\nlearns to conduct conversation from YTD-18M; and 2) when fine-tuned, it\nachieves state-of-the-art results on four vision-language tasks focused on\nreal-world conversations. We release data, models, and code.\n","authors":["Seungju Han","Jack Hessel","Nouha Dziri","Yejin Choi","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2303.09713v2.pdf","comment":"ICCV 2023, Project page: https://seungjuhan.me/champagne"},{"id":"http://arxiv.org/abs/2308.08204v1","updated":"2023-08-16T08:09:10Z","published":"2023-08-16T08:09:10Z","title":"MoCoSA: Momentum Contrast for Knowledge Graph Completion with\n Structure-Augmented Pre-trained Language Models","summary":" Knowledge Graph Completion (KGC) aims to conduct reasoning on the facts\nwithin knowledge graphs and automatically infer missing links. Existing methods\ncan mainly be categorized into structure-based or description-based. On the one\nhand, structure-based methods effectively represent relational facts in\nknowledge graphs using entity embeddings. However, they struggle with\nsemantically rich real-world entities due to limited structural information and\nfail to generalize to unseen entities. On the other hand, description-based\nmethods leverage pre-trained language models (PLMs) to understand textual\ninformation. They exhibit strong robustness towards unseen entities. However,\nthey have difficulty with larger negative sampling and often lag behind\nstructure-based methods. To address these issues, in this paper, we propose\nMomentum Contrast for knowledge graph completion with Structure-Augmented\npre-trained language models (MoCoSA), which allows the PLM to perceive the\nstructural information by the adaptable structure encoder. 
To improve learning\nefficiency, we proposed momentum hard negative and intra-relation negative\nsampling. Experimental results demonstrate that our approach achieves\nstate-of-the-art performance in terms of mean reciprocal rank (MRR), with\nimprovements of 2.5% on WN18RR and 21% on OpenBG500.\n","authors":["Jiabang He","Liu Jia","Lei Wang","Xiyao Li","Xing Xu"],"pdf_url":"https://arxiv.org/pdf/2308.08204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08181v1","updated":"2023-08-16T07:21:01Z","published":"2023-08-16T07:21:01Z","title":"ChinaTelecom System Description to VoxCeleb Speaker Recognition\n Challenge 2023","summary":" This technical report describes ChinaTelecom system for Track 1 (closed) of\nthe VoxCeleb2023 Speaker Recognition Challenge (VoxSRC 2023). Our system\nconsists of several ResNet variants trained only on VoxCeleb2, which were fused\nfor better performance later. Score calibration was also applied for each\nvariant and the fused system. The final submission achieved minDCF of 0.1066\nand EER of 1.980%.\n","authors":["Mengjie Du","Xiang Fang","Jie Li"],"pdf_url":"https://arxiv.org/pdf/2308.08181v1.pdf","comment":"System description of VoxSRC 2023"},{"id":"http://arxiv.org/abs/2308.08176v1","updated":"2023-08-16T07:12:23Z","published":"2023-08-16T07:12:23Z","title":"RSpell: Retrieval-augmented Framework for Domain Adaptive Chinese\n Spelling Check","summary":" Chinese Spelling Check (CSC) refers to the detection and correction of\nspelling errors in Chinese texts. In practical application scenarios, it is\nimportant to make CSC models have the ability to correct errors across\ndifferent domains. In this paper, we propose a retrieval-augmented spelling\ncheck framework called RSpell, which searches corresponding domain terms and\nincorporates them into CSC models. Specifically, we employ pinyin fuzzy\nmatching to search for terms, which are combined with the input and fed into\nthe CSC model. Then, we introduce an adaptive process control mechanism to\ndynamically adjust the impact of external knowledge on the model. Additionally,\nwe develop an iterative strategy for the RSpell framework to enhance reasoning\ncapabilities. We conducted experiments on CSC datasets in three domains: law,\nmedicine, and official document writing. The results demonstrate that RSpell\nachieves state-of-the-art performance in both zero-shot and fine-tuning\nscenarios, demonstrating the effectiveness of the retrieval-augmented CSC\nframework. Our code is available at https://github.com/47777777/Rspell.\n","authors":["Siqi Song","Qi Lv","Lei Geng","Ziqiang Cao","Guohong Fu"],"pdf_url":"https://arxiv.org/pdf/2308.08176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03266v2","updated":"2023-08-16T07:03:42Z","published":"2023-08-07T03:12:27Z","title":"SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and\n Effective Hotword Customization Ability","summary":" Hotword customization is one of the important issues remained in ASR field -\nit is of value to enable users of ASR systems to customize names of entities,\npersons and other phrases. The past few years have seen both implicit and\nexplicit modeling strategies for ASR contextualization developed. While these\napproaches have performed adequately, they still exhibit certain shortcomings\nsuch as instability in effectiveness. In this paper we propose\nSemantic-augmented Contextual-Paraformer (SeACo-Paraformer) a novel NAR based\nASR system with flexible and effective hotword customization ability. 
It\ncombines the accuracy of the AED-based model, the efficiency of the NAR model,\nand the excellent performance in contextualization. In 50,000 hours industrial\nbig data experiments, our proposed model outperforms strong baselines in\ncustomization and general ASR tasks. Besides, we explore an efficient way to\nfilter large scale incoming hotwords for further improvement. The source codes\nand industrial models proposed and compared are all opened as well as two\nhotword test sets.\n","authors":["Xian Shi","Yexin Yang","Zerui Li","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03266v2.pdf","comment":"updated draft"},{"id":"http://arxiv.org/abs/2308.08169v1","updated":"2023-08-16T06:52:10Z","published":"2023-08-16T06:52:10Z","title":"Enhancing Performance on Seen and Unseen Dialogue Scenarios using\n Retrieval-Augmented End-to-End Task-Oriented System","summary":" End-to-end task-oriented dialogue (TOD) systems have achieved promising\nperformance by leveraging sophisticated natural language understanding and\nnatural language generation capabilities of pre-trained models. This work\nenables the TOD systems with more flexibility through a simple cache. The cache\nprovides the flexibility to dynamically update the TOD systems and handle both\nexisting and unseen dialogue scenarios. Towards this end, we first fine-tune a\nretrieval module to effectively retrieve the most relevant information entries\nfrom the cache. We then train end-to-end TOD models that can refer to and\nground on both dialogue history and retrieved information during TOD\ngeneration. The cache is straightforward to construct, and the backbone models\nof TOD systems are compatible with existing pre-trained generative models.\nExtensive experiments demonstrate the superior performance of our framework,\nwith a notable improvement in non-empty joint goal accuracy by 6.7% compared to\nstrong baselines.\n","authors":["Jianguo Zhang","Stephen Roller","Kun Qian","Zhiwei Liu","Rui Meng","Shelby Heinecke","Huan Wang","Silvio Savarese","Caiming Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.08169v1.pdf","comment":"Accepted by SIGDIAL 2023 as a long paper"},{"id":"http://arxiv.org/abs/2307.14385v2","updated":"2023-08-16T06:04:48Z","published":"2023-07-26T06:00:50Z","title":"Mental-LLM: Leveraging Large Language Models for Mental Health\n Prediction via Online Text Data","summary":" Advances in large language models (LLMs) have empowered a variety of\napplications. However, there is still a significant gap in research when it\ncomes to understanding and enhancing the capabilities of LLMs in the field of\nmental health. In this work, we present the first comprehensive evaluation of\nmultiple LLMs, including Alpaca, Alpaca-LoRA, FLAN-T5, GPT-3.5, and GPT-4, on\nvarious mental health prediction tasks via online text data. We conduct a broad\nrange of experiments, covering zero-shot prompting, few-shot prompting, and\ninstruction fine-tuning. The results indicate a promising yet limited\nperformance of LLMs with zero-shot and few-shot prompt designs for the mental\nhealth tasks. More importantly, our experiments show that instruction\nfinetuning can significantly boost the performance of LLMs for all tasks\nsimultaneously. Our best-finetuned models, Mental-Alpaca and Mental-FLAN-T5,\noutperform the best prompt design of GPT-3.5 (25 and 15 times bigger) by 10.9%\non balanced accuracy and the best of GPT-4 (250 and 150 times bigger) by 4.8%.\nThey further perform on par with the state-of-the-art task-specific language\nmodel. 
We also conduct an exploratory case study on LLMs' capability on the\nmental health reasoning tasks, illustrating the promising capability of certain\nmodels such as GPT-4. We summarize our findings into a set of action guidelines\nfor potential methods to enhance LLMs' capability for mental health tasks.\nMeanwhile, we also emphasize the important limitations before achieving\ndeployability in real-world mental health settings, such as known racial and\ngender bias. We highlight the important ethical risks accompanying this line of\nresearch.\n","authors":["Xuhai Xu","Bingshen Yao","Yuanzhe Dong","Saadia Gabriel","Hong Yu","James Hendler","Marzyeh Ghassemi","Anind K. Dey","Dakuo Wang"],"pdf_url":"https://arxiv.org/pdf/2307.14385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07711v2","updated":"2023-08-16T05:58:16Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v2.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.08156v1","updated":"2023-08-16T05:58:12Z","published":"2023-08-16T05:58:12Z","title":"Sarcasm Detection in a Disaster Context","summary":" During natural disasters, people often use social media platforms such as\nTwitter to ask for help, to provide information about the disaster situation,\nor to express contempt about the unfolding event or public policies and\nguidelines. This contempt is in some cases expressed as sarcasm or irony.\nUnderstanding this form of speech in a disaster-centric context is essential to\nimproving natural language understanding of disaster-related tweets. 
In this\npaper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for\nintended sarcasm, and provide a comprehensive investigation of sarcasm\ndetection using pre-trained language models. Our best model is able to obtain\nas much as 0.70 F1 on our dataset. We also demonstrate that the performance on\nHurricaneSARC can be improved by leveraging intermediate task transfer\nlearning. We release our data and code at\nhttps://github.com/tsosea2/HurricaneSarc.\n","authors":["Tiberiu Sosea","Junyi Jessy Li","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2308.08156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08155v1","updated":"2023-08-16T05:57:52Z","published":"2023-08-16T05:57:52Z","title":"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation\n Framework","summary":" This technical report presents AutoGen, a new framework that enables\ndevelopment of LLM applications using multiple agents that can converse with\neach other to solve tasks. AutoGen agents are customizable, conversable, and\nseamlessly allow human participation. They can operate in various modes that\nemploy combinations of LLMs, human inputs, and tools. AutoGen's design offers\nmultiple advantages: a) it gracefully navigates the strong but imperfect\ngeneration and reasoning abilities of these LLMs; b) it leverages human\nunderstanding and intelligence, while providing valuable automation through\nconversations between agents; c) it simplifies and unifies the implementation\nof complex LLM workflows as automated agent chats. We provide many diverse\nexamples of how developers can easily use AutoGen to effectively solve tasks or\nbuild applications, ranging from coding, mathematics, operations research,\nentertainment, online decision-making, question answering, etc.\n","authors":["Qingyun Wu","Gagan Bansal","Jieyu Zhang","Yiran Wu","Shaokun Zhang","Erkang Zhu","Beibin Li","Li Jiang","Xiaoyun Zhang","Chi Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08155v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2308.08153v1","updated":"2023-08-16T05:48:50Z","published":"2023-08-16T05:48:50Z","title":"Fast Training of NMT Model with Data Sorting","summary":" The Transformer model has revolutionized Natural Language Processing tasks\nsuch as Neural Machine Translation, and many efforts have been made to study\nthe Transformer architecture, which increased its efficiency and accuracy. One\npotential area for improvement is to address the computation of empty tokens\nthat the Transformer computes only to discard them later, leading to an\nunnecessary computational burden. To tackle this, we propose an algorithm that\nsorts translation sentence pairs based on their length before batching,\nminimizing the waste of computing power. Since the amount of sorting could\nviolate the independent and identically distributed (i.i.d) data assumption, we\nsort the data partially. In experiments, we apply the proposed method to\nEnglish-Korean and English-Luganda language pairs for machine translation and\nshow that there are gains in computational time while maintaining the\nperformance. Our method is independent of architectures, so that it can be\neasily integrated into any training process with flexible data lengths.\n","authors":["Daniela N. 
Rim","Kimera Richard","Heeyoul Choi"],"pdf_url":"https://arxiv.org/pdf/2308.08153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08147v1","updated":"2023-08-16T04:56:55Z","published":"2023-08-16T04:56:55Z","title":"MDDial: A Multi-turn Differential Diagnosis Dialogue Dataset with\n Reliability Evaluation","summary":" Dialogue systems for Automatic Differential Diagnosis (ADD) have a wide range\nof real-life applications. These dialogue systems are promising for providing\neasy access and reducing medical costs. Building end-to-end ADD dialogue\nsystems requires dialogue training datasets. However, to the best of our\nknowledge, there is no publicly available ADD dialogue dataset in English\n(although non-English datasets exist). Driven by this, we introduce MDDial, the\nfirst differential diagnosis dialogue dataset in English which can aid to build\nand evaluate end-to-end ADD dialogue systems. Additionally, earlier studies\npresent the accuracy of diagnosis and symptoms either individually or as a\ncombined weighted score. This method overlooks the connection between the\nsymptoms and the diagnosis. We introduce a unified score for the ADD system\nthat takes into account the interplay between symptoms and diagnosis. This\nscore also indicates the system's reliability. To the end, we train two\nmoderate-size of language models on MDDial. Our experiments suggest that while\nthese language models can perform well on many natural language understanding\ntasks, including dialogue tasks in the general domain, they struggle to relate\nrelevant symptoms and disease and thus have poor performance on MDDial. MDDial\nwill be released publicly to aid the study of ADD dialogue research.\n","authors":["Srija Macherla","Man Luo","Mihir Parmar","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2308.08147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08125v1","updated":"2023-08-16T03:31:30Z","published":"2023-08-16T03:31:30Z","title":"Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals","summary":" Millimeter wave (mmWave) based speech recognition provides more possibility\nfor audio-related applications, such as conference speech transcription and\neavesdropping. However, considering the practicality in real scenarios, latency\nand recognizable vocabulary size are two critical factors that cannot be\noverlooked. In this paper, we propose Radio2Text, the first mmWave-based system\nfor streaming automatic speech recognition (ASR) with a vocabulary size\nexceeding 13,000 words. Radio2Text is based on a tailored streaming Transformer\nthat is capable of effectively learning representations of speech-related\nfeatures, paving the way for streaming ASR with a large vocabulary. To\nalleviate the deficiency of streaming networks unable to access entire future\ninputs, we propose the Guidance Initialization that facilitates the transfer of\nfeature knowledge related to the global context from the non-streaming\nTransformer to the tailored streaming Transformer through weight inheritance.\nFurther, we propose a cross-modal structure based on knowledge distillation\n(KD), named cross-modal KD, to mitigate the negative effect of low quality\nmmWave signals on recognition performance. In the cross-modal KD, the audio\nstreaming Transformer provides feature and response guidance that inherit\nfruitful and accurate speech information to supervise the training of the\ntailored radio streaming Transformer. 
The experimental results show that our\nRadio2Text can achieve a character error rate of 5.7% and a word error rate of\n9.4% for the recognition of a vocabulary consisting of over 13,000 words.\n","authors":["Running Zhao","Jiangtao Yu","Hang Zhao","Edith C. H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2308.08125v1.pdf","comment":"Accepted by Proceedings of the ACM on Interactive, Mobile, Wearable\n and Ubiquitous Technologies (ACM IMWUT/UbiComp 2023)"},{"id":"http://arxiv.org/abs/2308.08090v1","updated":"2023-08-16T01:46:01Z","published":"2023-08-16T01:46:01Z","title":"Separate the Wheat from the Chaff: Model Deficiency Unlearning via\n Parameter-Efficient Module Operation","summary":" Large language models (LLMs) have been widely used in various applications\nbut are known to suffer from issues related to untruthfulness and toxicity.\nWhile parameter-efficient modules (PEMs) have demonstrated their effectiveness\nin equipping models with new skills, leveraging PEMs for deficiency unlearning\nremains underexplored. In this work, we propose a PEMs operation approach,\nnamely Extraction-before-Subtraction (Ext-Sub), to enhance the truthfulness and\ndetoxification of LLMs through the integration of ``expert'' PEM and\n``anti-expert'' PEM. Remarkably, even anti-expert PEM possess valuable\ncapabilities due to their proficiency in generating fabricated content, which\nnecessitates language modeling and logical narrative competence. Rather than\nmerely negating the parameters, our approach involves extracting and\neliminating solely the deficiency capability within anti-expert PEM while\npreserving the general capabilities. To evaluate the effectiveness of our\napproach in terms of truthfulness and detoxification, we conduct extensive\nexperiments on LLMs, encompassing additional abilities such as language\nmodeling and mathematical reasoning. Our empirical results demonstrate that our\napproach effectively improves truthfulness and detoxification, while largely\npreserving the fundamental abilities of LLMs.\n","authors":["Xinshuo Hu","Dongfang Li","Zihao Zheng","Zhenyu Liu","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11095v3","updated":"2023-08-16T00:57:34Z","published":"2023-05-18T16:32:58Z","title":"Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot\n Task Generalization","summary":" We investigate the emergent abilities of the recently proposed web-scale\nspeech model Whisper, by adapting it to unseen tasks with prompt engineering.\nWe selected three tasks: audio-visual speech recognition (AVSR), code-switched\nspeech recognition (CS-ASR), and speech translation (ST) on unseen language\npairs. We design task-specific prompts, by either leveraging another\nlarge-scale model, or simply manipulating the special tokens in the default\nprompts. Experiments show that compared to the default prompts, our proposed\nprompts improve performance by 10% to 45% on the three zero-shot tasks, and\neven outperform SotA supervised models on some datasets. In addition, our\nexperiments reveal many interesting properties of Whisper, including its\nrobustness to prompts, bias on accents, and the multilingual understanding in\nits latent space. 
Code is available at\nhttps://github.com/jasonppy/PromptingWhisper\n","authors":["Puyuan Peng","Brian Yan","Shinji Watanabe","David Harwath"],"pdf_url":"https://arxiv.org/pdf/2305.11095v3.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2306.13213v2","updated":"2023-08-16T22:38:55Z","published":"2023-06-22T22:13:03Z","title":"Visual Adversarial Examples Jailbreak Aligned Large Language Models","summary":" Recently, there has been a surge of interest in integrating vision into Large\nLanguage Models (LLMs), exemplified by Visual Language Models (VLMs) such as\nFlamingo and GPT-4. This paper sheds light on the security and safety\nimplications of this trend. First, we underscore that the continuous and\nhigh-dimensional nature of the visual input makes it a weak link against\nadversarial attacks, representing an expanded attack surface of\nvision-integrated LLMs. Second, we highlight that the versatility of LLMs also\npresents visual attackers with a wider array of achievable adversarial\nobjectives, extending the implications of security failures beyond mere\nmisclassification. As an illustration, we present a case study in which we\nexploit visual adversarial examples to circumvent the safety guardrail of\naligned LLMs with integrated vision. Intriguingly, we discover that a single\nvisual adversarial example can universally jailbreak an aligned LLM, compelling\nit to heed a wide range of harmful instructions that it otherwise would not)\nand generate harmful content that transcends the narrow scope of a `few-shot'\nderogatory corpus initially employed to optimize the adversarial example. Our\nstudy underscores the escalating adversarial risks associated with the pursuit\nof multimodality. Our findings also connect the long-studied adversarial\nvulnerabilities of neural networks to the nascent field of AI alignment. The\npresented attack suggests a fundamental adversarial challenge for AI alignment,\nespecially in light of the emerging trend toward multimodality in frontier\nfoundation models.\n","authors":["Xiangyu Qi","Kaixuan Huang","Ashwinee Panda","Peter Henderson","Mengdi Wang","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2306.13213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08688v1","updated":"2023-08-16T22:16:00Z","published":"2023-08-16T22:16:00Z","title":"Lightweight Adaptation of Neural Language Models via Subspace Embedding","summary":" Traditional neural word embeddings are usually dependent on a richer\ndiversity of vocabulary. However, the language models recline to cover major\nvocabularies via the word embedding parameters, in particular, for multilingual\nlanguage models that generally cover a significant part of their overall\nlearning parameters. In this work, we present a new compact embedding structure\nto reduce the memory footprint of the pre-trained language models with a\nsacrifice of up to 4% absolute accuracy. The embeddings vectors reconstruction\nfollows a set of subspace embeddings and an assignment procedure via the\ncontextual relationship among tokens from pre-trained language models. The\nsubspace embedding structure calibrates to masked language models, to evaluate\nour compact embedding structure on similarity and textual entailment tasks,\nsentence and paraphrase tasks. 
Our experimental evaluation shows that the\nsubspace embeddings achieve compression rates beyond 99.8% in comparison with\nthe original embeddings for the language models on XNLI and GLUE benchmark\nsuites.\n","authors":["Amit Kumar Jaiswal","Haiming Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08688v1.pdf","comment":"5 pages, Accepted as a Main Conference Short Paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2208.02743v3","updated":"2023-08-16T21:03:42Z","published":"2022-08-04T16:18:16Z","title":"Integrating Knowledge Graph embedding and pretrained Language Models in\n Hypercomplex Spaces","summary":" Knowledge Graphs, such as Wikidata, comprise structural and textual knowledge\nin order to represent knowledge. For each of the two modalities dedicated\napproaches for graph embedding and language models learn patterns that allow\nfor predicting novel structural knowledge. Few approaches have integrated\nlearning and inference with both modalities and these existing ones could only\npartially exploit the interaction of structural and textual knowledge. In our\napproach, we build on existing strong representations of single modalities and\nwe use hypercomplex algebra to represent both, (i), single-modality embedding\nas well as, (ii), the interaction between different modalities and their\ncomplementary means of knowledge representation. More specifically, we suggest\nDihedron and Quaternion representations of 4D hypercomplex numbers to integrate\nfour modalities namely structural knowledge graph embedding, word-level\nrepresentations (e.g.\\ Word2vec, Fasttext), sentence-level representations\n(Sentence transformer), and document-level representations (sentence\ntransformer, Doc2vec). Our unified vector representation scores the\nplausibility of labelled edges via Hamilton and Dihedron products, thus\nmodeling pairwise interactions between different modalities. Extensive\nexperimental evaluation on standard benchmark datasets shows the superiority of\nour two new models using abundant textual information besides sparse structural\nknowledge to enhance performance in link prediction tasks.\n","authors":["Mojtaba Nayyeri","Zihao Wang","Mst. Mahfuja Akter","Mirza Mohtashim Alam","Md Rashad Al Hasan Rony","Jens Lehmann","Steffen Staab"],"pdf_url":"https://arxiv.org/pdf/2208.02743v3.pdf","comment":"ISWC2023 version"},{"id":"http://arxiv.org/abs/2308.08661v1","updated":"2023-08-16T20:23:16Z","published":"2023-08-16T20:23:16Z","title":"Answering Ambiguous Questions with a Database of Questions, Answers, and\n Revisions","summary":" Many open-domain questions are under-specified and thus have multiple\npossible answers, each of which is correct under a different interpretation of\nthe question. Answering such ambiguous questions is challenging, as it requires\nretrieving and then reasoning about diverse information from multiple passages.\nWe present a new state-of-the-art for answering ambiguous questions that\nexploits a database of unambiguous questions generated from Wikipedia. 
On the\nchallenging ASQA benchmark, which requires generating long-form answers that\nsummarize the multiple answers to an ambiguous question, our method improves\nperformance by 15% (relative improvement) on recall measures and 10% on\nmeasures which evaluate disambiguating questions from predicted outputs.\nRetrieving from the database of generated questions also gives large\nimprovements in diverse passage retrieval (by matching user questions q to\npassages p indirectly, via questions q' generated from p).\n","authors":["Haitian Sun","William W. Cohen","Ruslan Salakhutdinov"],"pdf_url":"https://arxiv.org/pdf/2308.08661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08660v1","updated":"2023-08-16T20:17:46Z","published":"2023-08-16T20:17:46Z","title":"Large Language Models for Granularized Barrett's Esophagus Diagnosis\n Classification","summary":" Diagnostic codes for Barrett's esophagus (BE), a precursor to esophageal\ncancer, lack granularity and precision for many research or clinical use cases.\nLaborious manual chart review is required to extract key diagnostic phenotypes\nfrom BE pathology reports. We developed a generalizable transformer-based\nmethod to automate data extraction. Using pathology reports from Columbia\nUniversity Irving Medical Center with gastroenterologist-annotated targets, we\nperformed binary dysplasia classification as well as granularized multi-class\nBE-related diagnosis classification. We utilized two clinically pre-trained\nlarge language models, with best model performance comparable to a highly\ntailored rule-based system developed using the same data. Binary dysplasia\nextraction achieves 0.964 F1-score, while the multi-class model achieves 0.911\nF1-score. Our method is generalizable and faster to implement as compared to a\ntailored rule-based approach.\n","authors":["Jenna Kefeli","Ali Soroush","Courtney J. Diamond","Haley M. Zylberberg","Benjamin May","Julian A. Abrams","Chunhua Weng","Nicholas Tatonetti"],"pdf_url":"https://arxiv.org/pdf/2308.08660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08628v1","updated":"2023-08-16T18:53:39Z","published":"2023-08-16T18:53:39Z","title":"Learning the meanings of function words from grounded language using a\n visual question answering model","summary":" Interpreting a seemingly-simple function word like \"or\", \"behind\", or \"more\"\ncan require logical, numerical, and relational reasoning. How are such words\nlearned by children? Prior acquisition theories have often relied on positing a\nfoundation of innate knowledge. Yet recent neural-network based visual question\nanswering models apparently can learn to use function words as part of\nanswering questions about complex visual scenes. In this paper, we study what\nthese models learn about function words, in the hope of better understanding\nhow the meanings of these words can be learnt by both models and children. We\nshow that recurrent models trained on visually grounded language learn gradient\nsemantics for function words requiring spacial and numerical reasoning.\nFurthermore, we find that these models can learn the meanings of logical\nconnectives \"and\" and \"or\" without any prior knowledge of logical reasoning, as\nwell as early evidence that they can develop the ability to reason about\nalternative expressions when interpreting language. Finally, we show that word\nlearning difficulty is dependent on frequency in models' input. 
Our findings\noffer evidence that it is possible to learn the meanings of function words in\nvisually grounded context by using non-symbolic general statistical learning\nalgorithms, without any prior knowledge of linguistic meaning.\n","authors":["Eva Portelance","Michael C. Frank","Dan Jurafsky"],"pdf_url":"https://arxiv.org/pdf/2308.08628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08625v1","updated":"2023-08-16T18:48:01Z","published":"2023-08-16T18:48:01Z","title":"BIOptimus: Pre-training an Optimal Biomedical Language Model with\n Curriculum Learning for Named Entity Recognition","summary":" Using language models (LMs) pre-trained in a self-supervised setting on large\ncorpora and then fine-tuning for a downstream task has helped to deal with the\nproblem of limited label data for supervised learning tasks such as Named\nEntity Recognition (NER). Recent research in biomedical language processing has\noffered a number of biomedical LMs pre-trained using different methods and\ntechniques that advance results on many BioNLP tasks, including NER. However,\nthere is still a lack of a comprehensive comparison of pre-training approaches\nthat would work more optimally in the biomedical domain. This paper aims to\ninvestigate different pre-training methods, such as pre-training the biomedical\nLM from scratch and pre-training it in a continued fashion. We compare existing\nmethods with our proposed pre-training method of initializing weights for new\ntokens by distilling existing weights from the BERT model inside the context\nwhere the tokens were found. The method helps to speed up the pre-training\nstage and improve performance on NER. In addition, we compare how masking rate,\ncorruption strategy, and masking strategies impact the performance of the\nbiomedical LM. Finally, using the insights from our experiments, we introduce a\nnew biomedical LM (BIOptimus), which is pre-trained using Curriculum Learning\n(CL) and contextualized weight distillation method. Our model sets new states\nof the art on several biomedical Named Entity Recognition (NER) tasks. We\nrelease our code and all pre-trained models\n","authors":["Pavlova Vera","Mohammed Makhlouf"],"pdf_url":"https://arxiv.org/pdf/2308.08625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08614v1","updated":"2023-08-16T18:13:27Z","published":"2023-08-16T18:13:27Z","title":"Boosting Logical Reasoning in Large Language Models through a New\n Framework: The Graph of Thought","summary":" Recent advancements in large-scale models, such as GPT-4, have showcased\nremarkable capabilities in addressing standard queries. However, when facing\ncomplex problems that require multi-step logical reasoning, their accuracy\ndramatically decreases. Current research has explored the realm of\n\\textit{prompting engineering} to bolster the inferential capacities of these\nmodels. Our paper unveils a pioneering prompting technique, dubbed\n\\textit{Graph of Thoughts (GoT)}. Through testing on a trio of escalating\nchallenges: the 24-point game, resolution of high-degree polynomial equations,\nand derivation of formulas for recursive sequences, our method outperformed\nGPT-4, achieving accuracy improvements of $89.7\\%$, $86\\%$, and $56\\%$ for each\nrespective task. 
Moreover, when juxtaposed with the state-of-the-art (SOTA)\nprompting method, \\textit{Tree of Thought (ToT)}, our approach registered an\naverage accuracy boost of $23\\%$, $24\\%$, and $15\\%$.\n","authors":["Bin Lei","pei-Hung Lin","Chunhua Liao","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.08614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08610v1","updated":"2023-08-16T18:03:22Z","published":"2023-08-16T18:03:22Z","title":"FootGPT : A Large Language Model Development Experiment on a Minimal\n Setting","summary":" With recent empirical observations, it has been argued that the most\nsignificant aspect of developing accurate language models may be the proper\ndataset content and training strategy compared to the number of neural\nparameters, training duration or dataset size. Following this argument, we\nopted to fine tune a one billion parameter size trained general purpose causal\nlanguage model with a dataset curated on team statistics of the Italian\nfootball league first ten game weeks, using low rank adaptation. The limited\ntraining dataset was compiled based on a framework where a powerful commercial\nlarge language model provides distilled paragraphs and question answer pairs as\nintended. The training duration was kept relatively short to provide a basis\nfor our minimal setting exploration. We share our key observations on the\nprocess related to developing a specific purpose language model which is\nintended to interpret soccer data with constrained resources in this article.\n","authors":["Eren Unlu"],"pdf_url":"https://arxiv.org/pdf/2308.08610v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08577v1","updated":"2023-08-16T06:28:29Z","published":"2023-08-16T06:28:29Z","title":"AffectEcho: Speaker Independent and Language-Agnostic Emotion and Affect\n Transfer for Speech Synthesis","summary":" Affect is an emotional characteristic encompassing valence, arousal, and\nintensity, and is a crucial attribute for enabling authentic conversations.\nWhile existing text-to-speech (TTS) and speech-to-speech systems rely on\nstrength embedding vectors and global style tokens to capture emotions, these\nmodels represent emotions as a component of style or represent them in discrete\ncategories. We propose AffectEcho, an emotion translation model, that uses a\nVector Quantized codebook to model emotions within a quantized space featuring\nfive levels of affect intensity to capture complex nuances and subtle\ndifferences in the same emotion. The quantized emotional embeddings are\nimplicitly derived from spoken speech samples, eliminating the need for one-hot\nvectors or explicit strength embeddings. Experimental results demonstrate the\neffectiveness of our approach in controlling the emotions of generated speech\nwhile preserving identity, style, and emotional cadence unique to each speaker.\nWe showcase the language-independent emotion modeling capability of the\nquantized emotional embeddings learned from a bilingual (English and Chinese)\nspeech corpus with an emotion transfer task from a reference speech to a target\nspeech. 
We achieve state-of-art results on both qualitative and quantitative\nmetrics.\n","authors":["Hrishikesh Viswanath","Aneesh Bhattacharya","Pascal Jutras-Dubé","Prerit Gupta","Mridu Prashanth","Yashvardhan Khaitan","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2308.08577v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.08545v1","updated":"2023-08-16T17:59:13Z","published":"2023-08-16T17:59:13Z","title":"TeCH: Text-guided Reconstruction of Lifelike Clothed Humans","summary":" Despite recent research advancements in reconstructing clothed humans from a\nsingle image, accurately restoring the \"unseen regions\" with high-level details\nremains an unsolved challenge that lacks attention. Existing methods often\ngenerate overly smooth back-side surfaces with a blurry texture. But how to\neffectively capture all visual attributes of an individual from a single image,\nwhich are sufficient to reconstruct unseen areas (e.g., the back view)?\nMotivated by the power of foundation models, TeCH reconstructs the 3D human by\nleveraging 1) descriptive text prompts (e.g., garments, colors, hairstyles)\nwhich are automatically generated via a garment parsing model and Visual\nQuestion Answering (VQA), 2) a personalized fine-tuned Text-to-Image diffusion\nmodel (T2I) which learns the \"indescribable\" appearance. To represent\nhigh-resolution 3D clothed humans at an affordable cost, we propose a hybrid 3D\nrepresentation based on DMTet, which consists of an explicit body shape grid\nand an implicit distance field. Guided by the descriptive prompts +\npersonalized T2I diffusion model, the geometry and texture of the 3D humans are\noptimized through multi-view Score Distillation Sampling (SDS) and\nreconstruction losses based on the original observation. TeCH produces\nhigh-fidelity 3D clothed humans with consistent & delicate texture, and\ndetailed full-body geometry. Quantitative and qualitative experiments\ndemonstrate that TeCH outperforms the state-of-the-art methods in terms of\nreconstruction accuracy and rendering quality. The code will be publicly\navailable for research purposes at https://huangyangyi.github.io/tech\n","authors":["Yangyi Huang","Hongwei Yi","Yuliang Xiu","Tingting Liao","Jiaxiang Tang","Deng Cai","Justus Thies"],"pdf_url":"https://arxiv.org/pdf/2308.08545v1.pdf","comment":"Project: https://huangyangyi.github.io/tech"},{"id":"http://arxiv.org/abs/2303.12791v2","updated":"2023-08-16T17:58:35Z","published":"2023-03-22T17:59:12Z","title":"SHERF: Generalizable Human NeRF from a Single Image","summary":" Existing Human NeRF methods for reconstructing 3D humans typically rely on\nmultiple 2D images from multi-view cameras or monocular videos captured from\nfixed camera views. However, in real-world scenarios, human images are often\ncaptured from random camera angles, presenting challenges for high-quality 3D\nhuman reconstruction. In this paper, we propose SHERF, the first generalizable\nHuman NeRF model for recovering animatable 3D humans from a single input image.\nSHERF extracts and encodes 3D human representations in canonical space,\nenabling rendering and animation from free views and poses. To achieve\nhigh-fidelity novel view and pose synthesis, the encoded 3D human\nrepresentations should capture both global appearance and local fine-grained\ntextures. To this end, we propose a bank of 3D-aware hierarchical features,\nincluding global, point-level, and pixel-aligned features, to facilitate\ninformative encoding. 
Global features enhance the information extracted from\nthe single input image and complement the information missing from the partial\n2D observation. Point-level features provide strong clues of 3D human\nstructure, while pixel-aligned features preserve more fine-grained details. To\neffectively integrate the 3D-aware hierarchical feature bank, we design a\nfeature fusion transformer. Extensive experiments on THuman, RenderPeople,\nZJU_MoCap, and HuMMan datasets demonstrate that SHERF achieves state-of-the-art\nperformance, with better generalizability for novel view and pose synthesis.\n","authors":["Shoukang Hu","Fangzhou Hong","Liang Pan","Haiyi Mei","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2303.12791v2.pdf","comment":"Accepted by ICCV2023. Project webpage:\n https://skhu101.github.io/SHERF/"},{"id":"http://arxiv.org/abs/2308.08544v1","updated":"2023-08-16T17:58:34Z","published":"2023-08-16T17:58:34Z","title":"MeViS: A Large-scale Benchmark for Video Segmentation with Motion\n Expressions","summary":" This paper strives for motion expressions guided video segmentation, which\nfocuses on segmenting objects in video content based on a sentence describing\nthe motion of the objects. Existing referring video object datasets typically\nfocus on salient objects and use language expressions that contain excessive\nstatic attributes that could potentially enable the target object to be\nidentified in a single frame. These datasets downplay the importance of motion\nin video content for language-guided video object segmentation. To investigate\nthe feasibility of using motion expressions to ground and segment objects in\nvideos, we propose a large-scale dataset called MeViS, which contains numerous\nmotion expressions to indicate target objects in complex environments. We\nbenchmarked 5 existing referring video object segmentation (RVOS) methods and\nconducted a comprehensive comparison on the MeViS dataset. The results show\nthat current RVOS methods cannot effectively address motion expression-guided\nvideo segmentation. We further analyze the challenges and propose a baseline\napproach for the proposed MeViS dataset. The goal of our benchmark is to\nprovide a platform that enables the development of effective language-guided\nvideo segmentation algorithms that leverage motion expressions as a primary cue\nfor object segmentation in complex video scenes. The proposed MeViS dataset has\nbeen released at https://henghuiding.github.io/MeViS.\n","authors":["Henghui Ding","Chang Liu","Shuting He","Xudong Jiang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2308.08544v1.pdf","comment":"ICCV 2023, Project Page: https://henghuiding.github.io/MeViS/"},{"id":"http://arxiv.org/abs/2308.08543v1","updated":"2023-08-16T17:58:28Z","published":"2023-08-16T17:58:28Z","title":"InsightMapper: A Closer Look at Inner-instance Information for\n Vectorized High-Definition Mapping","summary":" Vectorized high-definition (HD) maps contain detailed information about\nsurrounding road elements, which are crucial for various downstream tasks in\nmodern autonomous driving vehicles, such as vehicle planning and control.\nRecent works have attempted to directly detect the vectorized HD map as a point\nset prediction task, resulting in significant improvements in detection\nperformance. However, these approaches fail to analyze and exploit the\ninner-instance correlations between predicted points, impeding further\nadvancements. 
To address these challenges, we investigate the utilization of\ninner-$\\textbf{INS}$tance information for vectorized h$\\textbf{IGH}$-definition\nmapping through $\\textbf{T}$ransformers and introduce InsightMapper. This paper\npresents three novel designs within InsightMapper that leverage inner-instance\ninformation in distinct ways, including hybrid query generation, inner-instance\nquery fusion, and inner-instance feature aggregation. Comparative experiments\nare conducted on the NuScenes dataset, showcasing the superiority of our\nproposed method. InsightMapper surpasses previous state-of-the-art (SOTA)\nmethods by 5.78 mAP and 5.12 TOPO, which assess topology correctness.\nSimultaneously, InsightMapper maintains high efficiency during both training\nand inference phases, resulting in remarkable comprehensive performance. The\nproject page for this work is available at\nhttps://tonyxuqaq.github.io/projects/InsightMapper .\n","authors":["Zhenhua Xu","Kenneth K. Y. Wong","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.08543v1.pdf","comment":"Code and demo will be available at\n https://tonyxuqaq.github.io/projects/InsightMapper"},{"id":"http://arxiv.org/abs/2211.10946v2","updated":"2023-08-16T17:54:55Z","published":"2022-11-20T11:02:50Z","title":"Normalizing Flows for Human Pose Anomaly Detection","summary":" Video anomaly detection is an ill-posed problem because it relies on many\nparameters such as appearance, pose, camera angle, background, and more. We\ndistill the problem to anomaly detection of human pose, thus decreasing the\nrisk of nuisance parameters such as appearance affecting the result. Focusing\non pose alone also has the side benefit of reducing bias against distinct\nminority groups. Our model works directly on human pose graph sequences and is\nexceptionally lightweight (~1K parameters), capable of running on any machine\nable to run the pose estimation with negligible additional resources. We\nleverage the highly compact pose representation in a normalizing flows\nframework, which we extend to tackle the unique characteristics of\nspatio-temporal pose data and show its advantages in this use case. The\nalgorithm is quite general and can handle training data of only normal examples\nas well as a supervised setting that consists of labeled normal and abnormal\nexamples. We report state-of-the-art results on two anomaly detection\nbenchmarks - the unsupervised ShanghaiTech dataset and the recent supervised\nUBnormal dataset.\n","authors":["Or Hirschorn","Shai Avidan"],"pdf_url":"https://arxiv.org/pdf/2211.10946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09375v3","updated":"2023-08-16T17:54:39Z","published":"2023-03-16T15:04:10Z","title":"DINAR: Diffusion Inpainting of Neural Textures for One-Shot Human\n Avatars","summary":" We present DINAR, an approach for creating realistic rigged fullbody avatars\nfrom single RGB images. Similarly to previous works, our method uses neural\ntextures combined with the SMPL-X body model to achieve photo-realistic quality\nof avatars while keeping them easy to animate and fast to infer. To restore the\ntexture, we use a latent diffusion model and show how such model can be trained\nin the neural texture space. The use of the diffusion model allows us to\nrealistically reconstruct large unseen regions such as the back of a person\ngiven the frontal view. The models in our pipeline are trained using 2D images\nand videos only. 
In the experiments, our approach achieves state-of-the-art\nrendering quality and good generalization to new poses and viewpoints. In\nparticular, the approach improves state-of-the-art on the SnapshotPeople public\nbenchmark.\n","authors":["David Svitov","Dmitrii Gudkov","Renat Bashirov","Victor Lempitsky"],"pdf_url":"https://arxiv.org/pdf/2303.09375v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08530v1","updated":"2023-08-16T17:40:18Z","published":"2023-08-16T17:40:18Z","title":"Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an\n Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstructio","summary":" Neural Radiance Fields (NeRFs) have revolutionized the field of novel view\nsynthesis, demonstrating remarkable performance. However, the modeling and\nrendering of reflective objects remain challenging problems. Recent methods\nhave shown significant improvements over the baselines in handling reflective\nscenes, albeit at the expense of efficiency. In this work, we aim to strike a\nbalance between efficiency and quality. To this end, we investigate an\nimplicit-explicit approach based on conventional volume rendering to enhance\nthe reconstruction quality and accelerate the training and rendering processes.\nWe adopt an efficient density-based grid representation and reparameterize the\nreflected radiance in our pipeline. Our proposed reflection-aware approach\nachieves a competitive quality efficiency trade-off compared to competing\nmethods. Based on our experimental results, we propose and discuss hypotheses\nregarding the factors influencing the results of density-based methods for\nreconstructing reflective objects. The source code is available at:\nhttps://github.com/gkouros/ref-dvgo\n","authors":["Georgios Kouros","Minye Wu","Sushruth Nagesh","Shubham Shrivastava","Punarjay Chakravarty","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2308.08530v1.pdf","comment":"5 pages, 4 figures, 3 tables, ICCV TRICKY 2023 Workshop"},{"id":"http://arxiv.org/abs/2308.02716v2","updated":"2023-08-16T17:39:15Z","published":"2023-08-04T21:38:29Z","title":"EndoDepthL: Lightweight Endoscopic Monocular Depth Estimation with\n CNN-Transformer","summary":" In this study, we address the key challenges concerning the accuracy and\neffectiveness of depth estimation for endoscopic imaging, with a particular\nemphasis on real-time inference and the impact of light reflections. We propose\na novel lightweight solution named EndoDepthL that integrates Convolutional\nNeural Networks (CNN) and Transformers to predict multi-scale depth maps. Our\napproach includes optimizing the network architecture, incorporating\nmulti-scale dilated convolution, and a multi-channel attention mechanism. We\nalso introduce a statistical confidence boundary mask to minimize the impact of\nreflective areas. To better evaluate the performance of monocular depth\nestimation in endoscopic imaging, we propose a novel complexity evaluation\nmetric that considers network parameter size, floating-point operations, and\ninference frames per second. We comprehensively evaluate our proposed method\nand compare it with existing baseline solutions. 
The results demonstrate that\nEndoDepthL ensures depth estimation accuracy with a lightweight structure.\n","authors":["Yangke Li"],"pdf_url":"https://arxiv.org/pdf/2308.02716v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08529v1","updated":"2023-08-16T17:39:15Z","published":"2023-08-16T17:39:15Z","title":"Diagnosing Human-object Interaction Detectors","summary":" Although we have witnessed significant progress in human-object interaction\n(HOI) detection with increasingly high mAP (mean Average Precision), a single\nmAP score is too concise to obtain an informative summary of a model's\nperformance and to understand why one approach is better than another. In this\npaper, we introduce a diagnosis toolbox for analyzing the error sources of the\nexisting HOI detection models. We first conduct holistic investigations in the\npipeline of HOI detection, consisting of human-object pair detection and then\ninteraction classification. We define a set of errors and the oracles to fix\neach of them. By measuring the mAP improvement obtained from fixing an error\nusing its oracle, we can have a detailed analysis of the significance of\ndifferent errors. We then delve into the human-object detection and interaction\nclassification, respectively, and check the model's behavior. For the first\ndetection task, we investigate both recall and precision, measuring the\ncoverage of ground-truth human-object pairs as well as the noisiness level in\nthe detections. For the second classification task, we compute mAP for\ninteraction classification only, without considering the detection scores. We\nalso measure the performance of the models in differentiating human-object\npairs with and without actual interactions using the AP (Average Precision)\nscore. Our toolbox is applicable for different methods across different\ndatasets and available at https://github.com/neu-vi/Diag-HOI.\n","authors":["Fangrui Zhu","Yiming Xie","Weidi Xie","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08525v1","updated":"2023-08-16T17:26:47Z","published":"2023-08-16T17:26:47Z","title":"Likelihood-Based Text-to-Image Evaluation with Patch-Level Perceptual\n and Semantic Credit Assignment","summary":" Text-to-image synthesis has made encouraging progress and attracted lots of\npublic attention recently. However, popular evaluation metrics in this area,\nlike the Inception Score and Fr'echet Inception Distance, incur several issues.\nFirst of all, they cannot explicitly assess the perceptual quality of generated\nimages and poorly reflect the semantic alignment of each text-image pair. Also,\nthey are inefficient and need to sample thousands of images to stabilise their\nevaluation results. In this paper, we propose to evaluate text-to-image\ngeneration performance by directly estimating the likelihood of the generated\nimages using a pre-trained likelihood-based text-to-image generative model,\ni.e., a higher likelihood indicates better perceptual quality and better\ntext-image alignment. To prevent the likelihood of being dominated by the\nnon-crucial part of the generated image, we propose several new designs to\ndevelop a credit assignment strategy based on the semantic and perceptual\nsignificance of the image patches. In the experiments, we evaluate the proposed\nmetric on multiple popular text-to-image generation models and datasets in\naccessing both the perceptual quality and the text-image alignment. 
Moreover,\nit can successfully assess the generation ability of these models with as few\nas a hundred samples, making it very efficient in practice.\n","authors":["Qi Chen","Chaorui Deng","Zixiong Huang","Bowen Zhang","Mingkui Tan","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08520v1","updated":"2023-08-16T17:18:30Z","published":"2023-08-16T17:18:30Z","title":"Painter: Teaching Auto-regressive Language Models to Draw Sketches","summary":" Large language models (LLMs) have made tremendous progress in natural\nlanguage understanding and they have also been successfully adopted in other\ndomains such as computer vision, robotics, reinforcement learning, etc. In this\nwork, we apply LLMs to image generation tasks by directly generating the\nvirtual brush strokes to paint an image. We present Painter, an LLM that can\nconvert user prompts in text description format to sketches by generating the\ncorresponding brush strokes in an auto-regressive way. We construct Painter\nbased on off-the-shelf LLM that is pre-trained on a large text corpus, by\nfine-tuning it on the new task while preserving language understanding\ncapabilities. We create a dataset of diverse multi-object sketches paired with\ntextual prompts that covers several object types and tasks. Painter can\ngenerate sketches from text descriptions, remove objects from canvas, and\ndetect and classify objects in sketches. Although this is an unprecedented\npioneering work in using LLMs for auto-regressive image generation, the results\nare very encouraging.\n","authors":["Reza Pourreza","Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2308.08520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08518v1","updated":"2023-08-16T17:13:45Z","published":"2023-08-16T17:13:45Z","title":"Exploiting Point-Wise Attention in 6D Object Pose Estimation Based on\n Bidirectional Prediction","summary":" Traditional geometric registration based estimation methods only exploit the\nCAD model implicitly, which leads to their dependence on observation quality\nand deficiency to occlusion.To address the problem,the paper proposes a\nbidirectional correspondence prediction network with a point-wise\nattention-aware mechanism. This network not only requires the model points to\npredict the correspondence but also explicitly models the geometric\nsimilarities between observations and the model prior.} Our key insight is that\nthe correlations between each model point and scene point provide essential\ninformation for learning point-pair matches. To further tackle the correlation\nnoises brought by feature distribution divergence, we design a simple but\neffective pseudo-siamese network to improve feature homogeneity.Experimental\nresults on the public datasets of LineMOD, YCB-Video, and Occ-LineMOD show that\nthe proposed method achieves better performance than other state-of-the-art\nmethods under the same evaluation criteria. 
Its robustness in estimating poses\nis greatly improved, especially in an environment with severe occlusions.\n","authors":["Yuhao Yang","Jun Wu","Guangjian Zhang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.08518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.12196v2","updated":"2023-08-16T17:09:41Z","published":"2022-04-26T10:00:28Z","title":"Adaptive Split-Fusion Transformer","summary":" Neural networks for visual content understanding have recently evolved from\nconvolutional ones (CNNs) to transformers. The prior (CNN) relies on\nsmall-windowed kernels to capture the regional clues, demonstrating solid local\nexpressiveness. On the contrary, the latter (transformer) establishes\nlong-range global connections between localities for holistic learning.\nInspired by this complementary nature, there is a growing interest in designing\nhybrid models to best utilize each technique. Current hybrids merely replace\nconvolutions as simple approximations of linear projection or juxtapose a\nconvolution branch with attention, without concerning the importance of\nlocal/global modeling. To tackle this, we propose a new hybrid named Adaptive\nSplit-Fusion Transformer (ASF-former) to treat convolutional and attention\nbranches differently with adaptive weights. Specifically, an ASF-former encoder\nequally splits feature channels into half to fit dual-path inputs. Then, the\noutputs of dual-path are fused with weighting scalars calculated from visual\ncues. We also design the convolutional path compactly for efficiency concerns.\nExtensive experiments on standard benchmarks, such as ImageNet-1K, CIFAR-10,\nand CIFAR-100, show that our ASF-former outperforms its CNN, transformer\ncounterparts, and hybrid pilots in terms of accuracy (83.9% on ImageNet-1K),\nunder similar conditions (12.9G MACs/56.7M Params, without large-scale\npre-training). The code is available at:\nhttps://github.com/szx503045266/ASF-former.\n","authors":["Zixuan Su","Hao Zhang","Jingjing Chen","Lei Pang","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2204.12196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08511v1","updated":"2023-08-16T17:07:40Z","published":"2023-08-16T17:07:40Z","title":"Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse\n Problems","summary":" Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial\ntechnologies in the field of medical imaging. Score-based models have proven to\nbe effective in addressing different inverse problems encountered in CT and\nMRI, such as sparse-view CT and fast MRI reconstruction. However, these models\nface challenges in achieving accurate three dimensional (3D) volumetric\nreconstruction. The existing score-based models primarily focus on\nreconstructing two dimensional (2D) data distribution, leading to\ninconsistencies between adjacent slices in the reconstructed 3D volumetric\nimages. To overcome this limitation, we propose a novel two-and-a-half order\nscore-based model (TOSM). During the training phase, our TOSM learns data\ndistributions in 2D space, which reduces the complexity of training compared to\ndirectly working on 3D volumes. However, in the reconstruction phase, the TOSM\nupdates the data distribution in 3D space, utilizing complementary scores along\nthree directions (sagittal, coronal, and transaxial) to achieve a more precise\nreconstruction. The development of TOSM is built on robust theoretical\nprinciples, ensuring its reliability and efficacy. 
Through extensive\nexperimentation on large-scale sparse-view CT and fast MRI datasets, our method\ndemonstrates remarkable advancements and attains state-of-the-art results in\nsolving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively\naddresses the inter-slice inconsistency issue, resulting in high-quality 3D\nvolumetric reconstruction.\n","authors":["Zirong Li","Yanyang Wang","Jianjia Zhang","Weiwen Wu","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08504v1","updated":"2023-08-16T16:58:25Z","published":"2023-08-16T16:58:25Z","title":"ResBuilder: Automated Learning of Depth with Residual Structures","summary":" In this work, we develop a neural architecture search algorithm, termed\nResbuilder, that develops ResNet architectures from scratch that achieve high\naccuracy at moderate computational cost. It can also be used to modify existing\narchitectures and has the capability to remove and insert ResNet blocks, in\nthis way searching for suitable architectures in the space of ResNet\narchitectures. In our experiments on different image classification datasets,\nResbuilder achieves close to state-of-the-art performance while saving\ncomputational cost compared to off-the-shelf ResNets. Noteworthy, we once tune\nthe parameters on CIFAR10 which yields a suitable default choice for all other\ndatasets. We demonstrate that this property generalizes even to industrial\napplications by applying our method with default parameters on a proprietary\nfraud detection dataset.\n","authors":["Julian Burghoff","Matthias Rottmann","Jill von Conta","Sebastian Schoenen","Andreas Witte","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2308.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08495v1","updated":"2023-08-16T16:49:50Z","published":"2023-08-16T16:49:50Z","title":"Self-Supervised Online Camera Calibration for Automated Driving and\n Parking Applications","summary":" Camera-based perception systems play a central role in modern autonomous\nvehicles. These camera based perception algorithms require an accurate\ncalibration to map the real world distances to image pixels. In practice,\ncalibration is a laborious procedure requiring specialised data collection and\ncareful tuning. This process must be repeated whenever the parameters of the\ncamera change, which can be a frequent occurrence in autonomous vehicles. Hence\nthere is a need to calibrate at regular intervals to ensure the camera is\naccurate. Proposed is a deep learning framework to learn intrinsic and\nextrinsic calibration of the camera in real time. The framework is\nself-supervised and doesn't require any labelling or supervision to learn the\ncalibration parameters. The framework learns calibration without the need for\nany physical targets or to drive the car on special planar surfaces.\n","authors":["Ciarán Hogan","Ganesh Sistu","Ciarán Eising"],"pdf_url":"https://arxiv.org/pdf/2308.08495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08479v1","updated":"2023-08-16T16:37:02Z","published":"2023-08-16T16:37:02Z","title":"DeDoDe: Detect, Don't Describe -- Describe, Don't Detect for Local\n Feature Matching","summary":" Keypoint detection is a pivotal step in 3D reconstruction, whereby sets of\n(up to) K points are detected in each view of a scene. Crucially, the detected\npoints need to be consistent between views, i.e., correspond to the same 3D\npoint in the scene. 
One of the main challenges with keypoint detection is the\nformulation of the learning objective. Previous learning-based methods\ntypically jointly learn descriptors with keypoints, and treat the keypoint\ndetection as a binary classification task on mutual nearest neighbours.\nHowever, basing keypoint detection on descriptor nearest neighbours is a proxy\ntask, which is not guaranteed to produce 3D-consistent keypoints. Furthermore,\nthis ties the keypoints to a specific descriptor, complicating downstream\nusage. In this work, we instead learn keypoints directly from 3D consistency.\nTo this end, we train the detector to detect tracks from large-scale SfM. As\nthese points are often overly sparse, we derive a semi-supervised two-view\ndetection objective to expand this set to a desired number of detections. To\ntrain a descriptor, we maximize the mutual nearest neighbour objective over the\nkeypoints with a separate network. Results show that our approach, DeDoDe,\nachieves significant gains on multiple geometry benchmarks. Code is provided at\nhttps://github.com/Parskatt/DeDoDe .\n","authors":["Johan Edstedt","Georg Bökman","Mårten Wadenbäck","Michael Felsberg"],"pdf_url":"https://arxiv.org/pdf/2308.08479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08476v1","updated":"2023-08-16T16:31:36Z","published":"2023-08-16T16:31:36Z","title":"Classification Committee for Active Deep Object Detection","summary":" In object detection, the cost of labeling is much high because it needs not\nonly to confirm the categories of multiple objects in an image but also to\naccurately determine the bounding boxes of each object. Thus, integrating\nactive learning into object detection will raise pretty positive significance.\nIn this paper, we propose a classification committee for active deep object\ndetection method by introducing a discrepancy mechanism of multiple classifiers\nfor samples' selection when training object detectors. The model contains a\nmain detector and a classification committee. The main detector denotes the\ntarget object detector trained from a labeled pool composed of the selected\ninformative images. The role of the classification committee is to select the\nmost informative images according to their uncertainty values from the view of\nclassification, which is expected to focus more on the discrepancy and\nrepresentative of instances. Specifically, they compute the uncertainty for a\nspecified instance within the image by measuring its discrepancy output by the\ncommittee pre-trained via the proposed Maximum Classifiers Discrepancy Group\nLoss (MCDGL). The most informative images are finally determined by selecting\nthe ones with many high-uncertainty instances. Besides, to mitigate the impact\nof interference instances, we design a Focus on Positive Instances Loss (FPIL)\nto make the committee the ability to automatically focus on the representative\ninstances as well as precisely encode their discrepancies for the same\ninstance. Experiments are conducted on Pascal VOC and COCO datasets versus some\npopular object detectors. 
And results show that our method outperforms the\nstate-of-the-art active learning methods, which verifies the effectiveness of\nthe proposed method.\n","authors":["Lei Zhao","Bo Li","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2308.08476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08465v1","updated":"2023-08-16T16:09:23Z","published":"2023-08-16T16:09:23Z","title":"Hierarchical Uncertainty Estimation for Medical Image Segmentation\n Networks","summary":" Learning a medical image segmentation model is an inherently ambiguous task,\nas uncertainties exist in both images (noise) and manual annotations (human\nerrors and bias) used for model training. To build a trustworthy image\nsegmentation model, it is important to not just evaluate its performance but\nalso estimate the uncertainty of the model prediction. Most state-of-the-art\nimage segmentation networks adopt a hierarchical encoder architecture,\nextracting image features at multiple resolution levels from fine to coarse. In\nthis work, we leverage this hierarchical image representation and propose a\nsimple yet effective method for estimating uncertainties at multiple levels.\nThe multi-level uncertainties are modelled via the skip-connection module and\nthen sampled to generate an uncertainty map for the predicted image\nsegmentation. We demonstrate that a deep learning segmentation network such as\nU-net, when implemented with such hierarchical uncertainty estimation module,\ncan achieve a high segmentation performance, while at the same time provide\nmeaningful uncertainty maps that can be used for out-of-distribution detection.\n","authors":["Xinyu Bai","Wenjia Bai"],"pdf_url":"https://arxiv.org/pdf/2308.08465v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08463v1","updated":"2023-08-16T16:08:22Z","published":"2023-08-16T16:08:22Z","title":"Learning to Distill Global Representation for Sparse-View CT","summary":" Sparse-view computed tomography (CT) -- using a small number of projections\nfor tomographic reconstruction -- enables much lower radiation dose to patients\nand accelerated data acquisition. The reconstructed images, however, suffer\nfrom strong artifacts, greatly limiting their diagnostic value. Current trends\nfor sparse-view CT turn to the raw data for better information recovery. The\nresultant dual-domain methods, nonetheless, suffer from secondary artifacts,\nespecially in ultra-sparse view scenarios, and their generalization to other\nscanners/protocols is greatly limited. A crucial question arises: have the\nimage post-processing methods reached the limit? Our answer is not yet. In this\npaper, we stick to image post-processing methods due to great flexibility and\npropose global representation (GloRe) distillation framework for sparse-view\nCT, termed GloReDi. First, we propose to learn GloRe with Fourier convolution,\nso each element in GloRe has an image-wide receptive field. Second, unlike\nmethods that only use the full-view images for supervision, we propose to\ndistill GloRe from intermediate-view reconstructed images that are readily\navailable but not explored in previous literature. The success of GloRe\ndistillation is attributed to two key components: representation directional\ndistillation to align the GloRe directions, and band-pass-specific contrastive\ndistillation to gain clinically important details. Extensive experiments\ndemonstrate the superiority of the proposed GloReDi over the state-of-the-art\nmethods, including dual-domain ones. 
The source code is available at\nhttps://github.com/longzilicart/GloReDi.\n","authors":["Zilong Li","Chenglong Ma","Jie Chen","Junping Zhang","Hongming shan"],"pdf_url":"https://arxiv.org/pdf/2308.08463v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2304.01752v2","updated":"2023-08-16T15:54:54Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaption\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08443v1","updated":"2023-08-16T15:51:05Z","published":"2023-08-16T15:51:05Z","title":"High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement:\n Establishing a Novel Baseline and Benchmark","summary":" The extraction of lakes from remote sensing images is a complex challenge due\nto the varied lake shapes and data noise. Current methods rely on multispectral\nimage datasets, making it challenging to learn lake features accurately from\npixel arrangements. This, in turn, affects model learning and the creation of\naccurate segmentation masks. This paper introduces a unified prompt-based\ndataset construction approach that provides approximate lake locations using\npoint, box, and mask prompts. We also propose a two-stage prompt enhancement\nframework, LEPrompter, which involves prompt-based and prompt-free stages\nduring training. The prompt-based stage employs a prompt encoder to extract\nprior information, integrating prompt tokens and image embeddings through self-\nand cross-attention in the prompt decoder. Prompts are deactivated once the\nmodel is trained to ensure independence during inference, enabling automated\nlake extraction. Evaluations on Surface Water and Qinghai-Tibet Plateau Lake\ndatasets show consistent performance improvements compared to the previous\nstate-of-the-art method. 
LEPrompter achieves mIoU scores of 91.48% and 97.43%\non the respective datasets without introducing additional parameters or GFLOPs.\nSupplementary materials provide the source code, pre-trained models, and\ndetailed user studies.\n","authors":["Ben Chen","Xuechao Zou","Kai Li","Yu Zhang","Junliang Xing","Pin Tao"],"pdf_url":"https://arxiv.org/pdf/2308.08443v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2009.07140v2","updated":"2023-08-16T15:40:23Z","published":"2020-09-15T14:51:10Z","title":"HGCN-GJS: Hierarchical Graph Convolutional Network with Groupwise Joint\n Sampling for Trajectory Prediction","summary":" Accurate pedestrian trajectory prediction is of great importance for\ndownstream tasks such as autonomous driving and mobile robot navigation. Fully\ninvestigating the social interactions within the crowd is crucial for accurate\npedestrian trajectory prediction. However, most existing methods do not capture\ngroup level interactions well, focusing only on pairwise interactions and\nneglecting group-wise interactions. In this work, we propose a hierarchical\ngraph convolutional network, HGCN-GJS, for trajectory prediction which well\nleverages group level interactions within the crowd. Furthermore, we introduce\na novel joint sampling scheme for modeling the joint distribution of multiple\npedestrians in the future trajectories. Based on the group information, this\nscheme associates the trajectory of one person with the trajectory of other\npeople in the group, but maintains the independence of the trajectories of\noutsiders. We demonstrate the performance of our network on several trajectory\nprediction datasets, achieving state-of-the-art results on all datasets\nconsidered.\n","authors":["Yuying Chen","Congcong Liu","Xiaodong Mei","Bertram E. Shi","Ming Liu"],"pdf_url":"https://arxiv.org/pdf/2009.07140v2.pdf","comment":"8 pages, 8 figures, accepted by IROS 2022"},{"id":"http://arxiv.org/abs/2308.08431v1","updated":"2023-08-16T15:23:14Z","published":"2023-08-16T15:23:14Z","title":"Integrating Visual and Semantic Similarity Using Hierarchies for Image\n Retrieval","summary":" Most of the research in content-based image retrieval (CBIR) focus on\ndeveloping robust feature representations that can effectively retrieve\ninstances from a database of images that are visually similar to a query.\nHowever, the retrieved images sometimes contain results that are not\nsemantically related to the query. To address this, we propose a method for\nCBIR that captures both visual and semantic similarity using a visual\nhierarchy. The hierarchy is constructed by merging classes with overlapping\nfeatures in the latent space of a deep neural network trained for\nclassification, assuming that overlapping classes share high visual and\nsemantic similarities. Finally, the constructed hierarchy is integrated into\nthe distance calculation metric for similarity search. 
Experiments on standard\ndatasets: CUB-200-2011 and CIFAR100, and a real-life use case using diatom\nmicroscopy images show that our method achieves superior performance compared\nto the existing methods on image retrieval.\n","authors":["Aishwarya Venkataramanan","Martin Laviale","Cédric Pradalier"],"pdf_url":"https://arxiv.org/pdf/2308.08431v1.pdf","comment":"Accepted in ICVS 2023"},{"id":"http://arxiv.org/abs/2308.08428v1","updated":"2023-08-16T15:19:52Z","published":"2023-08-16T15:19:52Z","title":"ALIP: Adaptive Language-Image Pre-training with Synthetic Caption","summary":" Contrastive Language-Image Pre-training (CLIP) has significantly boosted the\nperformance of various vision-language tasks by scaling up the dataset with\nimage-text pairs collected from the web. However, the presence of intrinsic\nnoise and unmatched image-text pairs in web data can potentially affect the\nperformance of representation learning. To address this issue, we first utilize\nthe OFA model to generate synthetic captions that focus on the image content.\nThe generated captions contain complementary information that is beneficial for\npre-training. Then, we propose an Adaptive Language-Image Pre-training (ALIP),\na bi-path model that integrates supervision from both raw text and synthetic\ncaption. As the core components of ALIP, the Language Consistency Gate (LCG)\nand Description Consistency Gate (DCG) dynamically adjust the weights of\nsamples and image-text/caption pairs during the training process. Meanwhile,\nthe adaptive contrastive loss can effectively reduce the impact of noise data\nand enhances the efficiency of pre-training data. We validate ALIP with\nexperiments on different scales of models and pre-training datasets.\nExperiments results show that ALIP achieves state-of-the-art performance on\nmultiple downstream tasks including zero-shot image-text retrieval and linear\nprobe. To facilitate future research, the code and pre-trained models are\nreleased at https://github.com/deepglint/ALIP.\n","authors":["Kaicheng Yang","Jiankang Deng","Xiang An","Jiawei Li","Ziyong Feng","Jia Guo","Jing Yang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08428v1.pdf","comment":"15pages, 10figures, ICCV2023"},{"id":"http://arxiv.org/abs/2307.12502v2","updated":"2023-08-16T15:19:49Z","published":"2023-07-24T03:27:41Z","title":"Cross Contrasting Feature Perturbation for Domain Generalization","summary":" Domain generalization (DG) aims to learn a robust model from source domains\nthat generalize well on unseen target domains. Recent studies focus on\ngenerating novel domain samples or features to diversify distributions\ncomplementary to source domains. Yet, these approaches can hardly deal with the\nrestriction that the samples synthesized from various domains can cause\nsemantic distortion. In this paper, we propose an online one-stage Cross\nContrasting Feature Perturbation (CCFP) framework to simulate domain shift by\ngenerating perturbed features in the latent space while regularizing the model\nprediction against domain shift. Different from the previous fixed synthesizing\nstrategy, we design modules with learnable feature perturbations and semantic\nconsistency constraints. In contrast to prior work, our method does not use any\ngenerative-based models or domain labels. We conduct extensive experiments on a\nstandard DomainBed benchmark with a strict evaluation protocol for a fair\ncomparison. 
Comprehensive experiments show that our method outperforms the\nprevious state-of-the-art, and quantitative analyses illustrate that our\napproach can alleviate the domain shift problem in out-of-distribution (OOD)\nscenarios.\n","authors":["Chenming Li","Daoan Zhang","Wenjian Huang","Jianguo Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12502v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09703v3","updated":"2023-08-16T15:16:43Z","published":"2022-11-17T17:38:55Z","title":"EfficientTrain: Exploring Generalized Curriculum Learning for Training\n Visual Backbones","summary":" The superior performance of modern deep networks usually comes with a costly\ntraining procedure. This paper presents a new curriculum learning approach for\nthe efficient training of visual backbones (e.g., vision Transformers). Our\nwork is inspired by the inherent learning dynamics of deep networks: we\nexperimentally show that at an earlier training stage, the model mainly learns\nto recognize some 'easier-to-learn' discriminative patterns within each\nexample, e.g., the lower-frequency components of images and the original\ninformation before data augmentation. Driven by this phenomenon, we propose a\ncurriculum where the model always leverages all the training data at each\nepoch, while the curriculum starts with only exposing the 'easier-to-learn'\npatterns of each example, and introduces gradually more difficult patterns. To\nimplement this idea, we 1) introduce a cropping operation in the Fourier\nspectrum of the inputs, which enables the model to learn from only the\nlower-frequency components efficiently, 2) demonstrate that exposing the\nfeatures of original images amounts to adopting weaker data augmentation, and\n3) integrate 1) and 2) and design a curriculum learning schedule with a\ngreedy-search algorithm. The resulting approach, EfficientTrain, is simple,\ngeneral, yet surprisingly effective. As an off-the-shelf method, it reduces the\nwall-time training cost of a wide variety of popular models (e.g., ResNet,\nConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without\nsacrificing accuracy. It is also effective for self-supervised learning (e.g.,\nMAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain.\n","authors":["Yulin Wang","Yang Yue","Rui Lu","Tianjiao Liu","Zhao Zhong","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2211.09703v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08414v1","updated":"2023-08-16T15:00:50Z","published":"2023-08-16T15:00:50Z","title":"Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer","summary":" Video-language pre-trained models have shown remarkable success in guiding\nvideo question-answering (VideoQA) tasks. However, due to the length of video\nsequences, training large-scale video-based models incurs considerably higher\ncosts than training image-based ones. This motivates us to leverage the\nknowledge from image-based pretraining, despite the obvious gaps between image\nand video domains. To bridge these gaps, in this paper, we propose Tem-Adapter,\nwhich enables the learning of temporal dynamics and complex semantics by a\nvisual Temporal Aligner and a textual Semantic Aligner. 
Unlike conventional\npretrained knowledge adaptation methods that only concentrate on the downstream\ntask objective, the Temporal Aligner introduces an extra language-guided\nautoregressive task aimed at facilitating the learning of temporal\ndependencies, with the objective of predicting future states based on\nhistorical clues and language guidance that describes event progression.\nBesides, to reduce the semantic gap and adapt the textual representation for\nbetter event description, we introduce a Semantic Aligner that first designs a\ntemplate to fuse question and answer pairs as event descriptions and then\nlearns a Transformer decoder with the whole video sequence as guidance for\nrefinement. We evaluate Tem-Adapter and different pre-train transferring\nmethods on two VideoQA benchmarks, and the significant performance improvement\ndemonstrates the effectiveness of our method.\n","authors":["Guangyi Chen","Xiao Liu","Guangrun Wang","Kun Zhang","Philip H. S. Torr","Xiao-Ping Zhang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2308.08414v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.09472v3","updated":"2023-08-16T14:36:41Z","published":"2023-03-16T16:47:14Z","title":"DiffIR: Efficient Diffusion Model for Image Restoration","summary":" Diffusion model (DM) has achieved SOTA performance by modeling the image\nsynthesis process into a sequential application of a denoising network.\nHowever, different from image synthesis, image restoration (IR) has a strong\nconstraint to generate results in accordance with ground-truth. Thus, for IR,\ntraditional DMs running massive iterations on a large model to estimate whole\nimages or feature maps is inefficient. To address this issue, we propose an\nefficient DM for IR (DiffIR), which consists of a compact IR prior extraction\nnetwork (CPEN), dynamic IR transformer (DIRformer), and denoising network.\nSpecifically, DiffIR has two training stages: pretraining and training DM. In\npretraining, we input ground-truth images into CPEN$_{S1}$ to capture a compact\nIR prior representation (IPR) to guide DIRformer. In the second stage, we train\nthe DM to directly estimate the same IRP as pretrained CPEN$_{S1}$ only using\nLQ images. We observe that since the IPR is only a compact vector, DiffIR can\nuse fewer iterations than traditional DM to obtain accurate estimations and\ngenerate more stable and realistic results. Since the iterations are few, our\nDiffIR can adopt a joint optimization of CPEN$_{S2}$, DIRformer, and denoising\nnetwork, which can further reduce the estimation error influence. We conduct\nextensive experiments on several IR tasks and achieve SOTA performance while\nconsuming less computational costs. Code is available at\n\\url{https://github.com/Zj-BinXia/DiffIR}.\n","authors":["Bin Xia","Yulun Zhang","Shiyin Wang","Yitong Wang","Xinglong Wu","Yapeng Tian","Wenming Yang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.09472v3.pdf","comment":"This paper is accepted by ICCV2023. 
Codes and models are available at\n https://github.com/Zj-BinXia/DiffIR"},{"id":"http://arxiv.org/abs/2308.08396v1","updated":"2023-08-16T14:28:36Z","published":"2023-08-16T14:28:36Z","title":"Prediction of post-radiotherapy recurrence volumes in head and neck\n squamous cell carcinoma using 3D U-Net segmentation","summary":" Locoregional recurrences (LRR) are still a frequent site of treatment failure\nfor head and neck squamous cell carcinoma (HNSCC) patients.\n Identification of high risk subvolumes based on pretreatment imaging is key\nto biologically targeted radiation therapy. We investigated the extent to which\na Convolutional neural network (CNN) is able to predict LRR volumes based on\npre-treatment 18F-fluorodeoxyglucose positron emission tomography\n(FDG-PET)/computed tomography (CT) scans in HNSCC patients and thus the\npotential to identify biological high risk volumes using CNNs.\n For 37 patients who had undergone primary radiotherapy for oropharyngeal\nsquamous cell carcinoma, five oncologists contoured the relapse volumes on\nrecurrence CT scans. Datasets of pre-treatment FDG-PET/CT, gross tumour volume\n(GTV) and contoured relapse for each of the patients were randomly divided into\ntraining (n=23), validation (n=7) and test (n=7) datasets. We compared a CNN\ntrained from scratch, a pre-trained CNN, a SUVmax threshold approach, and using\nthe GTV directly.\n The SUVmax threshold method included 5 out of the 7 relapse origin points\nwithin a volume of median 4.6 cubic centimetres (cc). Both the GTV contour and\nbest CNN segmentations included the relapse origin 6 out of 7 times with median\nvolumes of 28 and 18 cc respectively.\n The CNN included the same or greater number of relapse volume POs, with\nsignificantly smaller relapse volumes. Our novel findings indicate that CNNs\nmay predict LRR, yet further work on dataset development is required to attain\nclinically useful prediction accuracy.\n","authors":["Denis Kutnár","Ivan R Vogelius","Katrin Elisabet Håkansson","Jens Petersen","Jeppe Friborg","Lena Specht","Mogens Bernsdorf","Anita Gothelf","Claus Kristensen","Abraham George Smith"],"pdf_url":"https://arxiv.org/pdf/2308.08396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08393v1","updated":"2023-08-16T14:25:30Z","published":"2023-08-16T14:25:30Z","title":"SIGMA: Scale-Invariant Global Sparse Shape Matching","summary":" We propose a novel mixed-integer programming (MIP) formulation for generating\nprecise sparse correspondences for highly non-rigid shapes. To this end, we\nintroduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic\nand extrinsic geometric information to measure the deformation quality induced\nby predicted correspondences. We integrate the PLBO, together with an\norientation-aware regulariser, into a novel MIP formulation that can be solved\nto global optimality for many practical problems. In contrast to previous\nmethods, our approach is provably invariant to rigid transformations and global\nscaling, initialisation-free, has optimality guarantees, and scales to high\nresolution meshes with (empirically observed) linear time. 
We show\nstate-of-the-art results for sparse non-rigid matching on several challenging\n3D datasets, including data with inconsistent meshing, as well as applications\nin mesh-to-point-cloud matching.\n","authors":["Maolin Gao","Paul Roetzer","Marvin Eisenberger","Zorah Lähner","Michael Moeller","Daniel Cremers","Florian Bernard"],"pdf_url":"https://arxiv.org/pdf/2308.08393v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2303.09219v2","updated":"2023-08-16T14:12:42Z","published":"2023-03-16T10:48:59Z","title":"MixCycle: Mixup Assisted Semi-Supervised 3D Single Object Tracking with\n Cycle Consistency","summary":" 3D single object tracking (SOT) is an indispensable part of automated\ndriving. Existing approaches rely heavily on large, densely labeled datasets.\nHowever, annotating point clouds is both costly and time-consuming. Inspired by\nthe great success of cycle tracking in unsupervised 2D SOT, we introduce the\nfirst semi-supervised approach to 3D SOT. Specifically, we introduce two\ncycle-consistency strategies for supervision: 1) Self tracking cycles, which\nleverage labels to help the model converge better in the early stages of\ntraining; 2) forward-backward cycles, which strengthen the tracker's robustness\nto motion variations and the template noise caused by the template update\nstrategy. Furthermore, we propose a data augmentation strategy named SOTMixup\nto improve the tracker's robustness to point cloud diversity. SOTMixup\ngenerates training samples by sampling points in two point clouds with a mixing\nrate and assigns a reasonable loss weight for training according to the mixing\nrate. The resulting MixCycle approach generalizes to appearance matching-based\ntrackers. On the KITTI benchmark, based on the P2B tracker, MixCycle trained\nwith $\\textbf{10\\%}$ labels outperforms P2B trained with $\\textbf{100\\%}$\nlabels, and achieves a $\\textbf{28.4\\%}$ precision improvement when using\n$\\textbf{1\\%}$ labels. Our code will be released at\n\\url{https://github.com/Mumuqiao/MixCycle}.\n","authors":["Qiao Wu","Jiaqi Yang","Kun Sun","Chu'ai Zhang","Yanning Zhang","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2303.09219v2.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2308.08380v1","updated":"2023-08-16T14:09:39Z","published":"2023-08-16T14:09:39Z","title":"Robust Autonomous Vehicle Pursuit without Expert Steering Labels","summary":" In this work, we present a learning method for lateral and longitudinal\nmotion control of an ego-vehicle for vehicle pursuit. The car being controlled\ndoes not have a pre-defined route, rather it reactively adapts to follow a\ntarget vehicle while maintaining a safety distance. To train our model, we do\nnot rely on steering labels recorded from an expert driver but effectively\nleverage a classical controller as an offline label generation tool. In\naddition, we account for the errors in the predicted control values, which can\nlead to a loss of tracking and catastrophic crashes of the controlled vehicle.\nTo this end, we propose an effective data augmentation approach, which allows\nto train a network capable of handling different views of the target vehicle.\nDuring the pursuit, the target vehicle is firstly localized using a\nConvolutional Neural Network. The network takes a single RGB image along with\ncars' velocities and estimates the target vehicle's pose with respect to the\nego-vehicle. 
This information is then fed to a Multi-Layer Perceptron, which\nregresses the control commands for the ego-vehicle, namely throttle and\nsteering angle. We extensively validate our approach using the CARLA simulator\non a wide range of terrains. Our method demonstrates real-time performance and\nrobustness to different scenarios including unseen trajectories and high route\ncompletion. The project page containing code and multimedia can be publicly\naccessed here: https://changyaozhou.github.io/Autonomous-Vehicle-Pursuit/.\n","authors":["Jiaxin Pan","Changyao Zhou","Mariia Gladkova","Qadeer Khan","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2308.08380v1.pdf","comment":"9 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.08376v1","updated":"2023-08-16T13:59:43Z","published":"2023-08-16T13:59:43Z","title":"Automated Semiconductor Defect Inspection in Scanning Electron\n Microscope Images: a Systematic Review","summary":" A growing need exists for efficient and accurate methods for detecting\ndefects in semiconductor materials and devices. These defects can have a\ndetrimental impact on the efficiency of the manufacturing process, because they\ncause critical failures and wafer-yield limitations. As nodes and patterns get\nsmaller, even high-resolution imaging techniques such as Scanning Electron\nMicroscopy (SEM) produce noisy images due to operating close to sensitivity\nlevels and due to varying physical properties of different underlayers or\nresist materials. This inherent noise is one of the main challenges for defect\ninspection. One promising approach is the use of machine learning algorithms,\nwhich can be trained to accurately classify and locate defects in semiconductor\nsamples. Recently, convolutional neural networks have proved to be particularly\nuseful in this regard. This systematic review provides a comprehensive overview\nof the state of automated semiconductor defect inspection on SEM images,\nincluding the most recent innovations and developments. 38 publications were\nselected on this topic, indexed in IEEE Xplore and SPIE databases. For each of\nthese, the application, methodology, dataset, results, limitations and future\nwork were summarized. A comprehensive overview and analysis of their methods is\nprovided. Finally, promising avenues for future work in the field of SEM-based\ndefect inspection are suggested.\n","authors":["Thibault Lechien","Enrique Dehaerne","Bappaditya Dey","Victor Blanco","Stefan De Gendt","Wannes Meert"],"pdf_url":"https://arxiv.org/pdf/2308.08376v1.pdf","comment":"16 pages, 12 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.08370v1","updated":"2023-08-16T13:48:02Z","published":"2023-08-16T13:48:02Z","title":"Agglomerative Transformer for Human-Object Interaction Detection","summary":" We propose an agglomerative Transformer (AGER) that enables Transformer-based\nhuman-object interaction (HOI) detectors to flexibly exploit extra\ninstance-level cues in a single-stage and end-to-end manner for the first time.\nAGER acquires instance tokens by dynamically clustering patch tokens and\naligning cluster centers to instances with textual guidance, thus enjoying two\nbenefits: 1) Integrality: each instance token is encouraged to contain all\ndiscriminative feature regions of an instance, which demonstrates a significant\nimprovement in the extraction of different instance-level cues and subsequently\nleads to a new state-of-the-art performance of HOI detection with 36.75 mAP on\nHICO-Det. 
2) Efficiency: the dynamical clustering mechanism allows AGER to\ngenerate instance tokens jointly with the feature learning of the Transformer\nencoder, eliminating the need of an additional object detector or instance\ndecoder in prior methods, thus allowing the extraction of desirable extra cues\nfor HOI detection in a single-stage and end-to-end pipeline. Concretely, AGER\nreduces GFLOPs by 8.5% and improves FPS by 36%, even compared to a vanilla\nDETR-like pipeline without extra cue extraction.\n","authors":["Danyang Tu","Wei Sun","Guangtao Zhai","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.08370v1.pdf","comment":"Accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2308.08367v1","updated":"2023-08-16T13:41:29Z","published":"2023-08-16T13:41:29Z","title":"Diff-CAPTCHA: An Image-based CAPTCHA with Security Enhanced by Denoising\n Diffusion Model","summary":" To enhance the security of text CAPTCHAs, various methods have been employed,\nsuch as adding the interference lines on the text, randomly distorting the\ncharacters, and overlapping multiple characters. These methods partly increase\nthe difficulty of automated segmentation and recognition attacks. However,\nfacing the rapid development of the end-to-end breaking algorithms, their\nsecurity has been greatly weakened. The diffusion model is a novel image\ngeneration model that can generate the text images with deep fusion of\ncharacters and background images. In this paper, an image-click CAPTCHA scheme\ncalled Diff-CAPTCHA is proposed based on denoising diffusion models. The\nbackground image and characters of the CAPTCHA are treated as a whole to guide\nthe generation process of a diffusion model, thus weakening the character\nfeatures available for machine learning, enhancing the diversity of character\nfeatures in the CAPTCHA, and increasing the difficulty of breaking algorithms.\nTo evaluate the security of Diff-CAPTCHA, this paper develops several attack\nmethods, including end-to-end attacks based on Faster R-CNN and two-stage\nattacks, and Diff-CAPTCHA is compared with three baseline schemes, including\ncommercial CAPTCHA scheme and security-enhanced CAPTCHA scheme based on style\ntransfer. The experimental results show that diffusion models can effectively\nenhance CAPTCHA security while maintaining good usability in human testing.\n","authors":["Ran Jiang","Sanfeng Zhang","Linfeng Liu","Yanbing Peng"],"pdf_url":"https://arxiv.org/pdf/2308.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08365v1","updated":"2023-08-16T13:40:01Z","published":"2023-08-16T13:40:01Z","title":"DeepContrast: Deep Tissue Contrast Enhancement using Synthetic Data\n Degradations and OOD Model Predictions","summary":" Microscopy images are crucial for life science research, allowing detailed\ninspection and characterization of cellular and tissue-level structures and\nfunctions. However, microscopy data are unavoidably affected by image\ndegradations, such as noise, blur, or others. Many such degradations also\ncontribute to a loss of image contrast, which becomes especially pronounced in\ndeeper regions of thick samples. Today, best performing methods to increase the\nquality of images are based on Deep Learning approaches, which typically\nrequire ground truth (GT) data during training. Our inability to counteract\nblurring and contrast loss when imaging deep into samples prevents the\nacquisition of such clean GT data. 
The fact that the forward process of\nblurring and contrast loss deep into tissue can be modeled, allowed us to\npropose a new method that can circumvent the problem of unobtainable GT data.\nTo this end, we first synthetically degraded the quality of microscopy images\neven further by using an approximate forward model for deep tissue image\ndegradations. Then we trained a neural network that learned the inverse of this\ndegradation function from our generated pairs of raw and degraded images. We\ndemonstrated that networks trained in this way can be used out-of-distribution\n(OOD) to improve the quality of less severely degraded images, e.g. the raw\ndata imaged in a microscope. Since the absolute level of degradation in such\nmicroscopy images can be stronger than the additional degradation introduced by\nour forward model, we also explored the effect of iterative predictions. Here,\nwe observed that in each iteration the measured image contrast kept improving\nwhile detailed structures in the images got increasingly removed. Therefore,\ndependent on the desired downstream analysis, a balance between contrast\nimprovement and retention of image details has to be found.\n","authors":["Nuno Pimpão Martins","Yannis Kalaidzidis","Marino Zerial","Florian Jug"],"pdf_url":"https://arxiv.org/pdf/2308.08365v1.pdf","comment":"8 pages, 7 figures, 1 table"},{"id":"http://arxiv.org/abs/2308.08361v1","updated":"2023-08-16T13:35:09Z","published":"2023-08-16T13:35:09Z","title":"KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution","summary":" Dynamic convolution learns a linear mixture of $n$ static kernels weighted\nwith their sample-dependent attentions, demonstrating superior performance\ncompared to normal convolution. However, existing designs are\nparameter-inefficient: they increase the number of convolutional parameters by\n$n$ times. This and the optimization difficulty lead to no research progress in\ndynamic convolution that can allow us to use a significant large value of $n$\n(e.g., $n>100$ instead of typical setting $n<10$) to push forward the\nperformance boundary. In this paper, we propose $KernelWarehouse$, a more\ngeneral form of dynamic convolution, which can strike a favorable trade-off\nbetween parameter efficiency and representation power. Its key idea is to\nredefine the basic concepts of \"$kernels$\" and \"$assembling$ $kernels$\" in\ndynamic convolution from the perspective of reducing kernel dimension and\nincreasing kernel number significantly. In principle, KernelWarehouse enhances\nconvolutional parameter dependencies within the same layer and across\nsuccessive layers via tactful kernel partition and warehouse sharing, yielding\na high degree of freedom to fit a desired parameter budget. We validate our\nmethod on ImageNet and MS-COCO datasets with different ConvNet architectures,\nand show that it attains state-of-the-art results. For instance, the\nResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse\non ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. 
Thanks to its\nflexible design, KernelWarehouse can even reduce the model size of a ConvNet\nwhile improving the accuracy, e.g., our ResNet18 model with 36.45%|65.10%\nparameter reduction relative to the baseline shows a 2.89%|2.29% absolute\nimprovement in top-1 accuracy.\n","authors":["Chao Li","Anbang Yao"],"pdf_url":"https://arxiv.org/pdf/2308.08361v1.pdf","comment":"This research work was completed and submitted in early May 2023.\n Code and pre-trained models are available at\n https://github.com/OSVAI/KernelWarehouse"},{"id":"http://arxiv.org/abs/2308.08359v1","updated":"2023-08-16T13:32:03Z","published":"2023-08-16T13:32:03Z","title":"Membrane Potential Batch Normalization for Spiking Neural Networks","summary":" As one of the energy-efficient alternatives to conventional neural networks\n(CNNs), spiking neural networks (SNNs) have gained more and more interest\nrecently. To train deep models, several effective batch normalization (BN)\ntechniques have been proposed for SNNs. All of these BNs are placed after the\nconvolution layer, as is usually done in CNNs. However, the spiking neuron is\nmuch more complex owing to its spatio-temporal dynamics. The regulated data\nflow after the BN layer will be disturbed again by the membrane potential\nupdating operation before the firing function, i.e., the nonlinear activation.\nTherefore, we advocate adding another BN layer, called MPBN, before the firing\nfunction to normalize the membrane potential again. To eliminate the time cost\ninduced by MPBN, we also propose a training-inference-decoupled\nre-parameterization technique to fold the trained MPBN into the firing\nthreshold. With the re-parameterization technique, the MPBN will not introduce\nany extra time burden during inference. Furthermore, the MPBN can also adopt an\nelement-wise form, whereas the BNs after the convolution layer can only use the\nchannel-wise form. Experimental results show that the proposed MPBN performs\nwell on both popular non-spiking static datasets and neuromorphic datasets. Our\ncode is open-sourced at \\href{https://github.com/yfguo91/MPBN}{MPBN}.\n","authors":["Yufei Guo","Yuhan Zhang","Yuanpei Chen","Weihang Peng","Xiaode Liu","Liwen Zhang","Xuhui Huang","Zhe Ma"],"pdf_url":"https://arxiv.org/pdf/2308.08359v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2307.15029v2","updated":"2023-08-16T13:22:18Z","published":"2023-07-27T17:37:56Z","title":"Adaptive Segmentation Network for Scene Text Detection","summary":" Inspired by deep convolutional segmentation algorithms, scene text detectors\nhave steadily broken the performance ceiling on existing datasets. However,\nthese methods often encounter threshold selection bottlenecks and perform\npoorly on text instances with extreme aspect ratios. In this paper, we propose\nto automatically learn the discriminative segmentation threshold, which\ndistinguishes text pixels from background pixels for segmentation-based scene\ntext detectors, thereby further reducing the time-consuming manual parameter\nadjustment. Besides, we design a Global-information Enhanced Feature Pyramid\nNetwork (GE-FPN) for capturing text instances with macro size and extreme\naspect ratios. Following the GE-FPN, we introduce a cascade optimization\nstructure to further refine the text instances. Finally, together with the\nproposed threshold learning strategy and text detection structure, we design an\nAdaptive Segmentation Network (ASNet) for scene text detection. 
Extensive\nexperiments are carried out to demonstrate that the proposed ASNet can achieve\nthe state-of-the-art performance on four text detection benchmarks, i.e., ICDAR\n2015, MSRA-TD500, ICDAR 2017 MLT and CTW1500. The ablation experiments also\nverify the effectiveness of our contributions.\n","authors":["Guiqin Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.15029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15128v2","updated":"2023-08-16T13:15:03Z","published":"2023-07-27T18:04:45Z","title":"End-to-end Remote Sensing Change Detection of Unregistered Bi-temporal\n Images for Natural Disasters","summary":" Change detection based on remote sensing images has been a prominent area of\ninterest in the field of remote sensing. Deep networks have demonstrated\nsignificant success in detecting changes in bi-temporal remote sensing images\nand have found applications in various fields. Given the degradation of natural\nenvironments and the frequent occurrence of natural disasters, accurately and\nswiftly identifying damaged buildings in disaster-stricken areas through remote\nsensing images holds immense significance. This paper aims to investigate\nchange detection specifically for natural disasters. Considering that existing\npublic datasets used in change detection research are registered, which does\nnot align with the practical scenario where bi-temporal images are not matched,\nthis paper introduces an unregistered end-to-end change detection synthetic\ndataset called xBD-E2ECD. Furthermore, we propose an end-to-end change\ndetection network named E2ECDNet, which takes an unregistered bi-temporal image\npair as input and simultaneously generates the flow field prediction result and\nthe change detection prediction result. It is worth noting that our E2ECDNet\nalso supports change detection for registered image pairs, as registration can\nbe seen as a special case of non-registration. Additionally, this paper\nredefines the criteria for correctly predicting a positive case and introduces\nneighborhood-based change detection evaluation metrics. The experimental\nresults have demonstrated significant improvements.\n","authors":["Guiqin Zhao","Lianlei Shan","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.15128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08345v1","updated":"2023-08-16T13:10:32Z","published":"2023-08-16T13:10:32Z","title":"GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel\n Image Segmentation","summary":" Vessel image segmentation plays a pivotal role in medical diagnostics, aiding\nin the early detection and treatment of vascular diseases. While segmentation\nbased on deep learning has shown promising results, effectively segmenting\nsmall structures and maintaining connectivity between them remains challenging.\nTo address these limitations, we propose GAEI-UNet, a novel model that combines\nglobal attention and elastic interaction-based techniques. GAEI-UNet leverages\nglobal spatial and channel context information to enhance high-level semantic\nunderstanding within the U-Net architecture, enabling precise segmentation of\nsmall vessels. Additionally, we adopt an elastic interaction-based loss\nfunction to improve connectivity among these fine structures. 
By capturing the\nforces generated by misalignment between target and predicted shapes, our model\neffectively learns to preserve the correct topology of vessel networks.\nEvaluation on retinal vessel dataset -- DRIVE demonstrates the superior\nperformance of GAEI-UNet in terms of SE and connectivity of small structures,\nwithout significantly increasing computational complexity. This research aims\nto advance the field of vessel image segmentation, providing more accurate and\nreliable diagnostic tools for the medical community. The implementation code is\navailable on Code.\n","authors":["Ruiqiang Xiao","Zhuoyue Wan","Yang Xiang"],"pdf_url":"https://arxiv.org/pdf/2308.08345v1.pdf","comment":"BIBM 2023 Under Review"},{"id":"http://arxiv.org/abs/2307.12761v2","updated":"2023-08-16T13:03:33Z","published":"2023-07-24T13:05:36Z","title":"LiDAR Meta Depth Completion","summary":" Depth estimation is one of the essential tasks to be addressed when creating\nmobile autonomous systems. While monocular depth estimation methods have\nimproved in recent times, depth completion provides more accurate and reliable\ndepth maps by additionally using sparse depth information from other sensors\nsuch as LiDAR. However, current methods are specifically trained for a single\nLiDAR sensor. As the scanning pattern differs between sensors, every new sensor\nwould require re-training a specialized depth completion model, which is\ncomputationally inefficient and not flexible. Therefore, we propose to\ndynamically adapt the depth completion model to the used sensor type enabling\nLiDAR adaptive depth completion. Specifically, we propose a meta depth\ncompletion network that uses data patterns derived from the data to learn a\ntask network to alter weights of the main depth completion network to solve a\ngiven depth completion task effectively. The method demonstrates a strong\ncapability to work on multiple LiDAR scanning patterns and can also generalize\nto scanning patterns that are unseen during training. While using a single\nmodel, our method yields significantly better results than a non-adaptive\nbaseline trained on different LiDAR patterns. It outperforms LiDAR-specific\nexpert models for very sparse cases. These advantages allow flexible deployment\nof a single depth completion model on different sensors, which could also prove\nvaluable to process the input of nascent LiDAR technology with adaptive instead\nof fixed scanning patterns.\n","authors":["Wolfgang Boettcher","Lukas Hoyer","Ozan Unal","Ke Li","Dengxin Dai"],"pdf_url":"https://arxiv.org/pdf/2307.12761v2.pdf","comment":"Accepted at IROS 2023, v2 has updated author list and fixed a figure\n caption"},{"id":"http://arxiv.org/abs/2308.08339v1","updated":"2023-08-16T13:01:13Z","published":"2023-08-16T13:01:13Z","title":"Denoising Diffusion Probabilistic Model for Retinal Image Generation and\n Segmentation","summary":" Experts use retinal images and vessel trees to detect and diagnose various\neye, blood circulation, and brain-related diseases. However, manual\nsegmentation of retinal images is a time-consuming process that requires high\nexpertise and is difficult due to privacy issues. Many methods have been\nproposed to segment images, but the need for large retinal image datasets\nlimits the performance of these methods. Several methods synthesize deep\nlearning models based on Generative Adversarial Networks (GAN) to generate\nlimited sample varieties. 
This paper proposes a novel Denoising Diffusion\nProbabilistic Model (DDPM) that outperformed GANs in image synthesis. We\ndeveloped a Retinal Trees (ReTree) dataset consisting of retinal images,\ncorresponding vessel trees, and a segmentation network based on DDPM trained\nwith images from the ReTree dataset. In the first stage, we develop a two-stage\nDDPM that generates vessel trees from random numbers belonging to a standard\nnormal distribution. Later, the model is guided to generate fundus images from\ngiven vessel trees and random distribution. The proposed dataset has been\nevaluated quantitatively and qualitatively. Quantitative evaluation metrics\ninclude Frechet Inception Distance (FID) score, Jaccard similarity coefficient,\nCohen's kappa, Matthew's Correlation Coefficient (MCC), precision, recall,\nF1-score, and accuracy. We trained the vessel segmentation model with synthetic\ndata to validate our dataset's efficiency and tested it on authentic data. Our\ndeveloped dataset and source code is available at\nhttps://github.com/AAleka/retree.\n","authors":["Alnur Alimanov","Md Baharul Islam"],"pdf_url":"https://arxiv.org/pdf/2308.08339v1.pdf","comment":"International Conference on Computational Photography 2023 (ICCP\n 2023)"},{"id":"http://arxiv.org/abs/2308.08333v1","updated":"2023-08-16T12:46:52Z","published":"2023-08-16T12:46:52Z","title":"Improving Depth Gradient Continuity in Transformers: A Comparative Study\n on Monocular Depth Estimation with CNN","summary":" Monocular depth estimation is an ongoing challenge in computer vision. Recent\nprogress with Transformer models has demonstrated notable advantages over\nconventional CNNs in this area. However, there's still a gap in understanding\nhow these models prioritize different regions in 2D images and how these\nregions affect depth estimation performance. To explore the differences between\nTransformers and CNNs, we employ a sparse pixel approach to contrastively\nanalyze the distinctions between the two. Our findings suggest that while\nTransformers excel in handling global context and intricate textures, they lag\nbehind CNNs in preserving depth gradient continuity. To further enhance the\nperformance of Transformer models in monocular depth estimation, we propose the\nDepth Gradient Refinement (DGR) module that refines depth estimation through\nhigh-order differentiation, feature fusion, and recalibration. Additionally, we\nleverage optimal transport theory, treating depth maps as spatial probability\ndistributions, and employ the optimal transport distance as a loss function to\noptimize our model. Experimental results demonstrate that models integrated\nwith the plug-and-play Depth Gradient Refinement (DGR) module and the proposed\nloss function enhance performance without increasing complexity and\ncomputational costs. This research not only offers fresh insights into the\ndistinctions between Transformers and CNNs in depth estimation but also paves\nthe way for novel depth estimation methodologies.\n","authors":["Jiawei Yao","Tong Wu","Xiaofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08327v1","updated":"2023-08-16T12:40:47Z","published":"2023-08-16T12:40:47Z","title":"AdaBrowse: Adaptive Video Browser for Efficient Continuous Sign Language\n Recognition","summary":" Raw videos have been proven to own considerable feature redundancy where in\nmany cases only a portion of frames can already meet the requirements for\naccurate recognition. 
In this paper, we are interested in whether such\nredundancy can be effectively leveraged to facilitate efficient inference in\ncontinuous sign language recognition (CSLR). We propose a novel adaptive model\n(AdaBrowse) to dynamically select a most informative subsequence from input\nvideo sequences by modelling this problem as a sequential decision task. In\nspecific, we first utilize a lightweight network to quickly scan input videos\nto extract coarse features. Then these features are fed into a policy network\nto intelligently select a subsequence to process. The corresponding subsequence\nis finally inferred by a normal CSLR model for sentence prediction. As only a\nportion of frames are processed in this procedure, the total computations can\nbe considerably saved. Besides temporal redundancy, we are also interested in\nwhether the inherent spatial redundancy can be seamlessly integrated together\nto achieve further efficiency, i.e., dynamically selecting a lowest input\nresolution for each sample, whose model is referred to as AdaBrowse+. Extensive\nexperimental results on four large-scale CSLR datasets, i.e., PHOENIX14,\nPHOENIX14-T, CSL-Daily and CSL, demonstrate the effectiveness of AdaBrowse and\nAdaBrowse+ by achieving comparable accuracy with state-of-the-art methods with\n1.44$\\times$ throughput and 2.12$\\times$ fewer FLOPs. Comparisons with other\ncommonly-used 2D CNNs and adaptive efficient methods verify the effectiveness\nof AdaBrowse. Code is available at\n\\url{https://github.com/hulianyuyy/AdaBrowse}.\n","authors":["Lianyu Hu","Liqing Gao","Zekang Liu","Chi-Man Pun","Wei Feng"],"pdf_url":"https://arxiv.org/pdf/2308.08327v1.pdf","comment":"ACMMM2023"},{"id":"http://arxiv.org/abs/2308.08325v1","updated":"2023-08-16T12:39:39Z","published":"2023-08-16T12:39:39Z","title":"Visually-Aware Context Modeling for News Image Captioning","summary":" The goal of News Image Captioning is to generate an image caption according\nto the content of both a news article and an image. To leverage the visual\ninformation effectively, it is important to exploit the connection between the\ncontext in the articles/captions and the images. Psychological studies indicate\nthat human faces in images draw higher attention priorities. On top of that,\nhumans often play a central role in news stories, as also proven by the\nface-name co-occurrence pattern we discover in existing News Image Captioning\ndatasets. Therefore, we design a face-naming module for faces in images and\nnames in captions/articles to learn a better name embedding. Apart from names,\nwhich can be directly linked to an image area (faces), news image captions\nmostly contain context information that can only be found in the article.\nHumans typically address this by searching for relevant information from the\narticle based on the image. To emulate this thought process, we design a\nretrieval strategy using CLIP to retrieve sentences that are semantically close\nto the image. We conduct extensive experiments to demonstrate the efficacy of\nour framework. Without using additional paired data, we establish the new\nstate-of-the-art performance on two News Image Captioning datasets, exceeding\nthe previous state-of-the-art by 5 CIDEr points. 
We will release code upon\nacceptance.\n","authors":["Tingyu Qu","Tinne Tuytelaars","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2308.08325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08321v1","updated":"2023-08-16T12:30:17Z","published":"2023-08-16T12:30:17Z","title":"Stable and Causal Inference for Discriminative Self-supervised Deep\n Visual Representations","summary":" In recent years, discriminative self-supervised methods have made significant\nstrides in advancing various visual tasks. The central idea of learning a data\nencoder that is robust to data distortions/augmentations is straightforward yet\nhighly effective. Although many studies have demonstrated the empirical success\nof various learning methods, the resulting learned representations can exhibit\ninstability and hinder downstream performance. In this study, we analyze\ndiscriminative self-supervised methods from a causal perspective to explain\nthese unstable behaviors and propose solutions to overcome them. Our approach\ndraws inspiration from prior works that empirically demonstrate the ability of\ndiscriminative self-supervised methods to demix ground truth causal sources to\nsome extent. Unlike previous work on causality-empowered representation\nlearning, we do not apply our solutions during the training process but rather\nduring the inference process to improve time efficiency. Through experiments on\nboth controlled image datasets and realistic image datasets, we show that our\nproposed solutions, which involve tempering a linear transformation with\ncontrolled synthetic data, are effective in addressing these issues.\n","authors":["Yuewei Yang","Hai Li","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08321v1.pdf","comment":"ICCV 2023 accepted paper"},{"id":"http://arxiv.org/abs/2308.08316v1","updated":"2023-08-16T12:22:29Z","published":"2023-08-16T12:22:29Z","title":"Dual-Stream Diffusion Net for Text-to-Video Generation","summary":" With the emerging diffusion models, recently, text-to-video generation has\naroused increasing attention. But an important bottleneck therein is that\ngenerative videos often tend to carry some flickers and artifacts. In this\nwork, we propose a dual-stream diffusion net (DSDN) to improve the consistency\nof content variations in generating videos. In particular, the designed two\ndiffusion streams, video content and motion branches, could not only run\nseparately in their private spaces for producing personalized video variations\nas well as content, but also be well-aligned between the content and motion\ndomains through leveraging our designed cross-transformer interaction module,\nwhich would benefit the smoothness of generated videos. Besides, we also\nintroduce motion decomposer and combiner to faciliate the operation on video\nmotion. 
Qualitative and quantitative experiments demonstrate that our method\nproduces continuous videos with fewer flickers.\n","authors":["Binhui Liu","Xin Liu","Anbo Dai","Zhiyong Zeng","Zhen Cui","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08316v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.08313v1","updated":"2023-08-16T12:18:27Z","published":"2023-08-16T12:18:27Z","title":"ECPC-IDS: A benchmark endometrial cancer PET/CT image dataset for\n evaluation of semantic segmentation and detection of hypermetabolic regions","summary":" Endometrial cancer is one of the most common tumors in the female\nreproductive system and the third most common cause of death among\ngynecological malignancies, after ovarian and cervical cancer. Early diagnosis\ncan significantly improve the 5-year survival rate of patients. With the\ndevelopment of artificial intelligence, computer-assisted diagnosis plays an\nincreasingly important role in improving the accuracy and objectivity of\ndiagnosis, as well as reducing the workload of doctors. However, the absence of\npublicly available endometrial cancer image datasets restricts the application\nof computer-assisted diagnostic techniques. In this paper, a publicly available\nEndometrial Cancer PET/CT Image Dataset for Evaluation of Semantic Segmentation\nand Detection of Hypermetabolic Regions (ECPC-IDS) is published. Specifically,\nthe segmentation section includes PET and CT images, with a total of 7159\nimages in multiple formats. To demonstrate the effectiveness of segmentation\nmethods on ECPC-IDS, five classical deep learning semantic segmentation methods\nare selected to test the image segmentation task. The object detection section\nalso includes PET and CT images, with a total of 3579 images and XML files with\nannotation information. Six deep learning methods are selected for experiments\non the detection task. This study conducts extensive experiments using deep\nlearning-based semantic segmentation and object detection methods to\ndemonstrate the differences between various methods on ECPC-IDS. As far as we\nknow, this is the first publicly available endometrial cancer dataset with a\nlarge number of images, including the information required for image\nsegmentation and object detection. ECPC-IDS can aid researchers in exploring\nnew algorithms to enhance computer-assisted technology, greatly benefiting both\nclinical doctors and patients.\n","authors":["Dechao Tang","Xuanyi Li","Tianming Du","Deguo Ma","Zhiyu Ma","Hongzan Sun","Marcin Grzegorzek","Huiyan Jiang","Chen Li"],"pdf_url":"https://arxiv.org/pdf/2308.08313v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2303.08646v2","updated":"2023-08-16T12:15:29Z","published":"2023-03-15T14:23:07Z","title":"HFGD: High-level Feature Guided Decoder for Semantic Segmentation","summary":" Existing pyramid-based upsamplers (e.g. SemanticFPN), although efficient,\nusually produce less accurate results compared to dilation-based models when\nusing the same backbone. This is partially caused by the contaminated\nhigh-level features since they are fused and fine-tuned with noisy low-level\nfeatures on limited data. To address this issue, we propose to use powerful\npretrained high-level features as guidance (HFG) when learning to upsample the\nfine-grained low-level features. Specifically, the class tokens are trained\nalong with only the high-level features from the backbone. 
These class tokens\nare reused by the upsampler for classification, guiding the upsampler features\nto more discriminative backbone features. One key design of the HFG is to\nprotect the high-level features from being contaminated with proper\nstop-gradient operations so that the backbone does not update according to the\ngradient from the upsampler. To push the upper limit of HFG, we introduce an\ncontext augmentation encoder (CAE) that can efficiently and effectively\noperates on low-resolution high-level feature, resulting in improved\nrepresentation and thus better guidance. We evaluate the proposed method on\nthree benchmarks: Pascal Context, COCOStuff164k, and Cityscapes. Our method\nachieves state-of-the-art results among methods that do not use extra training\ndata, demonstrating its effectiveness and generalization ability. The complete\ncode will be released\n","authors":["Ye Huang","Di Kang","Shenghua Gao","Wen Li","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2303.08646v2.pdf","comment":"Revised version, refactored presentation and added more experiments"},{"id":"http://arxiv.org/abs/2307.08693v2","updated":"2023-08-16T12:12:45Z","published":"2023-07-17T17:53:36Z","title":"SEMI-DiffusionInst: A Diffusion Model Based Approach for Semiconductor\n Defect Classification and Segmentation","summary":" With continuous progression of Moore's Law, integrated circuit (IC) device\ncomplexity is also increasing. Scanning Electron Microscope (SEM) image based\nextensive defect inspection and accurate metrology extraction are two main\nchallenges in advanced node (2 nm and beyond) technology. Deep learning (DL)\nalgorithm based computer vision approaches gained popularity in semiconductor\ndefect inspection over last few years. In this research work, a new\nsemiconductor defect inspection framework \"SEMI-DiffusionInst\" is investigated\nand compared to previous frameworks. To the best of the authors' knowledge,\nthis work is the first demonstration to accurately detect and precisely segment\nsemiconductor defect patterns by using a diffusion model. Different feature\nextractor networks as backbones and data sampling strategies are investigated\ntowards achieving a balanced trade-off between precision and computing\nefficiency. Our proposed approach outperforms previous work on overall mAP and\nperforms comparatively better or as per for almost all defect classes (per\nclass APs). The bounding box and segmentation mAPs achieved by the proposed\nSEMI-DiffusionInst model are improved by 3.83% and 2.10%, respectively. Among\nindividual defect types, precision on line collapse and thin bridge defects are\nimproved approximately 15\\% on detection task for both defect types. It has\nalso been shown that by tuning inference hyperparameters, inference time can be\nimproved significantly without compromising model precision. Finally, certain\nlimitations and future work strategy to overcome them are discussed.\n","authors":["Vic De Ridder","Bappaditya Dey","Sandip Halder","Bartel Van Waeyenberge"],"pdf_url":"https://arxiv.org/pdf/2307.08693v2.pdf","comment":"6 pages, 5 figures, To be published by IEEE in the proceedings of the\n 2023 ELMAR conference"},{"id":"http://arxiv.org/abs/2308.08303v1","updated":"2023-08-16T12:07:02Z","published":"2023-08-16T12:07:02Z","title":"Leveraging Next-Active Objects for Context-Aware Anticipation in\n Egocentric Videos","summary":" Objects are crucial for understanding human-object interactions. 
By\nidentifying the relevant objects, one can also predict potential future\ninteractions or actions that may occur with these objects. In this paper, we\nstudy the problem of Short-Term Object interaction anticipation (STA) and\npropose NAOGAT (Next-Active-Object Guided Anticipation Transformer), a\nmulti-modal end-to-end transformer network, that attends to objects in observed\nframes in order to anticipate the next-active-object (NAO) and, eventually, to\nguide the model to predict context-aware future actions. The task is\nchallenging since it requires anticipating future action along with the object\nwith which the action occurs and the time after which the interaction will\nbegin, a.k.a. the time to contact (TTC). Compared to existing video modeling\narchitectures for action anticipation, NAOGAT captures the relationship between\nobjects and the global scene context in order to predict detections for the\nnext active object and anticipate relevant future actions given these\ndetections, leveraging the objects' dynamics to improve accuracy. One of the\nkey strengths of our approach, in fact, is its ability to exploit the motion\ndynamics of objects within a given clip, which is often ignored by other\nmodels, and separately decoding the object-centric and motion-centric\ninformation. Through our experiments, we show that our model outperforms\nexisting methods on two separate datasets, Ego4D and EpicKitchens-100 (\"Unseen\nSet\"), as measured by several additional metrics, such as time to contact, and\nnext-active-object localization. The code will be available upon acceptance.\n","authors":["Sanket Thakur","Cigdem Beyan","Pietro Morerio","Vittorio Murino","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2308.08303v1.pdf","comment":"Accepted in WACV'24"},{"id":"http://arxiv.org/abs/2308.08288v1","updated":"2023-08-16T11:20:23Z","published":"2023-08-16T11:20:23Z","title":"Improving Audio-Visual Segmentation with Bidirectional Generation","summary":" The aim of audio-visual segmentation (AVS) is to precisely differentiate\naudible objects within videos down to the pixel level. Traditional approaches\noften tackle this challenge by combining information from various modalities,\nwhere the contribution of each modality is implicitly or explicitly modeled.\nNevertheless, the interconnections between different modalities tend to be\noverlooked in audio-visual modeling. In this paper, inspired by the human\nability to mentally simulate the sound of an object and its visual appearance,\nwe introduce a bidirectional generation framework. This framework establishes\nrobust correlations between an object's visual characteristics and its\nassociated sound, thereby enhancing the performance of AVS. To achieve this, we\nemploy a visual-to-audio projection component that reconstructs audio features\nfrom object segmentation masks and minimizes reconstruction errors. Moreover,\nrecognizing that many sounds are linked to object movements, we introduce an\nimplicit volumetric motion estimation module to handle temporal dynamics that\nmay be challenging to capture using conventional optical flow methods. To\nshowcase the effectiveness of our approach, we conduct comprehensive\nexperiments and analyses on the widely recognized AVSBench benchmark. As a\nresult, we establish a new state-of-the-art performance level in the AVS\nbenchmark, particularly excelling in the challenging MS3 subset which involves\nsegmenting multiple sound sources. 
To facilitate reproducibility, we plan to\nrelease both the source code and the pre-trained model.\n","authors":["Dawei Hao","Yuxin Mao","Bowen He","Xiaodong Han","Yuchao Dai","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.08288v1.pdf","comment":"Dawei Hao and Yuxin Mao contribute equality to this paper. Yiran\n Zhong is the corresponding author. The code will be released at\n https://github.com/OpenNLPLab/AVS-bidirectional"},{"id":"http://arxiv.org/abs/2305.13873v2","updated":"2023-08-16T11:16:15Z","published":"2023-05-23T09:48:16Z","title":"Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes\n From Text-To-Image Models","summary":" State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\\cdot$2\nare revolutionizing how people generate visual content. At the same time,\nsociety has serious concerns about how adversaries can exploit such models to\ngenerate unsafe images. In this work, we focus on demystifying the generation\nof unsafe images and hateful memes from Text-to-Image models. We first\nconstruct a typology of unsafe images consisting of five categories (sexually\nexplicit, violent, disturbing, hateful, and political). Then, we assess the\nproportion of unsafe images generated by four advanced Text-to-Image models\nusing four prompt datasets. We find that these models can generate a\nsubstantial percentage of unsafe images; across four models and four prompt\ndatasets, 14.56% of all generated images are unsafe. When comparing the four\nmodels, we find different risk levels, with Stable Diffusion being the most\nprone to generating unsafe content (18.92% of all generated images are unsafe).\nGiven Stable Diffusion's tendency to generate more unsafe content, we evaluate\nits potential to generate hateful meme variants if exploited by an adversary to\nattack a specific individual or community. We employ three image editing\nmethods, DreamBooth, Textual Inversion, and SDEdit, which are supported by\nStable Diffusion. Our evaluation result shows that 24% of the generated images\nusing DreamBooth are hateful meme variants that present the features of the\noriginal hateful meme and the target individual/community; these generated\nimages are comparable to hateful meme variants collected from the real world.\nOverall, our results demonstrate that the danger of large-scale generation of\nunsafe images is imminent. We discuss several mitigating measures, such as\ncurating training data, regulating prompts, and implementing safety filters,\nand encourage better safeguard tools to be developed to prevent unsafe\ngeneration.\n","authors":["Yiting Qu","Xinyue Shen","Xinlei He","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13873v2.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, November 26, 2023"},{"id":"http://arxiv.org/abs/2302.06608v3","updated":"2023-08-16T11:12:42Z","published":"2023-02-13T18:59:52Z","title":"3D-aware Blending with Generative NeRFs","summary":" Image blending aims to combine multiple images seamlessly. It remains\nchallenging for existing 2D-based methods, especially when input images are\nmisaligned due to differences in 3D camera poses and object shapes. To tackle\nthese issues, we propose a 3D-aware blending method using generative Neural\nRadiance Fields (NeRF), including two key components: 3D-aware alignment and\n3D-aware blending. 
For 3D-aware alignment, we first estimate the camera pose of\nthe reference image with respect to generative NeRFs and then perform 3D local\nalignment for each part. To further leverage 3D information of the generative\nNeRF, we propose 3D-aware blending that directly blends images on the NeRF's\nlatent representation space, rather than raw pixel space. Collectively, our\nmethod outperforms existing 2D baselines, as validated by extensive\nquantitative and qualitative evaluations with FFHQ and AFHQ-Cat.\n","authors":["Hyunsu Kim","Gayoung Lee","Yunjey Choi","Jin-Hwa Kim","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06608v3.pdf","comment":"ICCV 2023, Project page: https://blandocs.github.io/blendnerf"},{"id":"http://arxiv.org/abs/2308.08283v1","updated":"2023-08-16T10:51:27Z","published":"2023-08-16T10:51:27Z","title":"CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark\n Model for Rectal Cancer Segmentation","summary":" Rectal cancer segmentation of CT image plays a crucial role in timely\nclinical diagnosis, radiotherapy treatment, and follow-up. Although current\nsegmentation methods have shown promise in delineating cancerous tissues, they\nstill encounter challenges in achieving high segmentation precision. These\nobstacles arise from the intricate anatomical structures of the rectum and the\ndifficulties in performing differential diagnosis of rectal cancer.\nAdditionally, a major obstacle is the lack of a large-scale, finely annotated\nCT image dataset for rectal cancer segmentation. To address these issues, this\nwork introduces a novel large scale rectal cancer CT image dataset CARE with\npixel-level annotations for both normal and cancerous rectum, which serves as a\nvaluable resource for algorithm research and clinical application development.\nMoreover, we propose a novel medical cancer lesion segmentation benchmark model\nnamed U-SAM. The model is specifically designed to tackle the challenges posed\nby the intricate anatomical structures of abdominal organs by incorporating\nprompt information. U-SAM contains three key components: promptable information\n(e.g., points) to aid in target area localization, a convolution module for\ncapturing low-level lesion details, and skip-connections to preserve and\nrecover spatial information during the encoding-decoding process. To evaluate\nthe effectiveness of U-SAM, we systematically compare its performance with\nseveral popular segmentation methods on the CARE dataset. The generalization of\nthe model is further verified on the WORD dataset. Extensive experiments\ndemonstrate that the proposed U-SAM outperforms state-of-the-art methods on\nthese two datasets. These experiments can serve as the baseline for future\nresearch and clinical application development.\n","authors":["Hantao Zhang","Weidong Guo","Chenyang Qiu","Shouhong Wan","Bingbing Zou","Wanqin Wang","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2308.08283v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2212.05370v2","updated":"2023-08-16T10:45:20Z","published":"2022-12-10T21:57:11Z","title":"Source-free Depth for Object Pop-out","summary":" Depth cues are known to be useful for visual perception. However, direct\nmeasurement of depth is often impracticable. Fortunately, though, modern\nlearning-based methods offer promising depth maps by inference in the wild. In\nthis work, we adapt such depth inference models for object segmentation using\nthe objects' \"pop-out\" prior in 3D. 
The \"pop-out\" is a simple composition prior\nthat assumes objects reside on the background surface. Such compositional prior\nallows us to reason about objects in the 3D space. More specifically, we adapt\nthe inferred depth maps such that objects can be localized using only 3D\ninformation. Such separation, however, requires knowledge about contact surface\nwhich we learn using the weak supervision of the segmentation mask. Our\nintermediate representation of contact surface, and thereby reasoning about\nobjects purely in 3D, allows us to better transfer the depth knowledge into\nsemantics. The proposed adaptation method uses only the depth model without\nneeding the source data used for training, making the learning process\nefficient and practical. Our experiments on eight datasets of two challenging\ntasks, namely camouflaged object detection and salient object detection,\nconsistently demonstrate the benefit of our method in terms of both performance\nand generalizability.\n","authors":["Zongwei Wu","Danda Pani Paudel","Deng-Ping Fan","Jingjing Wang","Shuo Wang","Cédric Demonceaux","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2212.05370v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2305.10732v2","updated":"2023-08-16T10:39:12Z","published":"2023-05-18T06:04:24Z","title":"BlindHarmony: \"Blind\" Harmonization for MR Images via Flow model","summary":" In MRI, images of the same contrast (e.g., T$_1$) from the same subject can\nexhibit noticeable differences when acquired using different hardware,\nsequences, or scan parameters. These differences in images create a domain gap\nthat needs to be bridged by a step called image harmonization, to process the\nimages successfully using conventional or deep learning-based image analysis\n(e.g., segmentation). Several methods, including deep learning-based\napproaches, have been proposed to achieve image harmonization. However, they\noften require datasets from multiple domains for deep learning training and may\nstill be unsuccessful when applied to images from unseen domains. To address\nthis limitation, we propose a novel concept called `Blind Harmonization', which\nutilizes only target domain data for training but still has the capability to\nharmonize images from unseen domains. For the implementation of blind\nharmonization, we developed BlindHarmony using an unconditional flow model\ntrained on target domain data. The harmonized image is optimized to have a\ncorrelation with the input source domain image while ensuring that the latent\nvector of the flow model is close to the center of the Gaussian distribution.\nBlindHarmony was evaluated on both simulated and real datasets and compared to\nconventional methods. BlindHarmony demonstrated noticeable performance on both\ndatasets, highlighting its potential for future use in clinical settings. The\nsource code is available at: https://github.com/SNU-LIST/BlindHarmony\n","authors":["Hwihun Jeong","Heejoon Byun","Dong Un Kang","Jongho Lee"],"pdf_url":"https://arxiv.org/pdf/2305.10732v2.pdf","comment":"ICCV 2023 accepted. 
9 pages and 5 Figures for manuscipt,\n supplementary included"},{"id":"http://arxiv.org/abs/2308.08276v1","updated":"2023-08-16T10:33:24Z","published":"2023-08-16T10:33:24Z","title":"Computer vision-enriched discrete choice models, with an application to\n residential location choice","summary":" Visual imagery is indispensable to many multi-attribute decision situations.\nExamples of such decision situations in travel behaviour research include\nresidential location choices, vehicle choices, tourist destination choices, and\nvarious safety-related choices. However, current discrete choice models cannot\nhandle image data and thus cannot incorporate information embedded in images\ninto their representations of choice behaviour. This gap between discrete\nchoice models' capabilities and the real-world behaviour it seeks to model\nleads to incomplete and, possibly, misleading outcomes. To solve this gap, this\nstudy proposes \"Computer Vision-enriched Discrete Choice Models\" (CV-DCMs).\nCV-DCMs can handle choice tasks involving numeric attributes and images by\nintegrating computer vision and traditional discrete choice models. Moreover,\nbecause CV-DCMs are grounded in random utility maximisation principles, they\nmaintain the solid behavioural foundation of traditional discrete choice\nmodels. We demonstrate the proposed CV-DCM by applying it to data obtained\nthrough a novel stated choice experiment involving residential location\nchoices. In this experiment, respondents faced choice tasks with trade-offs\nbetween commute time, monthly housing cost and street-level conditions,\npresented using images. As such, this research contributes to the growing body\nof literature in the travel behaviour field that seeks to integrate discrete\nchoice modelling and machine learning.\n","authors":["Sander van Cranenburgh","Francisco Garrido-Valenzuela"],"pdf_url":"https://arxiv.org/pdf/2308.08276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08271v1","updated":"2023-08-16T10:19:16Z","published":"2023-08-16T10:19:16Z","title":"Detecting Olives with Synthetic or Real Data? Olive the Above","summary":" Modern robotics has enabled the advancement in yield estimation for precision\nagriculture. However, when applied to the olive industry, the high variation of\nolive colors and their similarity to the background leaf canopy presents a\nchallenge. Labeling several thousands of very dense olive grove images for\nsegmentation is a labor-intensive task. This paper presents a novel approach to\ndetecting olives without the need to manually label data. In this work, we\npresent the world's first olive detection dataset comprised of synthetic and\nreal olive tree images. This is accomplished by generating an auto-labeled\nphotorealistic 3D model of an olive tree. Its geometry is then simplified for\nlightweight rendering purposes. In addition, experiments are conducted with a\nmix of synthetically generated and real images, yielding an improvement of up\nto 66% compared to when only using a small sample of real data. When access to\nreal, human-labeled data is limited, a combination of mostly synthetic data and\na small amount of real data can enhance olive detection.\n","authors":["Yianni Karabatis","Xiaomin Lin","Nitin J. Sanket","Michail G. 
Lagoudakis","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2308.08271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08269v1","updated":"2023-08-16T10:16:50Z","published":"2023-08-16T10:16:50Z","title":"OnUVS: Online Feature Decoupling Framework for High-Fidelity Ultrasound\n Video Synthesis","summary":" Ultrasound (US) imaging is indispensable in clinical practice. To diagnose\ncertain diseases, sonographers must observe corresponding dynamic anatomic\nstructures to gather comprehensive information. However, the limited\navailability of specific US video cases causes teaching difficulties in\nidentifying corresponding diseases, which potentially impacts the detection\nrate of such cases. The synthesis of US videos may represent a promising\nsolution to this issue. Nevertheless, it is challenging to accurately animate\nthe intricate motion of dynamic anatomic structures while preserving image\nfidelity. To address this, we present a novel online feature-decoupling\nframework called OnUVS for high-fidelity US video synthesis. Our highlights can\nbe summarized by four aspects. First, we introduced anatomic information into\nkeypoint learning through a weakly-supervised training strategy, resulting in\nimproved preservation of anatomical integrity and motion while minimizing the\nlabeling burden. Second, to better preserve the integrity and textural\ninformation of US images, we implemented a dual-decoder that decouples the\ncontent and textural features in the generator. Third, we adopted a\nmultiple-feature discriminator to extract a comprehensive range of visual cues,\nthereby enhancing the sharpness and fine details of the generated videos.\nFourth, we constrained the motion trajectories of keypoints during online\nlearning to enhance the fluidity of generated videos. Our validation and user\nstudies on in-house echocardiographic and pelvic floor US videos showed that\nOnUVS synthesizes US videos with high fidelity.\n","authors":["Han Zhou","Dong Ni","Ao Chang","Xinrui Zhou","Rusi Chen","Yanlin Chen","Lian Liu","Jiamin Liang","Yuhao Huang","Tong Han","Zhe Liu","Deng-Ping Fan","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08269v1.pdf","comment":"14 pages, 13 figures and 6 tables"},{"id":"http://arxiv.org/abs/2102.03973v7","updated":"2023-08-16T10:06:53Z","published":"2021-02-08T02:51:34Z","title":"STS-GAN: Can We Synthesize Solid Texture with High Fidelity from\n Arbitrary 2D Exemplar?","summary":" Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a\n3D solid volume, exhibits advantages in computational photography. However,\nexisting methods generally fail to accurately learn arbitrary textures, which\nmay result in the failure to synthesize solid textures with high fidelity. In\nthis paper, we propose a novel generative adversarial nets-based framework\n(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In\nSTS-GAN, multi-scale 2D texture discriminators evaluate the similarity between\nthe given 2D exemplar and slices from the generated 3D texture, promoting the\n3D texture generator synthesizing realistic solid textures. 
Finally,\nexperiments demonstrate that the proposed method can generate high-fidelity\nsolid textures with similar visual characteristics to the 2D exemplar.\n","authors":["Xin Zhao","Jifeng Guo","Lin Wang","Fanqi Li","Jiahao Li","Junteng Zheng","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2102.03973v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02510v2","updated":"2023-08-16T09:59:40Z","published":"2023-07-27T12:54:16Z","title":"Seeing through the Brain: Image Reconstruction of Visual Perception from\n Human Brain Signals","summary":" Seeing is believing, however, the underlying mechanism of how human visual\nperceptions are intertwined with our cognitions is still a mystery. Thanks to\nthe recent advances in both neuroscience and artificial intelligence, we have\nbeen able to record the visually evoked brain activities and mimic the visual\nperception ability through computational approaches. In this paper, we pay\nattention to visual stimuli reconstruction by reconstructing the observed\nimages based on portably accessible brain signals, i.e., electroencephalography\n(EEG) data. Since EEG signals are dynamic in the time-series format and are\nnotorious to be noisy, processing and extracting useful information requires\nmore dedicated efforts; In this paper, we propose a comprehensive pipeline,\nnamed NeuroImagen, for reconstructing visual stimuli images from EEG signals.\nSpecifically, we incorporate a novel multi-level perceptual information\ndecoding to draw multi-grained outputs from the given EEG data. A latent\ndiffusion model will then leverage the extracted information to reconstruct the\nhigh-resolution visual stimuli images. The experimental results have\nillustrated the effectiveness of image reconstruction and superior quantitative\nperformance of our proposed method.\n","authors":["Yu-Ting Lan","Kan Ren","Yansen Wang","Wei-Long Zheng","Dongsheng Li","Bao-Liang Lu","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.02510v2.pdf","comment":"A preprint version of an ongoing work"},{"id":"http://arxiv.org/abs/2308.08258v1","updated":"2023-08-16T09:50:35Z","published":"2023-08-16T09:50:35Z","title":"SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes","summary":" Existing methods for the 4D reconstruction of general, non-rigidly deforming\nobjects focus on novel-view synthesis and neglect correspondences. However,\ntime consistency enables advanced downstream tasks like 3D editing, motion\nanalysis, or virtual-asset creation. We propose SceNeRFlow to reconstruct a\ngeneral, non-rigid scene in a time-consistent manner. Our dynamic-NeRF method\ntakes multi-view RGB videos and background images from static cameras with\nknown camera parameters as input. It then reconstructs the deformations of an\nestimated canonical model of the geometry and appearance in an online fashion.\nSince this canonical model is time-invariant, we obtain correspondences even\nfor long-term, long-range motions. We employ neural scene representations to\nparametrize the components of our method. Like prior dynamic-NeRF methods, we\nuse a backwards deformation model. We find non-trivial adaptations of this\nmodel necessary to handle larger motions: We decompose the deformations into a\nstrongly regularized coarse component and a weakly regularized fine component,\nwhere the coarse component also extends the deformation field into the space\nsurrounding the object, which enables tracking over time. 
We show\nexperimentally that, unlike prior work that only handles small motion, our\nmethod enables the reconstruction of studio-scale motions.\n","authors":["Edith Tretschk","Vladislav Golyanik","Michael Zollhoefer","Aljaz Bozic","Christoph Lassner","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2308.08258v1.pdf","comment":"Project page: https://vcai.mpi-inf.mpg.de/projects/scenerflow/"},{"id":"http://arxiv.org/abs/2308.08256v1","updated":"2023-08-16T09:47:52Z","published":"2023-08-16T09:47:52Z","title":"MultiMediate'23: Engagement Estimation and Bodily Behaviour Recognition\n in Social Interactions","summary":" Automatic analysis of human behaviour is a fundamental prerequisite for the\ncreation of machines that can effectively interact with- and support humans in\nsocial interactions. In MultiMediate'23, we address two key human social\nbehaviour analysis tasks for the first time in a controlled challenge:\nengagement estimation and bodily behaviour recognition in social interactions.\nThis paper describes the MultiMediate'23 challenge and presents novel sets of\nannotations for both tasks. For engagement estimation we collected novel\nannotations on the NOvice eXpert Interaction (NOXI) database. For bodily\nbehaviour recognition, we annotated test recordings of the MPIIGroupInteraction\ncorpus with the BBSI annotation scheme. In addition, we present baseline\nresults for both challenge tasks.\n","authors":["Philipp Müller","Michal Balazia","Tobias Baur","Michael Dietz","Alexander Heimerl","Dominik Schiller","Mohammed Guermal","Dominike Thomas","François Brémond","Jan Alexandersson","Elisabeth André","Andreas Bulling"],"pdf_url":"https://arxiv.org/pdf/2308.08256v1.pdf","comment":"ACM MultiMedia'23"},{"id":"http://arxiv.org/abs/2308.07009v2","updated":"2023-08-16T09:47:08Z","published":"2023-08-14T08:52:41Z","title":"ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal\n and Robust Vehicle Evasion","summary":" Adversarial camouflage has garnered attention for its ability to attack\nobject detectors from any viewpoint by covering the entire object's surface.\nHowever, universality and robustness in existing methods often fall short as\nthe transferability aspect is often overlooked, thus restricting their\napplication only to a specific target with limited performance. To address\nthese challenges, we present Adversarial Camouflage for Transferable and\nIntensive Vehicle Evasion (ACTIVE), a state-of-the-art physical camouflage\nattack framework designed to generate universal and robust adversarial\ncamouflage capable of concealing any 3D vehicle from detectors. Our framework\nincorporates innovative techniques to enhance universality and robustness,\nincluding a refined texture rendering that enables common texture application\nto different vehicles without being constrained to a specific texture map, a\nnovel stealth loss that renders the vehicle undetectable, and a smooth and\ncamouflage loss to enhance the naturalness of the adversarial camouflage. Our\nextensive experiments on 15 different models show that ACTIVE consistently\noutperforms existing works on various public detectors, including the latest\nYOLOv7. 
Notably, our universality evaluations reveal promising transferability\nto other vehicle classes, tasks (segmentation models), and the real world, not\njust other vehicles.\n","authors":["Naufal Suryanto","Yongsu Kim","Harashta Tatimma Larasati","Hyoeun Kang","Thi-Thu-Huong Le","Yoonyoung Hong","Hunmin Yang","Se-Yoon Oh","Howon Kim"],"pdf_url":"https://arxiv.org/pdf/2308.07009v2.pdf","comment":"Accepted for ICCV 2023. Main Paper with Supplementary Material.\n Project Page: https://islab-ai.github.io/active-iccv2023/"},{"id":"http://arxiv.org/abs/2308.08242v1","updated":"2023-08-16T09:16:05Z","published":"2023-08-16T09:16:05Z","title":"Contrastive Learning for Lane Detection via cross-similarity","summary":" Detecting road lanes is challenging due to intricate markings vulnerable to\nunfavorable conditions. Lane markings have strong shape priors, but their\nvisibility is easily compromised. Factors like lighting, weather, vehicles,\npedestrians, and aging colors challenge the detection. A large amount of data\nis required to train a lane detection approach that can withstand natural\nvariations caused by low visibility. This is because there are numerous lane\nshapes and natural variations that exist. Our solution, Contrastive Learning\nfor Lane Detection via cross-similarity (CLLD), is a self-supervised learning\nmethod that tackles this challenge by enhancing lane detection models\nresilience to real-world conditions that cause lane low visibility. CLLD is a\nnovel multitask contrastive learning that trains lane detection approaches to\ndetect lane markings even in low visible situations by integrating local\nfeature contrastive learning (CL) with our new proposed operation\ncross-similarity. Local feature CL focuses on extracting features for small\nimage parts, which is necessary to localize lane segments, while\ncross-similarity captures global features to detect obscured lane segments\nusing their surrounding. We enhance cross-similarity by randomly masking parts\nof input images for augmentation. Evaluated on benchmark datasets, CLLD\noutperforms state-of-the-art contrastive learning, especially in\nvisibility-impairing conditions like shadows. Compared to supervised learning,\nCLLD excels in scenarios like shadows and crowded scenes.\n","authors":["Ali Zoljodi","Sadegh Abadijou","Mina Alibeigi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2308.08242v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.08231v1","updated":"2023-08-16T09:06:32Z","published":"2023-08-16T09:06:32Z","title":"DDF-HO: Hand-Held Object Reconstruction via Conditional Directed\n Distance Field","summary":" Reconstructing hand-held objects from a single RGB image is an important and\nchallenging problem. Existing works utilizing Signed Distance Fields (SDF)\nreveal limitations in comprehensively capturing the complex hand-object\ninteractions, since SDF is only reliable within the proximity of the target,\nand hence, infeasible to simultaneously encode local hand and object cues. To\naddress this issue, we propose DDF-HO, a novel approach leveraging Directed\nDistance Field (DDF) as the shape representation. Unlike SDF, DDF maps a ray in\n3D space, consisting of an origin and a direction, to corresponding DDF values,\nincluding a binary visibility signal determining whether the ray intersects the\nobjects and a distance value measuring the distance from origin to target in\nthe given direction. 
We randomly sample multiple rays and collect local to\nglobal geometric features for them by introducing a novel 2D ray-based feature\naggregation scheme and a 3D intersection-aware hand pose embedding, combining\n2D-3D features to model hand-object interactions. Extensive experiments on\nsynthetic and real-world datasets demonstrate that DDF-HO consistently\noutperforms all baseline methods by a large margin, especially under Chamfer\nDistance, with about 80% leap forward. Codes and trained models will be\nreleased soon.\n","authors":["Chenyangguang Zhang","Yan Di","Ruida Zhang","Guangyao Zhai","Fabian Manhardt","Federico Tombari","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2308.08231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08227v1","updated":"2023-08-16T08:58:25Z","published":"2023-08-16T08:58:25Z","title":"Inherent Redundancy in Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) are well known as a promising energy-efficient\nalternative to conventional artificial neural networks. Subject to the\npreconceived impression that SNNs are sparse firing, the analysis and\noptimization of inherent redundancy in SNNs have been largely overlooked, thus\nthe potential advantages of spike-based neuromorphic computing in accuracy and\nenergy efficiency are interfered. In this work, we pose and focus on three key\nquestions regarding the inherent redundancy in SNNs. We argue that the\nredundancy is induced by the spatio-temporal invariance of SNNs, which enhances\nthe efficiency of parameter utilization but also invites lots of noise spikes.\nFurther, we analyze the effect of spatio-temporal invariance on the\nspatio-temporal dynamics and spike firing of SNNs. Then, motivated by these\nanalyses, we propose an Advance Spatial Attention (ASA) module to harness SNNs'\nredundancy, which can adaptively optimize their membrane potential distribution\nby a pair of individual spatial attention sub-modules. In this way, noise spike\nfeatures are accurately regulated. Experimental results demonstrate that the\nproposed method can significantly drop the spike firing with better performance\nthan state-of-the-art SNN baselines. Our code is available in\n\\url{https://github.com/BICLab/ASA-SNN}.\n","authors":["Man Yao","Jiakui Hu","Guangshe Zhao","Yaoyuan Wang","Ziyang Zhang","Bo Xu","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2308.08227v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2210.04214v2","updated":"2023-08-16T08:55:29Z","published":"2022-10-09T09:59:46Z","title":"VM-NeRF: Tackling Sparsity in NeRF with View Morphing","summary":" NeRF aims to learn a continuous neural scene representation by using a finite\nset of input images taken from various viewpoints. A well-known limitation of\nNeRF methods is their reliance on data: the fewer the viewpoints, the higher\nthe likelihood of overfitting. This paper addresses this issue by introducing a\nnovel method to generate geometrically consistent image transitions between\nviewpoints using View Morphing. Our VM-NeRF approach requires no prior\nknowledge about the scene structure, as View Morphing is based on the\nfundamental principles of projective geometry. VM-NeRF tightly integrates this\ngeometric view generation process during the training procedure of standard\nNeRF approaches. Notably, our method significantly improves novel view\nsynthesis, particularly when only a few views are available. 
Experimental\nevaluation reveals consistent improvement over current methods that handle\nsparse viewpoints in NeRF models. We report an increase in PSNR of up to 1.8dB\nand 1.0dB when training uses eight and four views, respectively. Source code:\n\\url{https://github.com/mbortolon97/VM-NeRF}\n","authors":["Matteo Bortolon","Alessio Del Bue","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2210.04214v2.pdf","comment":"ICIAP 2023"},{"id":"http://arxiv.org/abs/2308.08224v1","updated":"2023-08-16T08:52:49Z","published":"2023-08-16T08:52:49Z","title":"How To Overcome Confirmation Bias in Semi-Supervised Image\n Classification By Active Learning","summary":" Do we need active learning? The rise of strong deep semi-supervised methods\nraises doubts about the usability of active learning in limited labeled data\nsettings. This is caused by results showing that combining semi-supervised\nlearning (SSL) methods with a random selection for labeling can outperform\nexisting active learning (AL) techniques. However, these results are obtained\nfrom experiments on well-established benchmark datasets that can overestimate\nexternal validity. Moreover, the literature lacks sufficient research on the\nperformance of active semi-supervised learning methods in realistic data\nscenarios, leaving a notable gap in our understanding. Therefore, we present\nthree data challenges common in real-world applications: between-class\nimbalance, within-class imbalance, and between-class similarity. These\nchallenges can hurt SSL performance due to confirmation bias. We conduct\nexperiments with SSL and AL on simulated data challenges and find that random\nsampling does not mitigate confirmation bias and, in some cases, leads to worse\nperformance than supervised learning. In contrast, we demonstrate that AL can\novercome confirmation bias in SSL in these realistic settings. Our results\nprovide insights into the potential of combining active and semi-supervised\nlearning in the presence of common real-world challenges, which is a promising\ndirection for robust methods when learning with limited labeled data in\nreal-world applications.\n","authors":["Sandra Gilhuber","Rasmus Hvingelby","Mang Ling Ada Fok","Thomas Seidl"],"pdf_url":"https://arxiv.org/pdf/2308.08224v1.pdf","comment":"Accepted @ ECML PKDD 2023. This is the author's version of the work.\n The definitive Version of Record will be published in the Proceedings of ECML\n PKDD 2023"},{"id":"http://arxiv.org/abs/2306.17558v2","updated":"2023-08-16T08:46:52Z","published":"2023-06-30T11:21:40Z","title":"Towards the extraction of robust sign embeddings for low resource sign\n language recognition","summary":" Isolated Sign Language Recognition (SLR) has mostly been applied on datasets\ncontaining signs executed slowly and clearly by a limited group of signers. In\nreal-world scenarios, however, we are met with challenging visual conditions,\ncoarticulated signing, small datasets, and the need for signer independent\nmodels. To tackle this difficult problem, we require a robust feature extractor\nto process the sign language videos. One could expect human pose estimators to\nbe ideal candidates. 
However, due to a domain mismatch with their training sets\nand challenging poses in sign language, they lack robustness on sign language\ndata and image-based models often still outperform keypoint-based models.\nFurthermore, whereas the common practice of transfer learning with image-based\nmodels yields even higher accuracy, keypoint-based models are typically trained\nfrom scratch on every SLR dataset. These factors limit their usefulness for\nSLR. From the existing literature, it is also not clear which, if any, pose\nestimator performs best for SLR. We compare the three most popular pose\nestimators for SLR: OpenPose, MMPose and MediaPipe. We show that through\nkeypoint normalization, missing keypoint imputation, and learning a pose\nembedding, we can obtain significantly better results and enable transfer\nlearning. We show that keypoint-based embeddings contain cross-lingual\nfeatures: they can transfer between sign languages and achieve competitive\nperformance even when fine-tuning only the classifier layer of an SLR model on\na target sign language. We furthermore achieve better performance using\nfine-tuned transferred embeddings than models trained only on the target sign\nlanguage. The embeddings can also be learned in a multilingual fashion. The\napplication of these embeddings could prove particularly useful for low\nresource sign languages in the future.\n","authors":["Mathieu De Coster","Ellen Rushe","Ruth Holmes","Anthony Ventresque","Joni Dambre"],"pdf_url":"https://arxiv.org/pdf/2306.17558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08220v1","updated":"2023-08-16T08:46:51Z","published":"2023-08-16T08:46:51Z","title":"Low-Light Image Enhancement with Illumination-Aware Gamma Correction and\n Complete Image Modelling Network","summary":" This paper presents a novel network structure with illumination-aware gamma\ncorrection and complete image modelling to solve the low-light image\nenhancement problem. Low-light environments usually lead to less informative\nlarge-scale dark areas, directly learning deep representations from low-light\nimages is insensitive to recovering normal illumination. We propose to\nintegrate the effectiveness of gamma correction with the strong modelling\ncapacities of deep networks, which enables the correction factor gamma to be\nlearned in a coarse to elaborate manner via adaptively perceiving the deviated\nillumination. Because exponential operation introduces high computational\ncomplexity, we propose to use Taylor Series to approximate gamma correction,\naccelerating the training and inference speed. Dark areas usually occupy large\nscales in low-light images, common local modelling structures, e.g., CNN,\nSwinIR, are thus insufficient to recover accurate illumination across whole\nlow-light images. 
We propose a novel Transformer block to completely simulate\nthe dependencies of all pixels across images via a local-to-global hierarchical\nattention mechanism, so that dark areas can be inferred by borrowing\ninformation from distant informative regions in a highly effective manner.\nExtensive experiments on several benchmark datasets demonstrate that our\napproach outperforms state-of-the-art methods.\n","authors":["Yinglong Wang","Zhen Liu","Jianzhuang Liu","Songcen Xu","Shuaicheng Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08220v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08213v1","updated":"2023-08-16T08:30:44Z","published":"2023-08-16T08:30:44Z","title":"MEDOE: A Multi-Expert Decoder and Output Ensemble Framework for\n Long-tailed Semantic Segmentation","summary":" The long-tailed distribution of semantic categories, which has often been ignored\nin conventional methods, causes unsatisfactory performance in semantic\nsegmentation on tail categories. In this paper, we focus on the problem of\nlong-tailed semantic segmentation. Although some long-tailed recognition\nmethods (e.g., re-sampling/re-weighting) have been proposed in other problems,\nthey are likely to compromise crucial contextual information and are thus hardly\nadaptable to the problem of long-tailed semantic segmentation. To address this\nissue, we propose MEDOE, a novel framework for long-tailed semantic\nsegmentation via contextual information ensemble-and-grouping. The proposed\ntwo-stage framework comprises a multi-expert decoder (MED) and a multi-expert\noutput ensemble (MOE). Specifically, the MED includes several \"experts\". Based\non the pixel frequency distribution, each expert takes the dataset masked\naccording to the specific categories as input and generates contextual\ninformation self-adaptively for classification; the MOE adopts learnable\ndecision weights for the ensemble of the experts' outputs. As a model-agnostic\nframework, our MEDOE can be flexibly and efficiently coupled with various\npopular deep neural networks (e.g., DeepLabv3+, OCRNet, and PSPNet) to improve\ntheir performance in long-tailed semantic segmentation. Experimental results\nshow that the proposed framework outperforms the current methods on both the\nCityscapes and ADE20K datasets by up to 1.78% in mIoU and 5.89% in mAcc.\n","authors":["Junao Shen","Long Chen","Kun Kuang","Fei Wu","Tian Feng","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08213v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.08210v1","updated":"2023-08-16T08:28:01Z","published":"2023-08-16T08:28:01Z","title":"Neural Spherical Harmonics for structurally coherent continuous\n representation of diffusion MRI signal","summary":" We present a novel way to model diffusion magnetic resonance imaging (dMRI)\ndatasets that benefits from the structural coherence of the human brain while\nonly using data from a single subject. Current methods model the dMRI signal in\nindividual voxels, disregarding the intervoxel coherence that is present. We\nuse a neural network to parameterize a spherical harmonics series (NeSH) to\nrepresent the dMRI signal of a single subject from the Human Connectome Project\ndataset, continuous in both the angular and spatial domain. The reconstructed\ndMRI signal using this method shows a more structurally coherent representation\nof the data. 
Noise in gradient images is removed and the fiber orientation\ndistribution functions show a smooth change in direction along a fiber tract.\nWe showcase how the reconstruction can be used to calculate mean diffusivity,\nfractional anisotropy, and total apparent fiber density. These results can be\nachieved with a single model architecture, tuning only one hyperparameter. In\nthis paper we also demonstrate how upsampling in both the angular and spatial\ndomain yields reconstructions that are on par or better than existing methods.\n","authors":["Tom Hendriks","Anna Villanova","Maxime Chamberland"],"pdf_url":"https://arxiv.org/pdf/2308.08210v1.pdf","comment":"12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.08206v1","updated":"2023-08-16T08:13:38Z","published":"2023-08-16T08:13:38Z","title":"Explainable Multi-View Deep Networks Methodology for Experimental\n Physics","summary":" Physical experiments often involve multiple imaging representations, such as\nX-ray scans and microscopic images. Deep learning models have been widely used\nfor supervised analysis in these experiments. Combining different image\nrepresentations is frequently required to analyze and make a decision properly.\nConsequently, multi-view data has emerged - datasets where each sample is\ndescribed by views from different angles, sources, or modalities. These\nproblems are addressed with the concept of multi-view learning. Understanding\nthe decision-making process of deep learning models is essential for reliable\nand credible analysis. Hence, many explainability methods have been devised\nrecently. Nonetheless, there is a lack of proper explainability in multi-view\nmodels, which are challenging to explain due to their architectures. In this\npaper, we suggest different multi-view architectures for the vision domain,\neach suited to another problem, and we also present a methodology for\nexplaining these models. To demonstrate the effectiveness of our methodology,\nwe focus on the domain of High Energy Density Physics (HEDP) experiments, where\nmultiple imaging representations are used to assess the quality of foam\nsamples. We apply our methodology to classify the foam samples quality using\nthe suggested multi-view architectures. Through experimental results, we\nshowcase the improvement of accurate architecture choice on both accuracy - 78%\nto 84% and AUC - 83% to 93% and present a trade-off between performance and\nexplainability. Specifically, we demonstrate that our approach enables the\nexplanation of individual one-view models, providing insights into the\ndecision-making process of each view. This understanding enhances the\ninterpretability of the overall multi-view model. The sources of this work are\navailable at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/Multi-View-Explainability.\n","authors":["Nadav Schneider","Muriel Tzdaka","Galit Sturm","Guy Lazovski","Galit Bar","Gilad Oren","Raz Gvishi","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2308.08206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10813v2","updated":"2023-08-16T08:02:02Z","published":"2023-06-19T10:03:11Z","title":"Instruct-NeuralTalker: Editing Audio-Driven Talking Radiance Fields with\n Instructions","summary":" Recent neural talking radiance field methods have shown great success in\nphotorealistic audio-driven talking face synthesis. 
In this paper, we propose a\nnovel interactive framework that utilizes human instructions to edit such\nimplicit neural representations to achieve real-time personalized talking face\ngeneration. Given a short speech video, we first build an efficient talking\nradiance field, and then apply the latest conditional diffusion model for image\nediting based on the given instructions and guiding implicit representation\noptimization towards the editing target. To ensure audio-lip synchronization\nduring the editing process, we propose an iterative dataset updating strategy\nand utilize a lip-edge loss to constrain changes in the lip region. We also\nintroduce a lightweight refinement network for complementing image details and\nachieving controllable detail generation in the final rendered image. Our\nmethod also enables real-time rendering at up to 30FPS on consumer hardware.\nMultiple metrics and user verification show that our approach provides a\nsignificant improvement in rendering quality compared to state-of-the-art\nmethods.\n","authors":["Yuqi Sun","Ruian He","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2306.10813v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.08197v1","updated":"2023-08-16T07:57:35Z","published":"2023-08-16T07:57:35Z","title":"Self-Reference Deep Adaptive Curve Estimation for Low-Light Image\n Enhancement","summary":" In this paper, we propose a 2-stage low-light image enhancement method called\nSelf-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage,\nwe present an intuitive, lightweight, fast, and unsupervised luminance\nenhancement algorithm. The algorithm is based on a novel low-light enhancement\ncurve that can be used to locally boost image brightness. We also propose a new\nloss function with a simplified physical model designed to preserve natural\nimages' color, structure, and fidelity. We use a vanilla CNN to map each pixel\nthrough deep Adaptive Adjustment Curves (AAC) while preserving the local image\nstructure. Secondly, we introduce the corresponding denoising scheme to remove\nthe latent noise in the darkness. We approximately model the noise in the dark\nand deploy a Denoising-Net to estimate and remove the noise after the first\nstage. Exhaustive qualitative and quantitative analysis shows that our method\noutperforms existing state-of-the-art algorithms on multiple real-world\ndatasets.\n","authors":["Jianyu Wen","Chenhao Wu","Tong Zhang","Yixuan Yu","Piotr Swierczynski"],"pdf_url":"https://arxiv.org/pdf/2308.08197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08192v1","updated":"2023-08-16T07:44:34Z","published":"2023-08-16T07:44:34Z","title":"Automatic Vision-Based Parking Slot Detection and Occupancy\n Classification","summary":" Parking guidance information (PGI) systems are used to provide information to\ndrivers about the nearest parking lots and the number of vacant parking slots.\nRecently, vision-based solutions started to appear as a cost-effective\nalternative to standard PGI systems based on hardware sensors mounted on each\nparking slot. Vision-based systems provide information about parking occupancy\nbased on images taken by a camera that is recording a parking lot. However,\nsuch systems are challenging to develop due to various possible viewpoints,\nweather conditions, and object occlusions. Most notably, they require manual\nlabeling of parking slot locations in the input image which is sensitive to\ncamera angle change, replacement, or maintenance. 
In this paper, an algorithm\nthat performs Automatic Parking Slot Detection and Occupancy Classification\n(APSD-OC) solely on input images is proposed. Automatic parking slot detection\nis based on vehicle detections in a series of parking lot images, upon which\nclustering is applied in bird's eye view to detect parking slots. Once the\nparking slot positions are determined in the input image, each detected\nparking slot is classified as occupied or vacant using a specifically trained\nResNet34 deep classifier. The proposed approach is extensively evaluated on\nwell-known publicly available datasets (PKLot and CNRPark+EXT), showing high\nefficiency in parking slot detection and robustness to the presence of illegal\nparking or passing vehicles. The trained classifier achieves high accuracy in\nparking slot occupancy classification.\n","authors":["Ratko Grbić","Brando Koch"],"pdf_url":"https://arxiv.org/pdf/2308.08192v1.pdf","comment":"39 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2308.08182v1","updated":"2023-08-16T07:21:25Z","published":"2023-08-16T07:21:25Z","title":"Unsupervised Domain Adaptive Detection with Network Stability Analysis","summary":" Domain adaptive detection aims to improve the generality of a detector,\nlearned from the labeled source domain, on the unlabeled target domain. In this\nwork, drawing inspiration from the concept of stability in control theory,\nwhereby a robust system must remain consistent both externally and\ninternally regardless of disturbances, we propose a novel framework that\nachieves unsupervised domain adaptive detection through stability analysis.\nSpecifically, we treat discrepancies between images and regions from different\ndomains as disturbances, and introduce a novel, simple but effective Network\nStability Analysis (NSA) framework that considers various disturbances for\ndomain adaptation. Particularly, we explore three types of perturbations\nincluding heavy and light image-level disturbances and instance-level\ndisturbance. For each type, NSA performs external consistency analysis on the\noutputs from raw and perturbed images and/or internal consistency analysis on\ntheir features, using teacher-student models. By integrating NSA into Faster\nR-CNN, we immediately achieve state-of-the-art results. In particular, we set a\nnew record of 52.7% mAP on Cityscapes-to-FoggyCityscapes, showing the potential\nof NSA for domain adaptive detection. It is worth noting that our NSA is designed\nfor general purposes and is thus applicable to one-stage detection models (e.g.,\nFCOS) besides the adopted one, as shown by experiments.\nhttps://github.com/tiankongzhang/NSA.\n","authors":["Wenzhang Zhou","Heng Fan","Tiejian Luo","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01103v2","updated":"2023-08-16T07:12:41Z","published":"2022-12-02T11:31:49Z","title":"3D-TOGO: Towards Text-Guided Cross-Category 3D Object Generation","summary":" Text-guided 3D object generation aims to generate 3D objects described by\nuser-defined captions, which provides a flexible way to visualize what we\nimagine. 
Although some works have been devoted to solving this challenging\ntask, these works either utilize explicit 3D representations (e.g., mesh),\nwhich lack texture and require post-processing for rendering photo-realistic\nviews, or require individual time-consuming optimization for every single case.\nHere, we make the first attempt to achieve generic text-guided cross-category\n3D object generation via a new 3D-TOGO model, which integrates a text-to-views\ngeneration module and a views-to-3D generation module. The text-to-views\ngeneration module is designed to generate different views of the target 3D\nobject given an input caption. Prior-guidance, caption-guidance and view\ncontrastive learning are proposed for achieving better view-consistency and\ncaption similarity. Meanwhile, a pixelNeRF model is adopted for the views-to-3D\ngeneration module to obtain the implicit 3D neural representation from the\npreviously-generated views. Our 3D-TOGO model generates 3D objects in the form\nof neural radiance fields with good texture and requires no time-consuming\noptimization for every single caption. Besides, 3D-TOGO can control the\ncategory, color and shape of generated 3D objects with the input caption.\nExtensive experiments on the largest 3D object dataset (i.e., ABO) are\nconducted to verify that 3D-TOGO can better generate high-quality 3D objects\naccording to the input captions across 98 different categories, in terms of\nPSNR, SSIM, LPIPS and CLIP-score, compared with text-NeRF and Dreamfields.\n","authors":["Zutao Jiang","Guansong Lu","Xiaodan Liang","Jihua Zhu","Wei Zhang","Xiaojun Chang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2212.01103v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08172v1","updated":"2023-08-16T07:02:02Z","published":"2023-08-16T07:02:02Z","title":"AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for\n Image Denoising, Semantic Segmentation, and Radiomics Evaluation","summary":" Methods: In this study, a benchmark \emph{Abdominal Adipose Tissue CT Image\nDataset} (AATTCT-IDS) containing 300 subjects is prepared and published.\nAATTCT-IDS releases 13,732 raw CT slices, and the researchers individually\nannotate the subcutaneous and visceral adipose tissue regions of 3,213 of those\nslices that have the same slice distance to validate denoising methods, train\nsemantic segmentation models, and study radiomics. For different tasks, this\npaper compares and analyzes the performance of various methods on AATTCT-IDS by\ncombining the visualization results and evaluation data. In this way, we verify the\nresearch potential of this dataset in the above three types of tasks.\n Results: In the comparative study of image denoising, algorithms using a\nsmoothing strategy suppress mixed noise at the expense of image details and\nobtain better evaluation data. Methods such as BM3D preserve the original image\nstructure better, although the evaluation data are slightly lower. The results\nshow significant differences among them. In the comparative study of semantic\nsegmentation of abdominal adipose tissue, the segmentation results of adipose\ntissue by each model show different structural characteristics. Among them,\nBiSeNet obtains segmentation results only slightly inferior to U-Net with the\nshortest training time and effectively separates small and isolated adipose\ntissue. 
In addition, the radiomics study based on AATTCT-IDS reveals three\nadipose distributions in the subject population.\n Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions in\nabdominal CT slices. This open-source dataset can attract researchers to\nexplore the multi-dimensional characteristics of abdominal adipose tissue and\nthus help physicians and patients in clinical practice. AATCT-IDS is freely\npublished for non-commercial purposes at:\n\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.\n","authors":["Zhiyu Ma","Chen Li","Tianming Du","Le Zhang","Dechao Tang","Deguo Ma","Shanchuan Huang","Yan Liu","Yihao Sun","Zhihao Chen","Jin Yuan","Qianqing Nie","Marcin Grzegorzek","Hongzan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.08172v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.07163v2","updated":"2023-08-16T06:52:21Z","published":"2023-08-14T14:18:11Z","title":"HyperSparse Neural Networks: Shifting Exploration to Exploitation\n through Adaptive Regularization","summary":" Sparse neural networks are a key factor in developing resource-efficient\nmachine learning applications. We propose the novel and powerful sparse\nlearning method Adaptive Regularized Training (ART) to compress dense networks into\nsparse networks. Instead of the commonly used binary mask during training to\nreduce the number of model weights, we inherently shrink weights close to zero\nin an iterative manner with increasing weight regularization. Our method\ncompresses the pre-trained model knowledge into the weights of highest\nmagnitude. Therefore, we introduce a novel regularization loss named\nHyperSparse that exploits the highest weights while conserving the ability of\nweight exploration. Extensive experiments on CIFAR and TinyImageNet show that\nour method leads to notable performance gains compared to other sparsification\nmethods, especially in extremely high sparsity regimes up to 99.8 percent model\nsparsity. Additional investigations provide new insights into the patterns that\nare encoded in weights with high magnitudes.\n","authors":["Patrick Glandorf","Timo Kaiser","Bodo Rosenhahn"],"pdf_url":"https://arxiv.org/pdf/2308.07163v2.pdf","comment":"ICCV'23 Workshops"},{"id":"http://arxiv.org/abs/2303.10385v2","updated":"2023-08-16T06:50:24Z","published":"2023-03-18T10:44:39Z","title":"Social Occlusion Inference with Vectorized Representation for Autonomous\n Driving","summary":" Autonomous vehicles must be capable of handling the occlusion of the\nenvironment to ensure safe and efficient driving. In urban environments,\nocclusion often arises due to other vehicles obscuring the perception of the\nego vehicle. Since the occlusion condition can impact the trajectories of\nvehicles, the behavior of other vehicles is helpful in making inferences about\nthe occlusion as a remedy for perceptual deficiencies. This paper introduces a\nnovel social occlusion inference approach that learns a mapping from agent\ntrajectories and scene context to an occupancy grid map (OGM) representing the\nview of the ego vehicle. Specifically, vectorized features are encoded through the\npolyline encoder to aggregate features of vectors into features of polylines. A\ntransformer module is then utilized to model the high-order interactions of\npolylines. Importantly, occlusion queries are proposed to fuse polyline\nfeatures and generate the OGM without the input of visual modality. 
To verify\nthe performance of vectorized representation, we design a baseline based on a\nfully transformer encoder-decoder architecture mapping the OGM with occlusion\nand historical trajectories information to the ground truth OGM. We evaluate\nour approach on an unsignalized intersection in the INTERACTION dataset, which\noutperforms the state-of-the-art results.\n","authors":["Bochao Huang"," Pin"],"pdf_url":"https://arxiv.org/pdf/2303.10385v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08162v1","updated":"2023-08-16T06:09:51Z","published":"2023-08-16T06:09:51Z","title":"Interpretability Benchmark for Evaluating Spatial Misalignment of\n Prototypical Parts Explanations","summary":" Prototypical parts-based networks are becoming increasingly popular due to\ntheir faithful self-explanations. However, their similarity maps are calculated\nin the penultimate network layer. Therefore, the receptive field of the\nprototype activation region often depends on parts of the image outside this\nregion, which can lead to misleading interpretations. We name this undesired\nbehavior a spatial explanation misalignment and introduce an interpretability\nbenchmark with a set of dedicated metrics for quantifying this phenomenon. In\naddition, we propose a method for misalignment compensation and apply it to\nexisting state-of-the-art models. We show the expressiveness of our benchmark\nand the effectiveness of the proposed compensation methodology through\nextensive empirical studies.\n","authors":["Mikołaj Sacha","Bartosz Jura","Dawid Rymarczyk","Łukasz Struski","Jacek Tabor","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2308.08162v1.pdf","comment":"Under review. Code will be release upon acceptance"},{"id":"http://arxiv.org/abs/2308.08157v1","updated":"2023-08-16T05:59:33Z","published":"2023-08-16T05:59:33Z","title":"Learning to Generate Semantic Layouts for Higher Text-Image\n Correspondence in Text-to-Image Synthesis","summary":" Existing text-to-image generation approaches have set high standards for\nphotorealism and text-image correspondence, largely benefiting from web-scale\ntext-image datasets, which can include up to 5~billion pairs. However,\ntext-to-image generation models trained on domain-specific datasets, such as\nurban scenes, medical images, and faces, still suffer from low text-image\ncorrespondence due to the lack of text-image pairs. Additionally, collecting\nbillions of text-image pairs for a specific domain can be time-consuming and\ncostly. Thus, ensuring high text-image correspondence without relying on\nweb-scale text-image datasets remains a challenging task. In this paper, we\npresent a novel approach for enhancing text-image correspondence by leveraging\navailable semantic layouts. Specifically, we propose a Gaussian-categorical\ndiffusion process that simultaneously generates both images and corresponding\nlayout pairs. Our experiments reveal that we can guide text-to-image generation\nmodels to be aware of the semantics of different image regions, by training the\nmodel to generate semantic labels for each pixel. We demonstrate that our\napproach achieves higher text-image correspondence compared to existing\ntext-to-image generation approaches in the Multi-Modal CelebA-HQ and the\nCityscapes dataset, where text-image pairs are scarce. 
Codes are available in\nthis https://pmh9960.github.io/research/GCDP\n","authors":["Minho Park","Jooyeol Yun","Seunghwan Choi","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.08157v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08154v1","updated":"2023-08-16T05:57:09Z","published":"2023-08-16T05:57:09Z","title":"Conditional Perceptual Quality Preserving Image Compression","summary":" We propose conditional perceptual quality, an extension of the perceptual\nquality defined in \\citet{blau2018perception}, by conditioning it on user\ndefined information. Specifically, we extend the original perceptual quality\n$d(p_{X},p_{\\hat{X}})$ to the conditional perceptual quality\n$d(p_{X|Y},p_{\\hat{X}|Y})$, where $X$ is the original image, $\\hat{X}$ is the\nreconstructed, $Y$ is side information defined by user and $d(.,.)$ is\ndivergence. We show that conditional perceptual quality has similar theoretical\nproperties as rate-distortion-perception trade-off \\citep{blau2019rethinking}.\nBased on these theoretical results, we propose an optimal framework for\nconditional perceptual quality preserving compression. Experimental results\nshow that our codec successfully maintains high perceptual quality and semantic\nquality at all bitrate. Besides, by providing a lowerbound of common randomness\nrequired, we settle the previous arguments on whether randomness should be\nincorporated into generator for (conditional) perceptual quality compression.\nThe source code is provided in supplementary material.\n","authors":["Tongda Xu","Qian Zhang","Yanghao Li","Dailan He","Zhe Wang","Yuanyuan Wang","Hongwei Qin","Yan Wang","Jingjing Liu","Ya-Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12027v2","updated":"2023-08-16T05:34:20Z","published":"2023-07-22T09:19:26Z","title":"On the Effectiveness of Spectral Discriminators for Perceptual Quality\n Improvement","summary":" Several recent studies advocate the use of spectral discriminators, which\nevaluate the Fourier spectra of images for generative modeling. However, the\neffectiveness of the spectral discriminators is not well interpreted yet. We\ntackle this issue by examining the spectral discriminators in the context of\nperceptual image super-resolution (i.e., GAN-based SR), as SR image quality is\nsusceptible to spectral changes. Our analyses reveal that the spectral\ndiscriminator indeed performs better than the ordinary (a.k.a. spatial)\ndiscriminator in identifying the differences in the high-frequency range;\nhowever, the spatial discriminator holds an advantage in the low-frequency\nrange. Thus, we suggest that the spectral and spatial discriminators shall be\nused simultaneously. Moreover, we improve the spectral discriminators by first\ncalculating the patch-wise Fourier spectrum and then aggregating the spectra by\nTransformer. We verify the effectiveness of the proposed method twofold. On the\none hand, thanks to the additional spectral discriminator, our obtained SR\nimages have their spectra better aligned to those of the real images, which\nleads to a better PD tradeoff. On the other hand, our ensembled discriminator\npredicts the perceptual quality more accurately, as evidenced in the\nno-reference image quality assessment task.\n","authors":["Xin Luo","Yunan Zhu","Shunxin Xu","Dong Liu"],"pdf_url":"https://arxiv.org/pdf/2307.12027v2.pdf","comment":"Accepted to ICCV 2023. 
Code and Models are publicly available at\n https://github.com/Luciennnnnnn/DualFormer"},{"id":"http://arxiv.org/abs/2308.07687v2","updated":"2023-08-16T05:24:46Z","published":"2023-08-15T10:37:04Z","title":"DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using\n Pre-trained Diffusion Models","summary":" Given a classifier, the inherent property of semantic Out-of-Distribution\n(OOD) samples is that their contents differ from all legal classes in terms of\nsemantics, namely semantic mismatch. There is a recent work that directly\napplies it to OOD detection, which employs a conditional Generative Adversarial\nNetwork (cGAN) to enlarge semantic mismatch in the image space. While achieving\nremarkable OOD detection performance on small datasets, it is not applicable to\nImageNet-scale datasets due to the difficulty in training cGANs with both input\nimages and labels as conditions. As diffusion models are much easier to train\nand amenable to various conditions compared to cGANs, in this work, we propose\nto directly use pre-trained diffusion models for semantic mismatch-guided OOD\ndetection, named DiffGuard. Specifically, given an OOD input image and the\npredicted label from the classifier, we try to enlarge the semantic difference\nbetween the reconstructed OOD image under these conditions and the original\ninput image. We also present several test-time techniques to further strengthen\nsuch differences. Experimental results show that DiffGuard is effective on both\nCifar-10 and hard cases of the large-scale ImageNet, and it can be easily\ncombined with existing OOD detection techniques to achieve state-of-the-art OOD\ndetection results.\n","authors":["Ruiyuan Gao","Chenchen Zhao","Lanqing Hong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07687v2.pdf","comment":"Accepted by ICCV2023, with supplementary materials"},{"id":"http://arxiv.org/abs/2307.16586v3","updated":"2023-08-16T04:53:42Z","published":"2023-07-31T11:40:53Z","title":"SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment\n Anything Model","summary":" Optical Flow Estimation aims to find the 2D dense motion field between two\nframes. Due to the limitation of model structures and training datasets,\nexisting methods often rely too much on local clues and ignore the integrity of\nobjects, resulting in fragmented motion estimation. Through theoretical\nanalysis, we find the pre-trained large vision models are helpful in optical\nflow estimation, and we notice that the recently famous Segment Anything Model\n(SAM) demonstrates a strong ability to segment complete objects, which is\nsuitable for solving the fragmentation problem. We thus propose a solution to\nembed the frozen SAM image encoder into FlowFormer to enhance object\nperception. To address the challenge of in-depth utilizing SAM in\nnon-segmentation tasks like optical flow estimation, we propose an Optical Flow\nTask-Specific Adaption scheme, including a Context Fusion Module to fuse the\nSAM encoder with the optical flow context encoder, and a Context Adaption\nModule to adapt the SAM features for optical flow task with Learned\nTask-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10\nclean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set,\nsurpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. 
Furthermore, our model\nachieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks,\nranking #1 among all two-frame methods on Sintel clean pass.\n","authors":["Shili Zhou","Ruian He","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2307.16586v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08143v1","updated":"2023-08-16T04:31:33Z","published":"2023-08-16T04:31:33Z","title":"SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech\n Separation","summary":" The integration of different modalities, such as audio and visual\ninformation, plays a crucial role in human perception of the surrounding\nenvironment. Recent research has made significant progress in designing fusion\nmodules for audio-visual speech separation. However, they predominantly focus\non multi-modal fusion architectures situated either at the top or bottom\npositions, rather than comprehensively considering multi-modal fusion at\nvarious hierarchical positions within the network. In this paper, we propose a\nnovel model called self- and cross-attention network (SCANet), which leverages\nthe attention mechanism for efficient audio-visual feature fusion. SCANet\nconsists of two types of attention blocks: self-attention (SA) and\ncross-attention (CA) blocks, where the CA blocks are distributed at the top\n(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the\nability to learn modality-specific features and enable the extraction of\ndifferent semantics from audio-visual features. Comprehensive experiments on\nthree standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)\ndemonstrate the effectiveness of SCANet, outperforming existing\nstate-of-the-art (SOTA) methods while maintaining comparable inference time.\n","authors":["Kai Li","Runxuan Yang","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08143v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08142v1","updated":"2023-08-16T04:27:44Z","published":"2023-08-16T04:27:44Z","title":"S2R: Exploring a Double-Win Transformer-Based Framework for Ideal and\n Blind Super-Resolution","summary":" Nowadays, deep learning based methods have demonstrated impressive\nperformance on ideal super-resolution (SR) datasets, but most of these methods\nincur dramatically performance drops when directly applied in real-world SR\nreconstruction tasks with unpredictable blur kernels. To tackle this issue,\nblind SR methods are proposed to improve the visual results on random blur\nkernels, which causes unsatisfactory reconstruction effects on ideal\nlow-resolution images similarly. In this paper, we propose a double-win\nframework for ideal and blind SR task, named S2R, including a light-weight\ntransformer-based SR model (S2R transformer) and a novel coarse-to-fine\ntraining strategy, which can achieve excellent visual results on both ideal and\nrandom fuzzy conditions. On algorithm level, S2R transformer smartly combines\nsome efficient and light-weight blocks to enhance the representation ability of\nextracted features with relatively low number of parameters. 
For training\nstrategy, a coarse-level learning process is firstly performed to improve the\ngeneralization of the network with the help of a large-scale external dataset,\nand then, a fast fine-tune process is developed to transfer the pre-trained\nmodel to real-world SR tasks by mining the internal features of the image.\nExperimental results show that the proposed S2R outperforms other single-image\nSR models in ideal SR condition with only 578K parameters. Meanwhile, it can\nachieve better visual results than regular blind SR models in blind fuzzy\nconditions with only 10 gradient updates, which improve convergence speed by\n300 times, significantly accelerating the transfer-learning process in\nreal-world situations.\n","authors":["Minghao She","Wendong Mao","Huihong Shi","Zhongfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08140v1","updated":"2023-08-16T04:15:21Z","published":"2023-08-16T04:15:21Z","title":"GPA-3D: Geometry-aware Prototype Alignment for Unsupervised Domain\n Adaptive 3D Object Detection from Point Clouds","summary":" LiDAR-based 3D detection has made great progress in recent years. However,\nthe performance of 3D detectors is considerably limited when deployed in unseen\nenvironments, owing to the severe domain gap problem. Existing domain adaptive\n3D detection methods do not adequately consider the problem of the\ndistributional discrepancy in feature space, thereby hindering generalization\nof detectors across domains. In this work, we propose a novel unsupervised\ndomain adaptive \\textbf{3D} detection framework, namely \\textbf{G}eometry-aware\n\\textbf{P}rototype \\textbf{A}lignment (\\textbf{GPA-3D}), which explicitly\nleverages the intrinsic geometric relationship from point cloud objects to\nreduce the feature discrepancy, thus facilitating cross-domain transferring.\nSpecifically, GPA-3D assigns a series of tailored and learnable prototypes to\npoint cloud objects with distinct geometric structures. Each prototype aligns\nBEV (bird's-eye-view) features derived from corresponding point cloud objects\non source and target domains, reducing the distributional discrepancy and\nachieving better adaptation. The evaluation results obtained on various\nbenchmarks, including Waymo, nuScenes and KITTI, demonstrate the superiority of\nour GPA-3D over the state-of-the-art approaches for different adaptation\nscenarios. The MindSpore version code will be publicly available at\n\\url{https://github.com/Liz66666/GPA3D}.\n","authors":["Ziyu Li","Jingming Guo","Tongtong Cao","Liu Bingbing","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08140v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08137v1","updated":"2023-08-16T04:03:59Z","published":"2023-08-16T04:03:59Z","title":"SYENet: A Simple Yet Effective Network for Multiple Low-Level Vision\n Tasks with Real-time Performance on Mobile Device","summary":" With the rapid development of AI hardware accelerators, applying deep\nlearning-based algorithms to solve various low-level vision tasks on mobile\ndevices has gradually become possible. However, two main problems still need to\nbe solved: task-specific algorithms make it difficult to integrate them into a\nsingle neural network architecture, and large amounts of parameters make it\ndifficult to achieve real-time inference. 
To tackle these problems, we propose\na novel network, SYENet, with only $~$6K parameters, to handle multiple\nlow-level vision tasks on mobile devices in a real-time manner. The SYENet\nconsists of two asymmetrical branches with simple building blocks. To\neffectively connect the results by asymmetrical branches, a Quadratic\nConnection Unit(QCU) is proposed. Furthermore, to improve performance, a new\nOutlier-Aware Loss is proposed to process the image. The proposed method proves\nits superior performance with the best PSNR as compared with other networks in\nreal-time applications such as Image Signal Processing(ISP), Low-Light\nEnhancement(LLE), and Super-Resolution(SR) with 2K60FPS throughput on Qualcomm\n8 Gen 1 mobile SoC(System-on-Chip). Particularly, for ISP task, SYENet got the\nhighest score in MAI 2022 Learned Smartphone ISP challenge.\n","authors":["Weiran Gou","Ziyao Yi","Yan Xiang","Shaoqing Li","Zibin Liu","Dehui Kong","Ke Xu"],"pdf_url":"https://arxiv.org/pdf/2308.08137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08131v1","updated":"2023-08-16T03:48:19Z","published":"2023-08-16T03:48:19Z","title":"Ranking-aware Uncertainty for Text-guided Image Retrieval","summary":" Text-guided image retrieval is to incorporate conditional text to better\ncapture users' intent. Traditionally, the existing methods focus on minimizing\nthe embedding distances between the source inputs and the targeted image, using\nthe provided triplets $\\langle$source image, source text, target\nimage$\\rangle$. However, such triplet optimization may limit the learned\nretrieval model to capture more detailed ranking information, e.g., the\ntriplets are one-to-one correspondences and they fail to account for\nmany-to-many correspondences arising from semantic diversity in feedback\nlanguages and images. To capture more ranking information, we propose a novel\nranking-aware uncertainty approach to model many-to-many correspondences by\nonly using the provided triplets. We introduce uncertainty learning to learn\nthe stochastic ranking list of features. Specifically, our approach mainly\ncomprises three components: (1) In-sample uncertainty, which aims to capture\nsemantic diversity using a Gaussian distribution derived from both combined and\ntarget features; (2) Cross-sample uncertainty, which further mines the ranking\ninformation from other samples' distributions; and (3) Distribution\nregularization, which aligns the distributional representations of source\ninputs and targeted image. Compared to the existing state-of-the-art methods,\nour proposed method achieves significant results on two public datasets for\ncomposed image retrieval.\n","authors":["Junyang Chen","Hanjiang Lai"],"pdf_url":"https://arxiv.org/pdf/2308.08131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06567v2","updated":"2023-08-16T03:45:46Z","published":"2023-01-16T19:04:23Z","title":"Scalable Surface Water Mapping up to Fine-scale using Geometric Features\n of Water from Topographic Airborne LiDAR Data","summary":" Despite substantial technological advancements, the comprehensive mapping of\nsurface water, particularly smaller bodies (<1ha), continues to be a challenge\ndue to a lack of robust, scalable methods. Standard methods require either\ntraining labels or site-specific parameter tuning, which complicates automated\nmapping and introduces biases related to training data and parameters. 
The\nreliance on water's reflectance properties, including LiDAR intensity, further\ncomplicates the matter, as higher-resolution images inherently produce more\nnoise. To mitigate these difficulties, we propose a unique method that focuses\non the geometric characteristics of water instead of its variable reflectance\nproperties. Unlike preceding approaches, our approach relies entirely on 3D\ncoordinate observations from airborne LiDAR data, taking advantage of the\nprinciple that connected surface water remains flat due to gravity. By\nharnessing this natural law in conjunction with connectivity, our method can\naccurately and scalably identify small water bodies, eliminating the need for\ntraining labels or repetitive parameter tuning. Consequently, our approach\nenables the creation of comprehensive 3D topographic maps that include both\nwater and terrain, all performed in an unsupervised manner using only airborne\nlaser scanning data, potentially enhancing the process of generating reliable\n3D topographic maps. We validated our method across extensive and diverse\nlandscapes, while comparing it to highly competitive Normalized Difference\nWater Index (NDWI)-based methods and assessing it using a reference surface\nwater map. In conclusion, our method offers a new approach to address\npersistent difficulties in robust, scalable surface water mapping and 3D\ntopographic mapping, using solely airborne LiDAR data.\n","authors":["Hunsoo Song","Jinha Jung"],"pdf_url":"https://arxiv.org/pdf/2301.06567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14585v3","updated":"2023-08-16T03:40:41Z","published":"2022-05-29T07:00:34Z","title":"An unsupervised, open-source workflow for 2D and 3D building mapping\n from airborne LiDAR data","summary":" Despite the substantial demand for high-quality, large-area building maps, no\nestablished open-source workflow for generating 2D and 3D maps currently\nexists. This study introduces an automated, open-source workflow for\nlarge-scale 2D and 3D building mapping utilizing airborne LiDAR data. Uniquely,\nour workflow operates entirely unsupervised, eliminating the need for any\ntraining procedures. We have integrated a specifically tailored DTM generation\nalgorithm into our workflow to prevent errors in complex urban landscapes,\nespecially around highways and overpasses. Through fine rasterization of LiDAR\npoint clouds, we've enhanced building-tree differentiation, reduced errors near\nwater bodies, and augmented computational efficiency by introducing a new\nplanarity calculation. Our workflow offers a practical and scalable solution\nfor the mass production of rasterized 2D and 3D building maps from raw airborne\nLiDAR data. Also, we elaborate on the influence of parameters and potential\nerror sources to provide users with practical guidance. Our method's robustness\nhas been rigorously optimized and tested using an extensive dataset (> 550\nkm$^2$), and further validated through comparison with deep learning-based and\nhand-digitized products. Notably, through these unparalleled, large-scale\ncomparisons, we offer a valuable analysis of large-scale building maps\ngenerated via different methodologies, providing insightful evaluations of the\neffectiveness of each approach. We anticipate that our highly scalable building\nmapping workflow will facilitate the production of reliable 2D and 3D building\nmaps, fostering advances in large-scale urban analysis. 
The code will be\nreleased upon publication.\n","authors":["Hunsoo Song","Jinha Jung"],"pdf_url":"https://arxiv.org/pdf/2205.14585v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08114v1","updated":"2023-08-16T02:58:43Z","published":"2023-08-16T02:58:43Z","title":"OmniZoomer: Learning to Move and Zoom in on Sphere at High-Resolution","summary":" Omnidirectional images (ODIs) have become increasingly popular, as their\nlarge field-of-view (FoV) can offer viewers the chance to freely choose the\nview directions in immersive environments such as virtual reality. The M\\\"obius\ntransformation is typically employed to further provide the opportunity for\nmovement and zoom on ODIs, but applying it to the image level often results in\nblurry effect and aliasing problem. In this paper, we propose a novel deep\nlearning-based approach, called \\textbf{OmniZoomer}, to incorporate the\nM\\\"obius transformation into the network for movement and zoom on ODIs. By\nlearning various transformed feature maps under different conditions, the\nnetwork is enhanced to handle the increasing edge curvatures, which alleviates\nthe blurry effect. Moreover, to address the aliasing problem, we propose two\nkey components. Firstly, to compensate for the lack of pixels for describing\ncurves, we enhance the feature maps in the high-resolution (HR) space and\ncalculate the transformed index map with a spatial index generation module.\nSecondly, considering that ODIs are inherently represented in the spherical\nspace, we propose a spherical resampling module that combines the index map and\nHR feature maps to transform the feature maps for better spherical correlation.\nThe transformed feature maps are decoded to output a zoomed ODI. Experiments\nshow that our method can produce HR and high-quality ODIs with the flexibility\nto move and zoom in to the object of interest. Project page is available at\nhttp://vlislab22.github.io/OmniZoomer/.\n","authors":["Zidong Cao","Hao Ai","Yan-Pei Cao","Ying Shan","Xiaohu Qie","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08114v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08110v1","updated":"2023-08-16T02:51:52Z","published":"2023-08-16T02:51:52Z","title":"View Consistent Purification for Accurate Cross-View Localization","summary":" This paper proposes a fine-grained self-localization method for outdoor\nrobotics that utilizes a flexible number of onboard cameras and readily\naccessible satellite images. The proposed method addresses limitations in\nexisting cross-view localization methods that struggle to handle noise sources\nsuch as moving objects and seasonal variations. It is the first sparse\nvisual-only method that enhances perception in dynamic environments by\ndetecting view-consistent key points and their corresponding deep features from\nground and satellite views, while removing off-the-ground objects and\nestablishing homography transformation between the two views. Moreover, the\nproposed method incorporates a spatial embedding approach that leverages camera\nintrinsic and extrinsic information to reduce the ambiguity of purely visual\nmatching, leading to improved feature matching and overall pose estimation\naccuracy. The method exhibits strong generalization and is robust to\nenvironmental changes, requiring only geo-poses as ground truth. 
Extensive\nexperiments on the KITTI and Ford Multi-AV Seasonal datasets demonstrate that\nour proposed method outperforms existing state-of-the-art methods, achieving\nmedian spatial accuracy errors below $0.5$ meters along the lateral and\nlongitudinal directions, and a median orientation accuracy error below 2\ndegrees.\n","authors":["Shan Wang","Yanhao Zhang","Akhil Perincherry","Ankit Vora","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2308.08110v1.pdf","comment":"Accepted for ICCV 2023"},{"id":"http://arxiv.org/abs/2307.14051v4","updated":"2023-08-16T02:29:50Z","published":"2023-07-26T09:04:27Z","title":"3D Semantic Subspace Traverser: Empowering 3D Generative Model with\n Shape Editing Capability","summary":" Shape generation is the practice of producing 3D shapes as various\nrepresentations for 3D content creation. Previous studies on 3D shape\ngeneration have focused on shape quality and structure, without or less\nconsidering the importance of semantic information. Consequently, such\ngenerative models often fail to preserve the semantic consistency of shape\nstructure or enable manipulation of the semantic attributes of shapes during\ngeneration. In this paper, we proposed a novel semantic generative model named\n3D Semantic Subspace Traverser that utilizes semantic attributes for\ncategory-specific 3D shape generation and editing. Our method utilizes implicit\nfunctions as the 3D shape representation and combines a novel latent-space GAN\nwith a linear subspace model to discover semantic dimensions in the local\nlatent space of 3D shapes. Each dimension of the subspace corresponds to a\nparticular semantic attribute, and we can edit the attributes of generated\nshapes by traversing the coefficients of those dimensions. Experimental results\ndemonstrate that our method can produce plausible shapes with complex\nstructures and enable the editing of semantic attributes. The code and trained\nmodels are available at\nhttps://github.com/TrepangCat/3D_Semantic_Subspace_Traverser\n","authors":["Ruowei Wang","Yu Liu","Pei Su","Jianwei Zhang","Qijun Zhao"],"pdf_url":"https://arxiv.org/pdf/2307.14051v4.pdf","comment":"Published in ICCV 2023. Code:\n https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser"},{"id":"http://arxiv.org/abs/2308.08094v1","updated":"2023-08-16T02:04:34Z","published":"2023-08-16T02:04:34Z","title":"Snapshot High Dynamic Range Imaging with a Polarization Camera","summary":" High dynamic range (HDR) images are important for a range of tasks, from\nnavigation to consumer photography. Accordingly, a host of specialized HDR\nsensors have been developed, the most successful of which are based on\ncapturing variable per-pixel exposures. In essence, these methods capture an\nentire exposure bracket sequence at once in a single shot. This paper presents\na straightforward but highly effective approach for turning an off-the-shelf\npolarization camera into a high-performance HDR camera. By placing a linear\npolarizer in front of the polarization camera, we are able to simultaneously\ncapture four images with varied exposures, which are determined by the\norientation of the polarizer. We develop an outlier-robust and self-calibrating\nalgorithm to reconstruct an HDR image (at a single polarity) from these\nmeasurements. 
Finally, we demonstrate the efficacy of our approach with\nextensive real-world experiments.\n","authors":["Mingyang Xie","Matthew Chan","Christopher Metzler"],"pdf_url":"https://arxiv.org/pdf/2308.08094v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2304.06906v3","updated":"2023-08-16T01:53:02Z","published":"2023-04-14T02:49:08Z","title":"Swin3D: A Pretrained Transformer Backbone for 3D Indoor Scene\n Understanding","summary":" The use of pretrained backbones with fine-tuning has been successful for 2D\nvision and natural language processing tasks, showing advantages over\ntask-specific networks. In this work, we introduce a pretrained 3D backbone,\ncalled {\\SST}, for 3D indoor scene understanding. We design a 3D Swin\ntransformer as our backbone network, which enables efficient self-attention on\nsparse voxels with linear memory complexity, making the backbone scalable to\nlarge models and datasets. We also introduce a generalized contextual relative\npositional embedding scheme to capture various irregularities of point signals\nfor improved network performance. We pretrained a large {\\SST} model on a\nsynthetic Structured3D dataset, which is an order of magnitude larger than the\nScanNet dataset. Our model pretrained on the synthetic dataset not only\ngeneralizes well to downstream segmentation and detection on real 3D point\ndatasets, but also outperforms state-of-the-art methods on downstream tasks\nwith +2.3 mIoU and +2.2 mIoU on S3DIS Area5 and 6-fold semantic segmentation,\n+1.8 mIoU on ScanNet segmentation (val), +1.9 mAP@0.5 on ScanNet detection, and\n+8.1 mAP@0.5 on S3DIS detection. A series of extensive ablation studies further\nvalidate the scalability, generality, and superior performance enabled by our\napproach. The code and models are available at\nhttps://github.com/microsoft/Swin3D .\n","authors":["Yu-Qi Yang","Yu-Xiao Guo","Jian-Yu Xiong","Yang Liu","Hao Pan","Peng-Shuai Wang","Xin Tong","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2304.06906v3.pdf","comment":"Project page: https://yukichiii.github.io/project/swin3D/swin3D.html"},{"id":"http://arxiv.org/abs/2308.08089v1","updated":"2023-08-16T01:43:41Z","published":"2023-08-16T01:43:41Z","title":"DragNUWA: Fine-grained Control in Video Generation by Integrating Text,\n Image, and Trajectory","summary":" Controllable video generation has gained significant attention in recent\nyears. However, two main limitations persist: Firstly, most existing works\nfocus on either text, image, or trajectory-based control, leading to an\ninability to achieve fine-grained control in videos. Secondly, trajectory\ncontrol research is still in its early stages, with most experiments being\nconducted on simple datasets like Human3.6M. This constraint limits the models'\ncapability to process open-domain images and effectively handle complex curved\ntrajectories. In this paper, we propose DragNUWA, an open-domain\ndiffusion-based video generation model. To tackle the issue of insufficient\ncontrol granularity in existing works, we simultaneously introduce text, image,\nand trajectory information to provide fine-grained control over video content\nfrom semantic, spatial, and temporal perspectives. 
To resolve the problem of\nlimited open-domain trajectory control in current research, We propose\ntrajectory modeling with three aspects: a Trajectory Sampler (TS) to enable\nopen-domain control of arbitrary trajectories, a Multiscale Fusion (MF) to\ncontrol trajectories in different granularities, and an Adaptive Training (AT)\nstrategy to generate consistent videos following trajectories. Our experiments\nvalidate the effectiveness of DragNUWA, demonstrating its superior performance\nin fine-grained control in video generation. The homepage link is\n\\url{https://www.microsoft.com/en-us/research/project/dragnuwa/}\n","authors":["Shengming Yin","Chenfei Wu","Jian Liang","Jie Shi","Houqiang Li","Gong Ming","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2308.08089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08088v1","updated":"2023-08-16T01:38:49Z","published":"2023-08-16T01:38:49Z","title":"Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme\n Detection","summary":" Hateful meme detection is a challenging multimodal task that requires\ncomprehension of both vision and language, as well as cross-modal interactions.\nRecent studies have tried to fine-tune pre-trained vision-language models\n(PVLMs) for this task. However, with increasing model sizes, it becomes\nimportant to leverage powerful PVLMs more efficiently, rather than simply\nfine-tuning them. Recently, researchers have attempted to convert meme images\ninto textual captions and prompt language models for predictions. This approach\nhas shown good performance but suffers from non-informative image captions.\nConsidering the two factors mentioned above, we propose a probing-based\ncaptioning approach to leverage PVLMs in a zero-shot visual question answering\n(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful\ncontent-related questions and use the answers as image captions (which we call\nPro-Cap), so that the captions contain information critical for hateful content\ndetection. The good performance of models with Pro-Cap on three benchmarks\nvalidates the effectiveness and generalization of the proposed method.\n","authors":["Rui Cao","Ming Shan Hee","Adriel Kuek","Wen-Haw Chong","Roy Ka-Wei Lee","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08088v1.pdf","comment":"Camera-ready for 23, ACM MM"},{"id":"http://arxiv.org/abs/2308.07439v2","updated":"2023-08-16T01:29:39Z","published":"2023-08-14T20:20:26Z","title":"Interaction-Aware Personalized Vehicle Trajectory Prediction Using\n Temporal Graph Neural Networks","summary":" Accurate prediction of vehicle trajectories is vital for advanced driver\nassistance systems and autonomous vehicles. Existing methods mainly rely on\ngeneric trajectory predictions derived from large datasets, overlooking the\npersonalized driving patterns of individual drivers. To address this gap, we\npropose an approach for interaction-aware personalized vehicle trajectory\nprediction that incorporates temporal graph neural networks. Our method\nutilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to\nmodel the spatio-temporal interactions between target vehicles and their\nsurrounding traffic. To personalize the predictions, we establish a pipeline\nthat leverages transfer learning: the model is initially pre-trained on a\nlarge-scale trajectory dataset and then fine-tuned for each driver using their\nspecific driving data. 
We employ human-in-the-loop simulation to collect\npersonalized naturalistic driving trajectories and corresponding surrounding\nvehicle trajectories. Experimental results demonstrate the superior performance\nof our personalized GCN-LSTM model, particularly for longer prediction\nhorizons, compared to its generic counterpart. Moreover, the personalized model\noutperforms individual models created without pre-training, emphasizing the\nsignificance of pre-training on a large dataset to avoid overfitting. By\nincorporating personalization, our approach enhances trajectory prediction\naccuracy.\n","authors":["Amr Abdelraouf","Rohit Gupta","Kyungtae Han"],"pdf_url":"https://arxiv.org/pdf/2308.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16751v2","updated":"2023-08-16T01:25:03Z","published":"2023-07-31T15:18:54Z","title":"High-Performance Fine Defect Detection in Artificial Leather Using Dual\n Feature Pool Object Detection","summary":" In this study, the structural problems of the YOLOv5 model were analyzed\nemphatically. Based on the characteristics of fine defects in artificial\nleather, four innovative structures, namely DFP, IFF, AMP, and EOS, were\ndesigned. These advancements led to the proposal of a high-performance\nartificial leather fine defect detection model named YOLOD. YOLOD demonstrated\noutstanding performance on the artificial leather defect dataset, achieving an\nimpressive increase of 11.7% - 13.5% in AP_50 compared to YOLOv5, along with a\nsignificant reduction of 5.2% - 7.2% in the error detection rate. Moreover,\nYOLOD also exhibited remarkable performance on the general MS-COCO dataset,\nwith an increase of 0.4% - 2.6% in AP compared to YOLOv5, and a rise of 2.5% -\n4.1% in AP_S compared to YOLOv5. These results demonstrate the superiority of\nYOLOD in both artificial leather defect detection and general object detection\ntasks, making it a highly efficient and effective model for real-world\napplications.\n","authors":["Lin Huang","Weisheng Li","Linlin Shen","Xue Xiao","Suihan Xiao"],"pdf_url":"https://arxiv.org/pdf/2307.16751v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04170v4","updated":"2023-08-16T01:10:43Z","published":"2023-05-07T03:00:06Z","title":"YOLOCS: Object Detection based on Dense Channel Compression for Feature\n Spatial Solidification","summary":" In this study, we examine the associations between channel features and\nconvolutional kernels during the processes of feature purification and gradient\nbackpropagation, with a focus on the forward and backward propagation within\nthe network. Consequently, we propose a method called Dense Channel Compression\nfor Feature Spatial Solidification. Drawing upon the central concept of this\nmethod, we introduce two innovative modules for backbone and head networks: the\nDense Channel Compression for Feature Spatial Solidification Structure (DCFS)\nand the Asymmetric Multi-Level Compression Decoupled Head (ADH). When\nintegrated into the YOLOv5 model, these two modules demonstrate exceptional\nperformance, resulting in a modified model referred to as YOLOCS. Evaluated on\nthe MSCOCO dataset, the large, medium, and small YOLOCS models yield AP of\n50.1%, 47.6%, and 42.5%, respectively. 
Maintaining inference speeds remarkably\nsimilar to those of the YOLOv5 model, the large, medium, and small YOLOCS\nmodels surpass the YOLOv5 model's AP by 1.1%, 2.3%, and 5.2%, respectively.\n","authors":["Lin Huang","Weisheng Li","Linlin Shen","Haojie Fu","Xue Xiao","Suihan Xiao"],"pdf_url":"https://arxiv.org/pdf/2305.04170v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13855v2","updated":"2023-08-16T00:57:52Z","published":"2023-05-23T09:23:05Z","title":"A Two-Step Deep Learning Method for 3DCT-2DUS Kidney Registration During\n Breathing","summary":" This work proposed a novel deep registration pipeline for 3D CT and 2D U/S\nkidney scans of free breathing, which consists of a feature network, and a\n3D-2D CNN-based registration network. The feature network has handcraft texture\nfeature layers to reduce the semantic gap. The registration network is\nencoder-decoder structure with loss of feature-image-motion (FIM), which\nenables hierarchical regression at decoder layers and avoids multiple network\nconcatenation. It was first pretrained with retrospective datasets cum training\ndata generation strategy, then adapted to specific patient data under\nunsupervised one-cycle transfer learning in onsite application. The experiment\nwas on 132 U/S sequences, 39 multiple phase CT and 210 public single phase CT\nimages, and 25 pairs of CT and U/S sequences. It resulted in mean contour\ndistance (MCD) of 0.94 mm between kidneys on CT and U/S images and MCD of 1.15\nmm on CT and reference CT images. For datasets with small transformations, it\nresulted in MCD of 0.82 and 1.02 mm respectively. For large transformations, it\nresulted in MCD of 1.10 and 1.28 mm respectively. This work addressed\ndifficulties in 3DCT-2DUS kidney registration during free breathing via novel\nnetwork structures and training strategy.\n","authors":["Chi Yanling","Xu Yuyu","Liu Huiying","Wu Xiaoxiang","Liu Zhiqiang","Mao Jiawei","Xu Guibin","Huang Weimin"],"pdf_url":"https://arxiv.org/pdf/2305.13855v2.pdf","comment":"16 pages, 8 figures, 10 tables"},{"id":"http://arxiv.org/abs/2307.04246v3","updated":"2023-08-16T00:46:04Z","published":"2023-07-09T18:52:01Z","title":"Convex Decomposition of Indoor Scenes","summary":" We describe a method to parse a complex, cluttered indoor scene into\nprimitives which offer a parsimonious abstraction of scene structure. Our\nprimitives are simple convexes. Our method uses a learned regression procedure\nto parse a scene into a fixed number of convexes from RGBD input, and can\noptionally accept segmentations to improve the decomposition. The result is\nthen polished with a descent method which adjusts the convexes to produce a\nvery good fit, and greedily removes superfluous primitives. Because the entire\nscene is parsed, we can evaluate using traditional depth, normal, and\nsegmentation error metrics. Our evaluation procedure demonstrates that the\nerror from our primitive representation is comparable to that of predicting\ndepth from a single image.\n","authors":["Vaibhav Vavilala","David Forsyth"],"pdf_url":"https://arxiv.org/pdf/2307.04246v3.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2303.13516v3","updated":"2023-08-16T00:00:47Z","published":"2023-03-23T17:59:42Z","title":"Ablating Concepts in Text-to-Image Diffusion Models","summary":" Large-scale text-to-image diffusion models can generate high-fidelity images\nwith powerful compositional ability. 
However, these models are typically\ntrained on an enormous amount of Internet data, often containing copyrighted\nmaterial, licensed images, and personal photos. Furthermore, they have been\nfound to replicate the style of various living artists or memorize exact\ntraining samples. How can we remove such copyrighted concepts or images without\nretraining the model from scratch? To achieve this goal, we propose an\nefficient method of ablating concepts in the pretrained model, i.e., preventing\nthe generation of a target concept. Our algorithm learns to match the image\ndistribution for a target style, instance, or text prompt we wish to ablate to\nthe distribution corresponding to an anchor concept. This prevents the model\nfrom generating target concepts given its text condition. Extensive experiments\nshow that our method can successfully prevent the generation of the ablated\nconcept while preserving closely related concepts in the model.\n","authors":["Nupur Kumari","Bingliang Zhang","Sheng-Yu Wang","Eli Shechtman","Richard Zhang","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.13516v3.pdf","comment":"ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2307.15780v2","updated":"2023-08-16T17:59:07Z","published":"2023-07-24T18:47:38Z","title":"LLM-Rec: Personalized Recommendation via Prompting Large Language Models","summary":" We investigate various prompting strategies for enhancing personalized\nrecommendation performance with large language models (LLMs) through input\naugmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct\nprompting strategies: (1) basic prompting, (2) recommendation-driven prompting,\n(3) engagement-guided prompting, and (4) recommendation-driven +\nengagement-guided prompting. Our empirical experiments show that incorporating\nthe augmented input text generated by LLM leads to improved recommendation\nperformance. Recommendation-driven and engagement-guided prompting strategies\nare found to elicit LLM's understanding of global and local item\ncharacteristics. This finding highlights the importance of leveraging diverse\nprompts and input augmentation techniques to enhance the recommendation\ncapabilities with LLMs.\n","authors":["Hanjia Lyu","Song Jiang","Hanqing Zeng","Qifan Wang","Si Zhang","Ren Chen","Chris Leung","Jiajie Tang","Yinglong Xia","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2307.15780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08434v1","updated":"2023-08-16T15:28:22Z","published":"2023-08-16T15:28:22Z","title":"A Bi-Step Grounding Paradigm for Large Language Models in Recommendation\n Systems","summary":" As the focus on Large Language Models (LLMs) in the field of recommendation\nintensifies, the optimization of LLMs for recommendation purposes (referred to\nas LLM4Rec) assumes a crucial role in augmenting their effectiveness in\nproviding recommendations. However, existing approaches for LLM4Rec often\nassess performance using restricted sets of candidates, which may not\naccurately reflect the models' overall ranking capabilities. In this paper, our\nobjective is to investigate the comprehensive ranking capacity of LLMs and\npropose a two-step grounding framework known as BIGRec (Bi-step Grounding\nParadigm for Recommendation). 
It initially grounds LLMs to the recommendation\nspace by fine-tuning them to generate meaningful tokens for items and\nsubsequently identifies appropriate actual items that correspond to the\ngenerated tokens. By conducting extensive experiments on two datasets, we\nsubstantiate the superior performance, capacity for handling few-shot\nscenarios, and versatility across multiple domains exhibited by BIGRec.\nFurthermore, we observe that the marginal benefits derived from increasing the\nquantity of training samples are modest for BIGRec, implying that LLMs possess\nthe limited capability to assimilate statistical information, such as\npopularity and collaborative filtering, due to their robust semantic priors.\nThese findings also underline the efficacy of integrating diverse statistical\ninformation into the LLM4Rec framework, thereby pointing towards a potential\navenue for future research. Our code and data are available at\nhttps://github.com/SAI990323/Grounding4Rec.\n","authors":["Keqin Bao","Jizhi Zhang","Wenjie Wang","Yang Zhang","Zhengyi Yang","Yancheng Luo","Fuli Feng","Xiangnaan He","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2308.08434v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2308.08413v1","updated":"2023-08-16T14:58:12Z","published":"2023-08-16T14:58:12Z","title":"Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value\n Extraction","summary":" Existing attribute-value extraction (AVE) models require large quantities of\nlabeled data for training. However, new products with new attribute-value pairs\nenter the market every day in real-world e-Commerce. Thus, we formulate AVE in\nmulti-label few-shot learning (FSL), aiming to extract unseen attribute value\npairs based on a small number of training examples. We propose a\nKnowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks,\nleveraging the generated label description and category information to learn\nmore discriminative prototypes. Besides, KEAF integrates with hybrid attention\nto reduce noise and capture more informative semantics for each class by\ncalculating the label-relevant and query-related weights. To achieve\nmulti-label inference, KEAF further learns a dynamic threshold by integrating\nthe semantic information from both the support set and the query set. Extensive\nexperiments with ablation studies conducted on two datasets demonstrate that\nKEAF outperforms other SOTA models for information extraction in FSL. The code\ncan be found at: https://github.com/gjiaying/KEAF\n","authors":["Jiaying Gong","Wei-Te Chen","Hoda Eldardiry"],"pdf_url":"https://arxiv.org/pdf/2308.08413v1.pdf","comment":"6 pages, 2 figures, published in CIKM 2023"},{"id":"http://arxiv.org/abs/2308.08406v1","updated":"2023-08-16T14:50:51Z","published":"2023-08-16T14:50:51Z","title":"Content-based Recommendation Engine for Video Streaming Platform","summary":" Recommendation engine suggest content, product or services to the user by\nusing machine learning algorithm. This paper proposed a content-based\nrecommendation engine for providing video suggestion to the user based on their\nprevious interests and choices. We will use TF-IDF text vectorization method to\ndetermine the relevance of words in a document. Then we will find out the\nsimilarity between each content by calculating cosine similarity between them.\nFinally, engine will recommend videos to the users based on the obtained\nsimilarity score value. 
In addition, we will measure the engine's performance\nby computing precision, recall, and F1 core of the proposed system.\n","authors":["Puskal Khadka","Prabhav Lamichhane"],"pdf_url":"https://arxiv.org/pdf/2308.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08378v1","updated":"2023-08-16T14:01:25Z","published":"2023-08-16T14:01:25Z","title":"Advancing continual lifelong learning in neural information retrieval:\n definition, dataset, framework, and empirical evaluation","summary":" Continual learning refers to the capability of a machine learning model to\nlearn and adapt to new information, without compromising its performance on\npreviously learned tasks. Although several studies have investigated continual\nlearning methods for information retrieval tasks, a well-defined task\nformulation is still lacking, and it is unclear how typical learning strategies\nperform in this context. To address this challenge, a systematic task\nformulation of continual neural information retrieval is presented, along with\na multiple-topic dataset that simulates continuous information retrieval. A\ncomprehensive continual neural information retrieval framework consisting of\ntypical retrieval models and continual learning strategies is then proposed.\nEmpirical evaluations illustrate that the proposed framework can successfully\nprevent catastrophic forgetting in neural information retrieval and enhance\nperformance on previously learned tasks. The results indicate that\nembedding-based retrieval models experience a decline in their continual\nlearning performance as the topic shift distance and dataset volume of new\ntasks increase. In contrast, pretraining-based models do not show any such\ncorrelation. Adopting suitable learning strategies can mitigate the effects of\ntopic shift and data augmentation.\n","authors":["Jingrui Hou","Georgina Cosma","Axel Finke"],"pdf_url":"https://arxiv.org/pdf/2308.08378v1.pdf","comment":"Submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2308.08354v1","updated":"2023-08-16T13:24:47Z","published":"2023-08-16T13:24:47Z","title":"Is Meta-Learning the Right Approach for the Cold-Start Problem in\n Recommender Systems?","summary":" Recommender systems have become fundamental building blocks of modern online\nproducts and services, and have a substantial impact on user experience. In the\npast few years, deep learning methods have attracted a lot of research, and are\nnow heavily used in modern real-world recommender systems. Nevertheless,\ndealing with recommendations in the cold-start setting, e.g., when a user has\ndone limited interactions in the system, is a problem that remains far from\nsolved. Meta-learning techniques, and in particular optimization-based\nmeta-learning, have recently become the most popular approaches in the academic\nresearch literature for tackling the cold-start problem in deep learning models\nfor recommender systems. However, current meta-learning approaches are not\npractical for real-world recommender systems, which have billions of users and\nitems, and strict latency requirements. In this paper we show that it is\npossible to obtaining similar, or higher, performance on commonly used\nbenchmarks for the cold-start problem without using meta-learning techniques.\nIn more detail, we show that, when tuned correctly, standard and widely adopted\ndeep learning models perform just as well as newer meta-learning models. 
We\nfurther show that an extremely simple modular approach using common\nrepresentation learning techniques, can perform comparably to meta-learning\ntechniques specifically designed for the cold-start setting while being much\nmore easily deployable in real-world applications.\n","authors":["Davide Buffelli","Ashish Gupta","Agnieszka Strzalka","Vassilis Plachouras"],"pdf_url":"https://arxiv.org/pdf/2308.08354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08328v1","updated":"2023-08-16T12:42:28Z","published":"2023-08-16T12:42:28Z","title":"Phase Retrieval with Background Information: Decreased References and\n Efficient Methods","summary":" Fourier phase retrieval(PR) is a severely ill-posed inverse problem that\narises in various applications. To guarantee a unique solution and relieve the\ndependence on the initialization, background information can be exploited as a\nstructural priors. However, the requirement for the background information may\nbe challenging when moving to the high-resolution imaging. At the same time,\nthe previously proposed projected gradient descent(PGD) method also demands\nmuch background information.\n In this paper, we present an improved theoretical result about the demand for\nthe background information, along with two Douglas Rachford(DR) based methods.\nAnalytically, we demonstrate that the background required to ensure a unique\nsolution can be decreased by nearly $1/2$ for the 2-D signals compared to the\n1-D signals. By generalizing the results into $d$-dimension, we show that the\nlength of the background information more than $(2^{\\frac{d+1}{d}}-1)$ folds of\nthe signal is sufficient to ensure the uniqueness. At the same time, we also\nanalyze the stability and robustness of the model when measurements and\nbackground information are corrupted by the noise. Furthermore, two methods\ncalled Background Douglas-Rachford (BDR) and Convex Background Douglas-Rachford\n(CBDR) are proposed. BDR which is a kind of non-convex method is proven to have\nthe local R-linear convergence rate under mild assumptions. Instead, CBDR\nmethod uses the techniques of convexification and can be proven to own a global\nconvergence guarantee as long as the background information is sufficient. To\nsupport this, a new property called F-RIP is established. We test the\nperformance of the proposed methods through simulations as well as real\nexperimental measurements, and demonstrate that they achieve a higher recovery\nrate with less background information compared to the PGD method.\n","authors":["Ziyang Yuan","Haoxing Yang","Ningyi Leng","Hongxia Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08285v1","updated":"2023-08-16T11:10:43Z","published":"2023-08-16T11:10:43Z","title":"Pre-training with Large Language Model-based Document Expansion for\n Dense Passage Retrieval","summary":" In this paper, we systematically study the potential of pre-training with\nLarge Language Model(LLM)-based document expansion for dense passage retrieval.\nConcretely, we leverage the capabilities of LLMs for document expansion, i.e.\nquery generation, and effectively transfer expanded knowledge to retrievers\nusing pre-training strategies tailored for passage retrieval. These strategies\ninclude contrastive learning and bottlenecked query generation. Furthermore, we\nincorporate a curriculum learning strategy to reduce the reliance on LLM\ninferences. 
Experimental results demonstrate that pre-training with LLM-based\ndocument expansion significantly boosts the retrieval performance on\nlarge-scale web-search tasks. Our work shows strong zero-shot and out-of-domain\nretrieval abilities, making it more widely applicable for retrieval when\ninitializing with no human-labeled data.\n","authors":["Guangyuan Ma","Xing Wu","Peng Wang","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08285v1.pdf","comment":"10 pages, 3 tables, 4 figures, under review"},{"id":"http://arxiv.org/abs/2301.10405v5","updated":"2023-08-16T10:57:58Z","published":"2023-01-25T04:45:06Z","title":"Editing Language Model-based Knowledge Graph Embeddings","summary":" Recently decades have witnessed the empirical success of framing Knowledge\nGraph (KG) embeddings via language models. However, language model-based KG\nembeddings are usually deployed as static artifacts, making them difficult to\nmodify post-deployment without re-training after deployment. To address this\nissue, we propose a new task of editing language model-based KG embeddings in\nthis paper. This task is designed to facilitate rapid, data-efficient updates\nto KG embeddings without compromising the performance of other aspects. We\nbuild four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and\nevaluate several knowledge editing baselines demonstrating the limited ability\nof previous models to handle the proposed challenging task. We further propose\na simple yet strong baseline dubbed KGEditor, which utilizes additional\nparametric layers of the hyper network to edit/add facts. Our comprehensive\nexperimental results reveal that KGEditor excels in updating specific facts\nwithout impacting the overall performance, even when faced with limited\ntraining resources. Code and datasets are available in\nhttps://github.com/zjunlp/PromptKG/tree/main/deltaKG.\n","authors":["Siyuan Cheng","Ningyu Zhang","Bozhong Tian","Xi Chen","Qingbing Liu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.10405v5.pdf","comment":"Work in progress and the project website is\n https://zjunlp.github.io/project/KGE_Editing/"},{"id":"http://arxiv.org/abs/2308.07426v2","updated":"2023-08-16T10:10:45Z","published":"2023-08-14T19:36:57Z","title":"A Survey on Point-of-Interest Recommendations Leveraging Heterogeneous\n Data","summary":" Tourism is an important application domain for recommender systems. In this\ndomain, recommender systems are for example tasked with providing personalized\nrecommendations for transportation, accommodation, points-of-interest (POIs),\nor tourism services. Among these tasks, in particular the problem of\nrecommending POIs that are of likely interest to individual tourists has gained\ngrowing attention in recent years. Providing POI recommendations to tourists\n\\emph{during their trip} can however be especially challenging due to the\nvariability of the users' context. With the rapid development of the Web and\ntoday's multitude of online services, vast amounts of data from various sources\nhave become available, and these heterogeneous data sources represent a huge\npotential to better address the challenges of in-trip POI recommendation\nproblems. In this work, we provide a comprehensive survey of published research\non POI recommendation between 2017 and 2022 from the perspective of\nheterogeneous data sources. Specifically, we investigate which types of data\nare used in the literature and which technical approaches and evaluation\nmethods are predominant. 
Among other aspects, we find that today's research\nworks often focus on a narrow range of data sources, leaving great potential\nfor future works that better utilize heterogeneous data sources and diverse\ndata types for improved in-trip recommendations.\n","authors":["Zehui Wang","Wolfram Höpken","Dietmar Jannach"],"pdf_url":"https://arxiv.org/pdf/2308.07426v2.pdf","comment":"35 pages, 19 figures"},{"id":"http://arxiv.org/abs/2308.07711v2","updated":"2023-08-16T05:58:16Z","published":"2023-08-15T11:45:34Z","title":"SPM: Structured Pretraining and Matching Architectures for Relevance\n Modeling in Meituan Search","summary":" In e-commerce search, relevance between query and documents is an essential\nrequirement for satisfying user experience. Different from traditional\ne-commerce platforms that offer products, users search on life service\nplatforms such as Meituan mainly for product providers, which usually have\nabundant structured information, e.g. name, address, category, thousands of\nproducts. Modeling search relevance with these rich structured contents is\nchallenging due to the following issues: (1) there is language distribution\ndiscrepancy among different fields of structured document, making it difficult\nto directly adopt off-the-shelf pretrained language model based methods like\nBERT. (2) different fields usually have different importance and their length\nvary greatly, making it difficult to extract document information helpful for\nrelevance matching.\n To tackle these issues, in this paper we propose a novel two-stage\npretraining and matching architecture for relevance matching with rich\nstructured documents. At pretraining stage, we propose an effective pretraining\nmethod that employs both query and multiple fields of document as inputs,\nincluding an effective information compression method for lengthy fields. At\nrelevance matching stage, a novel matching method is proposed by leveraging\ndomain knowledge in search query to generate more effective document\nrepresentations for relevance scoring. Extensive offline experiments and online\nA/B tests on millions of users verify that the proposed architectures\neffectively improve the performance of relevance modeling. The model has\nalready been deployed online, serving the search traffic of Meituan for over a\nyear.\n","authors":["Wen Zan","Yaopeng Han","Xiaotian Jiang","Yao Xiao","Yang Yang","Dayao Chen","Sheng Chen"],"pdf_url":"https://arxiv.org/pdf/2308.07711v2.pdf","comment":"Accepted by CIKM '23"},{"id":"http://arxiv.org/abs/2308.05379v3","updated":"2023-08-16T03:59:23Z","published":"2023-08-10T06:52:53Z","title":"Beyond Semantics: Learning a Behavior Augmented Relevance Model with\n Self-supervised Learning","summary":" Relevance modeling aims to locate desirable items for corresponding queries,\nwhich is crucial for search engines to ensure user experience. Although most\nconventional approaches address this problem by assessing the semantic\nsimilarity between the query and item, pure semantic matching is not\neverything.\n","authors":["Zeyuan Chen","Wei Chen","Jia Xu","Zhongyi Liu","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.05379v3.pdf","comment":"Partial content"},{"id":"http://arxiv.org/abs/2308.08120v1","updated":"2023-08-16T03:21:23Z","published":"2023-08-16T03:21:23Z","title":"Uncovering User Interest from Biased and Noised Watch Time in Video\n Recommendation","summary":" In the video recommendation, watch time is commonly adopted as an indicator\nof user interest. 
However, watch time is not only influenced by the matching of\nusers' interests but also by other factors, such as duration bias and noisy\nwatching. Duration bias refers to the tendency for users to spend more time on\nvideos with longer durations, regardless of their actual interest level. Noisy\nwatching, on the other hand, describes users taking time to determine whether\nthey like a video or not, which can result in users spending time watching\nvideos they do not like. Consequently, the existence of duration bias and noisy\nwatching make watch time an inadequate label for indicating user interest.\nFurthermore, current methods primarily address duration bias and ignore the\nimpact of noisy watching, which may limit their effectiveness in uncovering\nuser interest from watch time. In this study, we first analyze the generation\nmechanism of users' watch time from a unified causal viewpoint. Specifically,\nwe considered the watch time as a mixture of the user's actual interest level,\nthe duration-biased watch time, and the noisy watch time. To mitigate both the\nduration bias and noisy watching, we propose Debiased and Denoised watch time\nCorrection (D$^2$Co), which can be divided into two steps: First, we employ a\nduration-wise Gaussian Mixture Model plus frequency-weighted moving average for\nestimating the bias and noise terms; then we utilize a sensitivity-controlled\ncorrection function to separate the user interest from the watch time, which is\nrobust to the estimation error of bias and noise terms. The experiments on two\npublic video recommendation datasets and online A/B testing indicate the\neffectiveness of the proposed method.\n","authors":["Haiyuan Zhao","Lei Zhang","Jun Xu","Guohao Cai","Zhenhua Dong","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.08120v1.pdf","comment":"Accepted by Recsys'23"},{"id":"http://arxiv.org/abs/2308.08088v1","updated":"2023-08-16T01:38:49Z","published":"2023-08-16T01:38:49Z","title":"Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme\n Detection","summary":" Hateful meme detection is a challenging multimodal task that requires\ncomprehension of both vision and language, as well as cross-modal interactions.\nRecent studies have tried to fine-tune pre-trained vision-language models\n(PVLMs) for this task. However, with increasing model sizes, it becomes\nimportant to leverage powerful PVLMs more efficiently, rather than simply\nfine-tuning them. Recently, researchers have attempted to convert meme images\ninto textual captions and prompt language models for predictions. This approach\nhas shown good performance but suffers from non-informative image captions.\nConsidering the two factors mentioned above, we propose a probing-based\ncaptioning approach to leverage PVLMs in a zero-shot visual question answering\n(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful\ncontent-related questions and use the answers as image captions (which we call\nPro-Cap), so that the captions contain information critical for hateful content\ndetection. 
The good performance of models with Pro-Cap on three benchmarks\nvalidates the effectiveness and generalization of the proposed method.\n","authors":["Rui Cao","Ming Shan Hee","Adriel Kuek","Wen-Haw Chong","Roy Ka-Wei Lee","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08088v1.pdf","comment":"Camera-ready for 23, ACM MM"},{"id":"http://arxiv.org/abs/2308.08620v1","updated":"2023-08-16T18:24:54Z","published":"2023-08-16T18:24:54Z","title":"Group Identification via Transitional Hypergraph Convolution with\n Cross-view Self-supervised Learning","summary":" With the proliferation of social media, a growing number of users search for\nand join group activities in their daily life. This develops a need for the\nstudy on the group identification (GI) task, i.e., recommending groups to\nusers. The major challenge in this task is how to predict users' preferences\nfor groups based on not only previous group participation of users but also\nusers' interests in items. Although recent developments in Graph Neural\nNetworks (GNNs) accomplish embedding multiple types of objects in graph-based\nrecommender systems, they, however, fail to address this GI problem\ncomprehensively. In this paper, we propose a novel framework named Group\nIdentification via Transitional Hypergraph Convolution with Graph\nSelf-supervised Learning (GTGS). We devise a novel transitional hypergraph\nconvolution layer to leverage users' preferences for items as prior knowledge\nwhen seeking their group preferences. To construct comprehensive user/group\nrepresentations for GI task, we design the cross-view self-supervised learning\nto encourage the intrinsic consistency between item and group preferences for\neach user, and the group-based regularization to enhance the distinction among\ngroup embeddings. Experimental results on three benchmark datasets verify the\nsuperiority of GTGS. Additional detailed investigations are conducted to\ndemonstrate the effectiveness of the proposed framework.\n","authors":["Mingdai Yang","Zhiwei Liu","Liangwei Yang","Xiaolong Liu","Chen Wang","Hao Peng","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08620v1.pdf","comment":"11 pages. Accepted by CIKM'23"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.08538v1","updated":"2023-08-16T17:53:40Z","published":"2023-08-16T17:53:40Z","title":"Proprioceptive Learning with Soft Polyhedral Networks","summary":" Proprioception is the \"sixth sense\" that detects limb postures with motor\nneurons. It requires a natural integration between the musculoskeletal systems\nand sensory receptors, which is challenging among modern robots that aim for\nlightweight, adaptive, and sensitive designs at a low cost. Here, we present\nthe Soft Polyhedral Network with an embedded vision for physical interactions,\ncapable of adaptive kinesthesia and viscoelastic proprioception by learning\nkinetic features. This design enables passive adaptations to omni-directional\ninteractions, visually captured by a miniature high-speed motion tracking\nsystem embedded inside for proprioceptive learning. The results show that the\nsoft network can infer real-time 6D forces and torques with accuracies of\n0.25/0.24/0.35 N and 0.025/0.034/0.006 Nm in dynamic interactions. We also\nincorporate viscoelasticity in proprioception during static adaptation by\nadding a creep and relaxation modifier to refine the predicted results. 
The\nproposed soft network combines simplicity in design, omni-adaptation, and\nproprioceptive sensing with high accuracy, making it a versatile solution for\nrobotics at a low cost with more than 1 million use cycles for tasks such as\nsensitive and competitive grasping, and touch-based geometry reconstruction.\nThis study offers new insights into vision-based proprioception for soft robots\nin adaptive grasping, soft manipulation, and human-robot interaction.\n","authors":["Xiaobo Liu","Xudong Han","Wei Hong","Fang Wan","Chaoyang Song"],"pdf_url":"https://arxiv.org/pdf/2308.08538v1.pdf","comment":"20 pages, 10 figures, 2 tables, submitted to the International\n Journal of Robotics Research for review"},{"id":"http://arxiv.org/abs/2308.08536v1","updated":"2023-08-16T17:52:11Z","published":"2023-08-16T17:52:11Z","title":"Can Transformers Learn Optimal Filtering for Unknown Systems?","summary":" Transformers have demonstrated remarkable success in natural language\nprocessing; however, their potential remains mostly unexplored for problems\narising in dynamical systems. In this work, we investigate the optimal output\nestimation problem using transformers, which generate output predictions using\nall the past ones. We train the transformer using various systems drawn from a\nprior distribution and then evaluate its performance on previously unseen\nsystems from the same distribution. As a result, the obtained transformer acts\nlike a prediction algorithm that learns in-context and quickly adapts to and\npredicts well for different systems - thus we call it meta-output-predictor\n(MOP). MOP matches the performance of the optimal output estimator, based on\nKalman filter, for most linear dynamical systems even though it does not have\naccess to a model. We observe via extensive numerical experiments that MOP also\nperforms well in challenging scenarios with non-i.i.d. noise, time-varying\ndynamics, and nonlinear dynamics like a quadrotor system with unknown\nparameters. To further support this observation, in the second part of the\npaper, we provide statistical guarantees on the performance of MOP and quantify\nthe required amount of training to achieve a desired excess risk during\ntest-time. Finally, we point out some limitations of MOP by identifying two\nclasses of problems MOP fails to perform well, highlighting the need for\ncaution when using transformers for control and estimation.\n","authors":["Haldun Balim","Zhe Du","Samet Oymak","Necmiye Ozay"],"pdf_url":"https://arxiv.org/pdf/2308.08536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00563v3","updated":"2023-08-16T17:45:13Z","published":"2022-10-02T16:25:47Z","title":"AI-Assisted Discovery of Quantitative and Formal Models in Social\n Science","summary":" In social science, formal and quantitative models, such as ones describing\neconomic growth and collective action, are used to formulate mechanistic\nexplanations, provide predictions, and uncover questions about observed\nphenomena. Here, we demonstrate the use of a machine learning system to aid the\ndiscovery of symbolic models that capture nonlinear and dynamical relationships\nin social science datasets. By extending neuro-symbolic methods to find compact\nfunctions and differential equations in noisy and longitudinal data, we show\nthat our system can be used to discover interpretable models from real-world\ndata in economics and sociology. 
Augmenting existing workflows with symbolic\nregression can help uncover novel relationships and explore counterfactual\nmodels during the scientific process. We propose that this AI-assisted\nframework can bridge parametric and non-parametric models commonly employed in\nsocial science research by systematically exploring the space of nonlinear\nmodels and enabling fine-grained control over expressivity and\ninterpretability.\n","authors":["Julia Balla","Sihao Huang","Owen Dugan","Rumen Dangovski","Marin Soljacic"],"pdf_url":"https://arxiv.org/pdf/2210.00563v3.pdf","comment":"19 pages, 4 figures"},{"id":"http://arxiv.org/abs/2210.03921v2","updated":"2023-08-16T17:42:22Z","published":"2022-10-08T05:16:49Z","title":"Data Selection: A Surprisingly Effective and General Principle for\n Building Small Interpretable Models","summary":" We present convincing empirical evidence for an effective and general\nstrategy for building accurate small models. Such models are attractive for\ninterpretability and also find use in resource-constrained environments. The\nstrategy is to learn the training distribution instead of using data from the\ntest distribution. The distribution learning algorithm is not a contribution of\nthis work; we highlight the broad usefulness of this simple strategy on a\ndiverse set of tasks, and as such these rigorous empirical results are our\ncontribution. We apply it to the tasks of (1) building cluster explanation\ntrees, (2) prototype-based classification, and (3) classification using Random\nForests, and show that it improves the accuracy of weak traditional baselines\nto the point that they are surprisingly competitive with specialized modern\ntechniques.\n This strategy is also versatile wrt the notion of model size. In the first\ntwo tasks, model size is identified by number of leaves in the tree and the\nnumber of prototypes respectively. In the final task involving Random Forests\nthe strategy is shown to be effective even when model size is determined by\nmore than one factor: number of trees and their maximum depth.\n Positive results using multiple datasets are presented that are shown to be\nstatistically significant. These lead us to conclude that this strategy is both\neffective, i.e, leads to significant improvements, and general, i.e., is\napplicable to different tasks and model families, and therefore merits further\nattention in domains that require small accurate models.\n","authors":["Abhishek Ghose"],"pdf_url":"https://arxiv.org/pdf/2210.03921v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13565v2","updated":"2023-08-16T17:26:28Z","published":"2023-07-25T15:17:31Z","title":"Decision-Focused Learning: Foundations, State of the Art, Benchmark and\n Future Opportunities","summary":" Decision-focused learning (DFL) is an emerging paradigm in machine learning\nwhich trains a model to optimize decisions, integrating prediction and\noptimization in an end-to-end system. This paradigm holds the promise to\nrevolutionize decision-making in many real-world applications which operate\nunder uncertainty, where the estimation of unknown parameters within these\ndecision models often becomes a substantial roadblock. This paper presents a\ncomprehensive review of DFL. 
It provides an in-depth analysis of the various\ntechniques devised to integrate machine learning and optimization models,\nintroduces a taxonomy of DFL methods distinguished by their unique\ncharacteristics, and conducts an extensive empirical evaluation of these\nmethods proposing suitable benchmark dataset and tasks for DFL. Finally, the\nstudy provides valuable insights into current and potential future avenues in\nDFL research.\n","authors":["Jayanta Mandi","James Kotary","Senne Berden","Maxime Mulamba","Victor Bucarey","Tias Guns","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2307.13565v2.pdf","comment":"Experimental Survey and Benchmarking"},{"id":"http://arxiv.org/abs/2308.08520v1","updated":"2023-08-16T17:18:30Z","published":"2023-08-16T17:18:30Z","title":"Painter: Teaching Auto-regressive Language Models to Draw Sketches","summary":" Large language models (LLMs) have made tremendous progress in natural\nlanguage understanding and they have also been successfully adopted in other\ndomains such as computer vision, robotics, reinforcement learning, etc. In this\nwork, we apply LLMs to image generation tasks by directly generating the\nvirtual brush strokes to paint an image. We present Painter, an LLM that can\nconvert user prompts in text description format to sketches by generating the\ncorresponding brush strokes in an auto-regressive way. We construct Painter\nbased on off-the-shelf LLM that is pre-trained on a large text corpus, by\nfine-tuning it on the new task while preserving language understanding\ncapabilities. We create a dataset of diverse multi-object sketches paired with\ntextual prompts that covers several object types and tasks. Painter can\ngenerate sketches from text descriptions, remove objects from canvas, and\ndetect and classify objects in sketches. Although this is an unprecedented\npioneering work in using LLMs for auto-regressive image generation, the results\nare very encouraging.\n","authors":["Reza Pourreza","Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2308.08520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08511v1","updated":"2023-08-16T17:07:40Z","published":"2023-08-16T17:07:40Z","title":"Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse\n Problems","summary":" Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial\ntechnologies in the field of medical imaging. Score-based models have proven to\nbe effective in addressing different inverse problems encountered in CT and\nMRI, such as sparse-view CT and fast MRI reconstruction. However, these models\nface challenges in achieving accurate three dimensional (3D) volumetric\nreconstruction. The existing score-based models primarily focus on\nreconstructing two dimensional (2D) data distribution, leading to\ninconsistencies between adjacent slices in the reconstructed 3D volumetric\nimages. To overcome this limitation, we propose a novel two-and-a-half order\nscore-based model (TOSM). During the training phase, our TOSM learns data\ndistributions in 2D space, which reduces the complexity of training compared to\ndirectly working on 3D volumes. However, in the reconstruction phase, the TOSM\nupdates the data distribution in 3D space, utilizing complementary scores along\nthree directions (sagittal, coronal, and transaxial) to achieve a more precise\nreconstruction. The development of TOSM is built on robust theoretical\nprinciples, ensuring its reliability and efficacy. 
Through extensive\nexperimentation on large-scale sparse-view CT and fast MRI datasets, our method\ndemonstrates remarkable advancements and attains state-of-the-art results in\nsolving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively\naddresses the inter-slice inconsistency issue, resulting in high-quality 3D\nvolumetric reconstruction.\n","authors":["Zirong Li","Yanyang Wang","Jianjia Zhang","Weiwen Wu","Hengyong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08510v1","updated":"2023-08-16T17:07:37Z","published":"2023-08-16T17:07:37Z","title":"Autoencoding a Soft Touch to Learn Grasping from On-land to Underwater","summary":" Robots play a critical role as the physical agent of human operators in\nexploring the ocean. However, it remains challenging to grasp objects reliably\nwhile fully submerging under a highly pressurized aquatic environment with\nlittle visible light, mainly due to the fluidic interference on the tactile\nmechanics between the finger and object surfaces. This study investigates the\ntransferability of grasping knowledge from on-land to underwater via a\nvision-based soft robotic finger that learns 6D forces and torques (FT) using a\nSupervised Variational Autoencoder (SVAE). A high-framerate camera captures the\nwhole-body deformations while a soft robotic finger interacts with physical\nobjects on-land and underwater. Results show that the trained SVAE model\nlearned a series of latent representations of the soft mechanics transferrable\nfrom land to water, presenting a superior adaptation to the changing\nenvironments against commercial FT sensors. Soft, delicate, and reactive\ngrasping enabled by tactile intelligence enhances the gripper's underwater\ninteraction with improved reliability and robustness at a much-reduced cost,\npaving the path for learning-based intelligent grasping to support fundamental\nscientific discoveries in environmental and ocean research.\n","authors":["Ning Guo","Xudong Han","Xiaobo Liu","Shuqiao Zhong","Zhiyuan Zhou","Jian Lin","Jiansheng Dai","Fang Wan","Chaoyang Song"],"pdf_url":"https://arxiv.org/pdf/2308.08510v1.pdf","comment":"17 pages, 5 figures, 1 table, submitted to Advanced Intelligent\n Systems for review"},{"id":"http://arxiv.org/abs/2208.11061v2","updated":"2023-08-16T17:02:51Z","published":"2022-08-23T16:07:28Z","title":"Large-Scale Traffic Congestion Prediction based on Multimodal Fusion and\n Representation Mapping","summary":" With the progress of the urbanisation process, the urban transportation\nsystem is extremely critical to the development of cities and the quality of\nlife of the citizens. Among them, it is one of the most important tasks to\njudge traffic congestion by analysing the congestion factors. Recently, various\ntraditional and machine-learning-based models have been introduced for\npredicting traffic congestion. However, these models are either poorly\naggregated for massive congestion factors or fail to make accurate predictions\nfor every precise location in large-scale space. To alleviate these problems, a\nnovel end-to-end framework based on convolutional neural networks is proposed\nin this paper. With learning representations, the framework proposes a novel\nmultimodal fusion module and a novel representation mapping module to achieve\ntraffic congestion predictions on arbitrary query locations on a large-scale\nmap, combined with various global reference information. 
The proposed framework\nachieves significant results and efficient inference on real-world large-scale\ndatasets.\n","authors":["Bodong Zhou","Jiahui Liu","Songyi Cui","Yaping Zhao"],"pdf_url":"https://arxiv.org/pdf/2208.11061v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08504v1","updated":"2023-08-16T16:58:25Z","published":"2023-08-16T16:58:25Z","title":"ResBuilder: Automated Learning of Depth with Residual Structures","summary":" In this work, we develop a neural architecture search algorithm, termed\nResbuilder, that develops ResNet architectures from scratch that achieve high\naccuracy at moderate computational cost. It can also be used to modify existing\narchitectures and has the capability to remove and insert ResNet blocks, in\nthis way searching for suitable architectures in the space of ResNet\narchitectures. In our experiments on different image classification datasets,\nResbuilder achieves close to state-of-the-art performance while saving\ncomputational cost compared to off-the-shelf ResNets. Noteworthy, we once tune\nthe parameters on CIFAR10 which yields a suitable default choice for all other\ndatasets. We demonstrate that this property generalizes even to industrial\napplications by applying our method with default parameters on a proprietary\nfraud detection dataset.\n","authors":["Julian Burghoff","Matthias Rottmann","Jill von Conta","Sebastian Schoenen","Andreas Witte","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2308.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08493v1","updated":"2023-08-16T16:48:57Z","published":"2023-08-16T16:48:57Z","title":"Time Travel in LLMs: Tracing Data Contamination in Large Language Models","summary":" Data contamination, i.e., the presence of test data from downstream tasks in\nthe training data of large language models (LLMs), is a potential major issue\nin understanding LLMs' effectiveness on other tasks. We propose a\nstraightforward yet effective method for identifying data contamination within\nLLMs. At its core, our approach starts by identifying potential contamination\nin individual instances that are drawn from a small random sample; using this\ninformation, our approach then assesses if an entire dataset partition is\ncontaminated. To estimate contamination of individual instances, we employ\n\"guided instruction:\" a prompt consisting of the dataset name, partition type,\nand the initial segment of a reference instance, asking the LLM to complete it.\nAn instance is flagged as contaminated if the LLM's output either exactly or\nclosely matches the latter segment of the reference. To understand if an entire\npartition is contaminated, we propose two ideas. The first idea marks a dataset\npartition as contaminated if the average overlap score with the reference\ninstances (as measured by ROUGE or BLEURT) is statistically significantly\nbetter with the guided instruction vs. a general instruction that does not\ninclude the dataset and partition name. The second idea marks a dataset as\ncontaminated if a classifier based on GPT-4 with in-context learning prompting\nmarks multiple instances as contaminated. Our best method achieves an accuracy\nbetween 92% and 100% in detecting if an LLM is contaminated with seven\ndatasets, containing train and test/validation partitions, when contrasted with\nmanual evaluation by human expert. 
Further, our findings indicate that GPT-4 is\ncontaminated with AG News, WNLI, and XSum datasets.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2308.08493v1.pdf","comment":"v1 preprint"},{"id":"http://arxiv.org/abs/2308.08480v1","updated":"2023-08-16T16:38:03Z","published":"2023-08-16T16:38:03Z","title":"Label Propagation Techniques for Artifact Detection in Imbalanced\n Classes using Photoplethysmogram Signals","summary":" Photoplethysmogram (PPG) signals are widely used in healthcare for monitoring\nvital signs, but they are susceptible to motion artifacts that can lead to\ninaccurate interpretations. In this study, the use of label propagation\ntechniques to propagate labels among PPG samples is explored, particularly in\nimbalanced class scenarios where clean PPG samples are significantly\noutnumbered by artifact-contaminated samples. With a precision of 91%, a recall\nof 90% and an F1 score of 90% for the class without artifacts, the results\ndemonstrate its effectiveness in labeling a medical dataset, even when clean\nsamples are rare. For the classification of artifacts our study compares\nsupervised classifiers such as conventional classifiers and neural networks\n(MLP, Transformers, FCN) with the semi-supervised label propagation algorithm.\nWith a precision of 89%, a recall of 95% and an F1 score of 92%, the KNN\nsupervised model gives good results, but the semi-supervised algorithm performs\nbetter in detecting artifacts. The findings suggest that the semi-supervised\nalgorithm label propagation hold promise for artifact detection in PPG signals,\nwhich can enhance the reliability of PPG-based health monitoring systems in\nreal-world applications.\n","authors":["Clara Macabiau","Thanh-Dung Le","Kevin Albert","Philippe Jouvet","Rita Noumeir"],"pdf_url":"https://arxiv.org/pdf/2308.08480v1.pdf","comment":"Under preparation to submit to IEEE for possible publications"},{"id":"http://arxiv.org/abs/2308.08469v1","updated":"2023-08-16T16:19:50Z","published":"2023-08-16T16:19:50Z","title":"LLM4TS: Two-Stage Fine-Tuning for Time-Series Forecasting with\n Pre-Trained LLMs","summary":" In this work, we leverage pre-trained Large Language Models (LLMs) to enhance\ntime-series forecasting. Mirroring the growing interest in unifying models for\nNatural Language Processing and Computer Vision, we envision creating an\nanalogous model for long-term time-series forecasting. Due to limited\nlarge-scale time-series data for building robust foundation models, our\napproach LLM4TS focuses on leveraging the strengths of pre-trained LLMs. By\ncombining time-series patching with temporal encoding, we have enhanced the\ncapability of LLMs to handle time-series data effectively. Inspired by the\nsupervised fine-tuning in chatbot domains, we prioritize a two-stage\nfine-tuning process: first conducting supervised fine-tuning to orient the LLM\ntowards time-series data, followed by task-specific downstream fine-tuning.\nFurthermore, to unlock the flexibility of pre-trained LLMs without extensive\nparameter adjustments, we adopt several Parameter-Efficient Fine-Tuning (PEFT)\ntechniques. Drawing on these innovations, LLM4TS has yielded state-of-the-art\nresults in long-term forecasting. 
Our model has also shown exceptional\ncapabilities as both a robust representation learner and an effective few-shot\nlearner, thanks to the knowledge transferred from the pre-trained LLM.\n","authors":["Ching Chang","Wen-Chih Peng","Tien-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08468v1","updated":"2023-08-16T16:19:25Z","published":"2023-08-16T16:19:25Z","title":"An Expert's Guide to Training Physics-informed Neural Networks","summary":" Physics-informed neural networks (PINNs) have been popularized as a deep\nlearning framework that can seamlessly synthesize observational data and\npartial differential equation (PDE) constraints. Their practical effectiveness\nhowever can be hampered by training pathologies, but also oftentimes by poor\nchoices made by users who lack deep learning expertise. In this paper we\npresent a series of best practices that can significantly improve the training\nefficiency and overall accuracy of PINNs. We also put forth a series of\nchallenging benchmark problems that highlight some of the most prominent\ndifficulties in training PINNs, and present comprehensive and fully\nreproducible ablation studies that demonstrate how different architecture\nchoices and training strategies affect the test accuracy of the resulting\nmodels. We show that the methods and guiding principles put forth in this study\nlead to state-of-the-art results and provide strong baselines that future\nstudies should use for comparison purposes. To this end, we also release a\nhighly optimized library in JAX that can be used to reproduce all results\nreported in this paper, enable future research studies, as well as facilitate\neasy adaptation to new use-case scenarios.\n","authors":["Sifan Wang","Shyam Sankaran","Hanwen Wang","Paris Perdikaris"],"pdf_url":"https://arxiv.org/pdf/2308.08468v1.pdf","comment":"36 pages, 25 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.06422v2","updated":"2023-08-16T16:18:28Z","published":"2023-08-12T00:16:51Z","title":"Sensitivity-Aware Mixed-Precision Quantization and Width Optimization of\n Deep Neural Networks Through Cluster-Based Tree-Structured Parzen Estimation","summary":" As the complexity and computational demands of deep learning models rise, the\nneed for effective optimization methods for neural network designs becomes\nparamount. This work introduces an innovative search mechanism for\nautomatically selecting the best bit-width and layer-width for individual\nneural network layers. This leads to a marked enhancement in deep neural\nnetwork efficiency. The search domain is strategically reduced by leveraging\nHessian-based pruning, ensuring the removal of non-crucial parameters.\nSubsequently, we detail the development of surrogate models for favorable and\nunfavorable outcomes by employing a cluster-based tree-structured Parzen\nestimator. This strategy allows for a streamlined exploration of architectural\npossibilities and swift pinpointing of top-performing designs. Through rigorous\ntesting on well-known datasets, our method proves its distinct advantage over\nexisting methods. Compared to leading compression strategies, our approach\nrecords an impressive 20% decrease in model size without compromising accuracy.\nAdditionally, our method boasts a 12x reduction in search time relative to the\nbest search-focused strategies currently available. 
As a result, our proposed\nmethod represents a leap forward in neural network design optimization, paving\nthe way for quick model design and implementation in settings with limited\nresources, thereby propelling the potential of scalable deep learning\nsolutions.\n","authors":["Seyedarmin Azizi","Mahdi Nazemi","Arash Fayyazi","Massoud Pedram"],"pdf_url":"https://arxiv.org/pdf/2308.06422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08467v1","updated":"2023-08-16T16:15:47Z","published":"2023-08-16T16:15:47Z","title":"On Neural Quantum Support Vector Machines","summary":" In \\cite{simon2023algorithms} we introduced four algorithms for the training\nof neural support vector machines (NSVMs) and demonstrated their feasibility.\nIn this note we introduce neural quantum support vector machines, that is,\nNSVMs with a quantum kernel, and extend our results to this setting.\n","authors":["Lars Simon","Manuel Radons"],"pdf_url":"https://arxiv.org/pdf/2308.08467v1.pdf","comment":"13 pages, 0 figures. arXiv admin note: substantial text overlap with\n arXiv:2308.07204"},{"id":"http://arxiv.org/abs/2301.11118v3","updated":"2023-08-16T16:11:30Z","published":"2023-01-26T14:13:37Z","title":"Box$^2$EL: Concept and Role Box Embeddings for the Description Logic\n EL++","summary":" Description logic (DL) ontologies extend knowledge graphs (KGs) with\nconceptual information and logical background knowledge. In recent years, there\nhas been growing interest in inductive reasoning techniques for such\nontologies, which promise to complement classical deductive reasoning\nalgorithms. Similar to KG completion, several existing approaches learn\nontology embeddings in a latent space, while additionally ensuring that they\nfaithfully capture the logical semantics of the underlying DL. However, they\nsuffer from several shortcomings, mainly due to a limiting role representation.\nWe propose Box$^2$EL, which represents both concepts and roles as boxes (i.e.,\naxis-aligned hyperrectangles) and demonstrate how it overcomes the limitations\nof previous methods. We theoretically prove the soundness of our model and\nconduct an extensive experimental evaluation, achieving state-of-the-art\nresults across a variety of datasets. As part of our evaluation, we introduce a\nnovel benchmark for subsumption prediction involving both atomic and complex\nconcepts.\n","authors":["Mathias Jackermeier","Jiaoyan Chen","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2301.11118v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08465v1","updated":"2023-08-16T16:09:23Z","published":"2023-08-16T16:09:23Z","title":"Hierarchical Uncertainty Estimation for Medical Image Segmentation\n Networks","summary":" Learning a medical image segmentation model is an inherently ambiguous task,\nas uncertainties exist in both images (noise) and manual annotations (human\nerrors and bias) used for model training. To build a trustworthy image\nsegmentation model, it is important to not just evaluate its performance but\nalso estimate the uncertainty of the model prediction. Most state-of-the-art\nimage segmentation networks adopt a hierarchical encoder architecture,\nextracting image features at multiple resolution levels from fine to coarse. 
In\nthis work, we leverage this hierarchical image representation and propose a\nsimple yet effective method for estimating uncertainties at multiple levels.\nThe multi-level uncertainties are modelled via the skip-connection module and\nthen sampled to generate an uncertainty map for the predicted image\nsegmentation. We demonstrate that a deep learning segmentation network such as\nU-net, when implemented with such hierarchical uncertainty estimation module,\ncan achieve a high segmentation performance, while at the same time provide\nmeaningful uncertainty maps that can be used for out-of-distribution detection.\n","authors":["Xinyu Bai","Wenjia Bai"],"pdf_url":"https://arxiv.org/pdf/2308.08465v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2211.11695v2","updated":"2023-08-16T16:05:00Z","published":"2022-11-21T18:14:38Z","title":"Disentangled Representation Learning","summary":" Disentangled Representation Learning (DRL) aims to learn a model capable of\nidentifying and disentangling the underlying factors hidden in the observable\ndata in representation form. The process of separating underlying factors of\nvariation into variables with semantic meaning benefits in learning explainable\nrepresentations of data, which imitates the meaningful understanding process of\nhumans when observing an object or relation. As a general learning strategy,\nDRL has demonstrated its power in improving the model explainability,\ncontrolability, robustness, as well as generalization capacity in a wide range\nof scenarios such as computer vision, natural language processing, data mining\netc. In this article, we comprehensively review DRL from various aspects\nincluding motivations, definitions, methodologies, evaluations, applications\nand model designs. We discuss works on DRL based on two well-recognized\ndefinitions, i.e., Intuitive Definition and Group Theory Definition. We further\ncategorize the methodologies for DRL into four groups, i.e., Traditional\nStatistical Approaches, Variational Auto-encoder Based Approaches, Generative\nAdversarial Networks Based Approaches, Hierarchical Approaches and Other\nApproaches. We also analyze principles to design different DRL models that may\nbenefit different tasks in practical applications. Finally, we point out\nchallenges in DRL as well as potential research directions deserving future\ninvestigations. We believe this work may provide insights for promoting the DRL\nresearch in the community.\n","authors":["Xin Wang","Hong Chen","Si'ao Tang","Zihao Wu","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.11695v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.01752v2","updated":"2023-08-16T15:54:54Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaption\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. 
To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v2.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08438v1","updated":"2023-08-16T15:42:24Z","published":"2023-08-16T15:42:24Z","title":"Accurate synthesis of Dysarthric Speech for ASR data augmentation","summary":" Dysarthria is a motor speech disorder often characterized by reduced speech\nintelligibility through slow, uncoordinated control of speech production\nmuscles. Automatic Speech recognition (ASR) systems can help dysarthric talkers\ncommunicate more effectively. However, robust dysarthria-specific ASR requires\na significant amount of training speech, which is not readily available for\ndysarthric talkers. This paper presents a new dysarthric speech synthesis\nmethod for the purpose of ASR training data augmentation. Differences in\nprosodic and acoustic characteristics of dysarthric spontaneous speech at\nvarying severity levels are important components for dysarthric speech\nmodeling, synthesis, and augmentation. For dysarthric speech synthesis, a\nmodified neural multi-talker TTS is implemented by adding a dysarthria severity\nlevel coefficient and a pause insertion model to synthesize dysarthric speech\nfor varying severity levels. To evaluate the effectiveness for synthesis of\ntraining data for ASR, dysarthria-specific speech recognition was used. Results\nshow that a DNN-HMM model trained on additional synthetic dysarthric speech\nachieves WER improvement of 12.2% compared to the baseline, and that the\naddition of the severity level and pause insertion controls decrease WER by\n6.5%, showing the effectiveness of adding these parameters. Overall results on\nthe TORGO database demonstrate that using dysarthric synthetic speech to\nincrease the amount of dysarthric-patterned speech for training has significant\nimpact on the dysarthric ASR systems. In addition, we have conducted a\nsubjective evaluation to evaluate the dysarthric-ness and similarity of\nsynthesized speech. Our subjective evaluation shows that the perceived\ndysartrhic-ness of synthesized speech is similar to that of true dysarthric\nspeech, especially for higher levels of dysarthria\n","authors":["Mohammad Soleymanpour","Michael T. 
Johnson","Rahim Soleymanpour","Jeffrey Berry"],"pdf_url":"https://arxiv.org/pdf/2308.08438v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2201.11571"},{"id":"http://arxiv.org/abs/2304.06783v2","updated":"2023-08-16T15:38:17Z","published":"2023-04-13T19:10:06Z","title":"A Distributionally Robust Approach to Regret Optimal Control using the\n Wasserstein Distance","summary":" This paper proposes a distributionally robust approach to regret optimal\ncontrol of discrete-time linear dynamical systems with quadratic costs subject\nto a stochastic additive disturbance on the state process. The underlying\nprobability distribution of the disturbance process is unknown, but assumed to\nlie in a given ball of distributions defined in terms of the type-2 Wasserstein\ndistance. In this framework, strictly causal linear disturbance feedback\ncontrollers are designed to minimize the worst-case expected regret. The regret\nincurred by a controller is defined as the difference between the cost it\nincurs in response to a realization of the disturbance process and the cost\nincurred by the optimal noncausal controller which has perfect knowledge of the\ndisturbance process realization at the outset. Building on a well-established\nduality theory for optimal transport problems, we derive a reformulation of the\nminimax regret optimal control problem as a tractable semidefinite program.\nUsing the equivalent dual reformulation, we characterize a worst-case\ndistribution achieving the worst-case expected regret in relation to the\ndistribution at the center of the Wasserstein ball. We compare the minimax\nregret optimal control design method with the distributionally robust optimal\ncontrol approach using an illustrative example and numerical experiments.\n","authors":["Feras Al Taha","Shuhao Yan","Eilyan Bitar"],"pdf_url":"https://arxiv.org/pdf/2304.06783v2.pdf","comment":"8 pages, 3 figures, to appear in the proceedings of the 2023 IEEE\n Conference on Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2304.04137v2","updated":"2023-08-16T15:36:07Z","published":"2023-04-09T02:22:31Z","title":"RD-DPP: Rate-Distortion Theory Meets Determinantal Point Process to\n Diversify Learning Data Samples","summary":" In some practical learning tasks, such as traffic video analysis, the number\nof available training samples is restricted by different factors, such as\nlimited communication bandwidth and computation power. Determinantal Point\nProcess (DPP) is a common method for selecting the most diverse samples to\nenhance learning quality. However, the number of selected samples is restricted\nto the rank of the kernel matrix implied by the dimensionality of data samples.\nSecondly, it is not easily customizable to different learning tasks. In this\npaper, we propose a new way of measuring task-oriented diversity based on the\nRate-Distortion (RD) theory, appropriate for multi-level classification. To\nthis end, we establish a fundamental relationship between DPP and RD theory. We\nobserve that the upper bound of the diversity of data selected by DPP has a\nuniversal trend of $\\textit{phase transition}$, which suggests that DPP is\nbeneficial only at the beginning of sample accumulation. This led to the design\nof a bi-modal method, where RD-DPP is used in the first mode to select initial\ndata samples, then classification inconsistency (as an uncertainty measure) is\nused to select the subsequent samples in the second mode. 
This phase transition\nsolves the limitation to the rank of the similarity matrix. Applying our method\nto six different datasets and five benchmark models suggests that our method\nconsistently outperforms random selection, DPP-based methods, and alternatives\nlike uncertainty-based and coreset methods under all sampling budgets, while\nexhibiting high generalizability to different learning tasks.\n","authors":["Xiwen Chen","Huayu Li","Rahul Amin","Abolfazl Razi"],"pdf_url":"https://arxiv.org/pdf/2304.04137v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08427v1","updated":"2023-08-16T15:17:57Z","published":"2023-08-16T15:17:57Z","title":"Eliciting Risk Aversion with Inverse Reinforcement Learning via\n Interactive Questioning","summary":" This paper proposes a novel framework for identifying an agent's risk\naversion using interactive questioning. Our study is conducted in two\nscenarios: a one-period case and an infinite horizon case. In the one-period\ncase, we assume that the agent's risk aversion is characterized by a cost\nfunction of the state and a distortion risk measure. In the infinite horizon\ncase, we model risk aversion with an additional component, a discount factor.\nAssuming the access to a finite set of candidates containing the agent's true\nrisk aversion, we show that asking the agent to demonstrate her optimal\npolicies in various environment, which may depend on their previous answers, is\nan effective means of identifying the agent's risk aversion. Specifically, we\nprove that the agent's risk aversion can be identified as the number of\nquestions tends to infinity, and the questions are randomly designed. We also\ndevelop an algorithm for designing optimal questions and provide empirical\nevidence that our method learns risk aversion significantly faster than\nrandomly designed questions in simulations. Our framework has important\napplications in robo-advising and provides a new approach for identifying an\nagent's risk preferences.\n","authors":["Ziteng Cheng","Anthony Coache","Sebastian Jaimungal"],"pdf_url":"https://arxiv.org/pdf/2308.08427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09703v3","updated":"2023-08-16T15:16:43Z","published":"2022-11-17T17:38:55Z","title":"EfficientTrain: Exploring Generalized Curriculum Learning for Training\n Visual Backbones","summary":" The superior performance of modern deep networks usually comes with a costly\ntraining procedure. This paper presents a new curriculum learning approach for\nthe efficient training of visual backbones (e.g., vision Transformers). Our\nwork is inspired by the inherent learning dynamics of deep networks: we\nexperimentally show that at an earlier training stage, the model mainly learns\nto recognize some 'easier-to-learn' discriminative patterns within each\nexample, e.g., the lower-frequency components of images and the original\ninformation before data augmentation. Driven by this phenomenon, we propose a\ncurriculum where the model always leverages all the training data at each\nepoch, while the curriculum starts with only exposing the 'easier-to-learn'\npatterns of each example, and introduces gradually more difficult patterns. 
To\nimplement this idea, we 1) introduce a cropping operation in the Fourier\nspectrum of the inputs, which enables the model to learn from only the\nlower-frequency components efficiently, 2) demonstrate that exposing the\nfeatures of original images amounts to adopting weaker data augmentation, and\n3) integrate 1) and 2) and design a curriculum learning schedule with a\ngreedy-search algorithm. The resulting approach, EfficientTrain, is simple,\ngeneral, yet surprisingly effective. As an off-the-shelf method, it reduces the\nwall-time training cost of a wide variety of popular models (e.g., ResNet,\nConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without\nsacrificing accuracy. It is also effective for self-supervised learning (e.g.,\nMAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain.\n","authors":["Yulin Wang","Yang Yue","Rui Lu","Tianjiao Liu","Zhao Zhong","Shiji Song","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2211.09703v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08410v1","updated":"2023-08-16T14:57:12Z","published":"2023-08-16T14:57:12Z","title":"Digital twinning of cardiac electrophysiology models from the surface\n ECG: a geodesic backpropagation approach","summary":" The eikonal equation has become an indispensable tool for modeling cardiac\nelectrical activation accurately and efficiently. In principle, by matching\nclinically recorded and eikonal-based electrocardiograms (ECGs), it is possible\nto build patient-specific models of cardiac electrophysiology in a purely\nnon-invasive manner. Nonetheless, the fitting procedure remains a challenging\ntask. The present study introduces a novel method, Geodesic-BP, to solve the\ninverse eikonal problem. Geodesic-BP is well-suited for GPU-accelerated machine\nlearning frameworks, allowing us to optimize the parameters of the eikonal\nequation to reproduce a given ECG. We show that Geodesic-BP can reconstruct a\nsimulated cardiac activation with high accuracy in a synthetic test case, even\nin the presence of modeling inaccuracies. Furthermore, we apply our algorithm\nto a publicly available dataset of a rabbit model, with very positive results.\nGiven the future shift towards personalized medicine, Geodesic-BP has the\npotential to help in future functionalizations of cardiac models meeting\nclinical time constraints while maintaining the physiological accuracy of\nstate-of-the-art cardiac models.\n","authors":["Thomas Grandits","Jan Verhülsdonk","Gundolf Haase","Alexander Effland","Simone Pezzuto"],"pdf_url":"https://arxiv.org/pdf/2308.08410v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.08407v1","updated":"2023-08-16T14:51:51Z","published":"2023-08-16T14:51:51Z","title":"Explainable AI for clinical risk prediction: a survey of concepts,\n methods, and modalities","summary":" Recent advancements in AI applications to healthcare have shown incredible\npromise in surpassing human performance in diagnosis and disease prognosis.\nWith the increasing complexity of AI models, however, concerns regarding their\nopacity, potential biases, and the need for interpretability. To ensure trust\nand reliability in AI systems, especially in clinical risk prediction models,\nexplainability becomes crucial. Explainability is usually referred to as an AI\nsystem's ability to provide a robust interpretation of its decision-making\nlogic or the decisions themselves to human stakeholders. 
In clinical risk\nprediction, other aspects of explainability like fairness, bias, trust, and\ntransparency also represent important concepts beyond just interpretability. In\nthis review, we address the relationship between these concepts as they are\noften used together or interchangeably. This review also discusses recent\nprogress in developing explainable models for clinical risk prediction,\nhighlighting the importance of quantitative and clinical evaluation and\nvalidation across multiple common modalities in clinical practice. It\nemphasizes the need for external validation and the combination of diverse\ninterpretability methods to enhance trust and fairness. Adopting rigorous\ntesting, such as using synthetic datasets with known generative factors, can\nfurther improve the reliability of explainability methods. Open access and\ncode-sharing resources are essential for transparency and reproducibility,\nenabling the growth and trustworthiness of explainable research. While\nchallenges exist, an end-to-end approach to explainability in clinical risk\nprediction, incorporating stakeholders from clinicians to developers, is\nessential for success.\n","authors":["Munib Mesinovic","Peter Watkinson","Tingting Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.08407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08406v1","updated":"2023-08-16T14:50:51Z","published":"2023-08-16T14:50:51Z","title":"Content-based Recommendation Engine for Video Streaming Platform","summary":" Recommendation engines suggest content, products, or services to the user by\nusing machine learning algorithms. This paper proposes a content-based\nrecommendation engine for providing video suggestions to the user based on their\nprevious interests and choices. We use the TF-IDF text vectorization method to\ndetermine the relevance of words in a document. Then we find the\nsimilarity between items by calculating the cosine similarity between them.\nFinally, the engine recommends videos to the users based on the obtained\nsimilarity score value. In addition, we measure the engine's performance\nby computing the precision, recall, and F1 score of the proposed system.\n","authors":["Puskal Khadka","Prabhav Lamichhane"],"pdf_url":"https://arxiv.org/pdf/2308.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05989v2","updated":"2023-08-16T14:47:10Z","published":"2023-06-09T15:59:27Z","title":"QBSD: Quartile-Based Seasonality Decomposition for Cost-Effective Time\n Series Forecasting","summary":" In the telecom domain, precise forecasting of time series patterns, such as\ncell key performance indicators (KPIs), plays a pivotal role in enhancing\nservice quality and operational efficiency. State-of-the-art forecasting\napproaches prioritize forecasting accuracy at the expense of computational\nperformance, rendering them less suitable for data-intensive applications\nencompassing systems with a multitude of time series variables. To address this\nissue, we introduce QBSD, a live forecasting approach tailored to optimize the\ntrade-off between accuracy and computational complexity. We have evaluated the\nperformance of QBSD against state-of-the-art forecasting approaches on publicly\navailable datasets. We have also extended this investigation to our curated\nnetwork KPI dataset, now publicly accessible, to showcase the effect of dynamic\noperating ranges that vary with time. 
The results demonstrate that the\nproposed method excels in runtime efficiency compared to the leading algorithms\navailable while maintaining competitive forecast accuracy.\n","authors":["Ebenezer RHP Isaac","Bulbul Singh"],"pdf_url":"https://arxiv.org/pdf/2306.05989v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08391v1","updated":"2023-08-16T14:23:24Z","published":"2023-08-16T14:23:24Z","title":"Fast Uncertainty Quantification of Spent Nuclear Fuel with Neural\n Networks","summary":" The accurate calculation and uncertainty quantification of the\ncharacteristics of spent nuclear fuel (SNF) play a crucial role in ensuring the\nsafety, efficiency, and sustainability of nuclear energy production, waste\nmanagement, and nuclear safeguards. State of the art physics-based models,\nwhile reliable, are computationally intensive and time-consuming. This paper\npresents a surrogate modeling approach using neural networks (NN) to predict a\nnumber of SNF characteristics with reduced computational costs compared to\nphysics-based models. An NN is trained using data generated from CASMO5 lattice\ncalculations. The trained NN accurately predicts decay heat and nuclide\nconcentrations of SNF, as a function of key input parameters, such as\nenrichment, burnup, cooling time between cycles, mean boron concentration and\nfuel temperature. The model is validated against physics-based decay heat\nsimulations and measurements of different uranium oxide fuel assemblies from\ntwo different pressurized water reactors. In addition, the NN is used to\nperform sensitivity analysis and uncertainty quantification. The results are in\nvery good alignment to CASMO5, while the computational costs (taking into\naccount the costs of generating training samples) are reduced by a factor of 10\nor more. Our findings demonstrate the feasibility of using NNs as surrogate\nmodels for fast characterization of SNF, providing a promising avenue for\nimproving computational efficiency in assessing nuclear fuel behavior and\nassociated risks.\n","authors":["Arnau Albà","Andreas Adelmann","Lucas Münster","Dimitri Rochman","Romana Boiger"],"pdf_url":"https://arxiv.org/pdf/2308.08391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08387v1","updated":"2023-08-16T14:18:31Z","published":"2023-08-16T14:18:31Z","title":"Continuous Sweep: an improved, binary quantifier","summary":" Quantification is a supervised machine learning task, focused on estimating\nthe class prevalence of a dataset rather than labeling its individual\nobservations. We introduce Continuous Sweep, a new parametric binary quantifier\ninspired by the well-performing Median Sweep. Median Sweep is currently one of\nthe best binary quantifiers, but we have changed this quantifier on three\npoints, namely 1) using parametric class distributions instead of empirical\ndistributions, 2) optimizing decision boundaries instead of applying discrete\ndecision rules, and 3) calculating the mean instead of the median. We derive\nanalytic expressions for the bias and variance of Continuous Sweep under\ngeneral model assumptions. This is one of the first theoretical contributions\nin the field of quantification learning. Moreover, these derivations enable us\nto find the optimal decision boundaries. Finally, our simulation study shows\nthat Continuous Sweep outperforms Median Sweep in a wide range of situations.\n","authors":["Kevin Kloos","Julian D. Karch","Quinten A. 
Meertens","Mark de Rooij"],"pdf_url":"https://arxiv.org/pdf/2308.08387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07077v2","updated":"2023-08-16T14:16:28Z","published":"2023-06-12T12:43:27Z","title":"Latent Dynamical Implicit Diffusion Processes","summary":" Latent dynamical models are commonly used to learn the distribution of a\nlatent dynamical process that represents a sequence of noisy data samples.\nHowever, producing samples from such models with high fidelity is challenging\ndue to the complexity and variability of latent and observation dynamics.\nRecent advances in diffusion-based generative models, such as DDPM and NCSN,\nhave shown promising alternatives to state-of-the-art latent generative models,\nsuch as Neural ODEs, RNNs, and Normalizing flow networks, for generating\nhigh-quality sequential samples from a prior distribution. However, their\napplication in modeling sequential data with latent dynamical models is yet to\nbe explored. Here, we propose a novel latent variable model named latent\ndynamical implicit diffusion processes (LDIDPs), which utilizes implicit\ndiffusion processes to sample from dynamical latent processes and generate\nsequential observation samples accordingly. We tested LDIDPs on synthetic and\nsimulated neural decoding problems. We demonstrate that LDIDPs can accurately\nlearn the dynamics over latent dimensions. Furthermore, the implicit sampling\nmethod allows for the computationally efficient generation of high-quality\nsequential data samples from the latent and observation spaces.\n","authors":["Mohammad R. Rezaei"],"pdf_url":"https://arxiv.org/pdf/2306.07077v2.pdf","comment":"I request a withdrawal because there are no experiments with\n real-world datasets and also the method section requires major changes to\n look mathematically sounds"},{"id":"http://arxiv.org/abs/2308.08381v1","updated":"2023-08-16T14:09:48Z","published":"2023-08-16T14:09:48Z","title":"Precision and Recall Reject Curves for Classification","summary":" For some classification scenarios, it is desirable to use only those\nclassification instances that a trained model associates with a high certainty.\nTo obtain such high-certainty instances, previous work has proposed\naccuracy-reject curves. Reject curves allow to evaluate and compare the\nperformance of different certainty measures over a range of thresholds for\naccepting or rejecting classifications. However, the accuracy may not be the\nmost suited evaluation metric for all applications, and instead precision or\nrecall may be preferable. This is the case, for example, for data with\nimbalanced class distributions. We therefore propose reject curves that\nevaluate precision and recall, the recall-reject curve and the precision-reject\ncurve. Using prototype-based classifiers from learning vector quantization, we\nfirst validate the proposed curves on artificial benchmark data against the\naccuracy reject curve as a baseline. 
We then show on imbalanced benchmarks and\nmedical, real-world data that for these scenarios, the proposed precision- and\nrecall-reject curves yield more accurate insights into classifier performance than\naccuracy reject curves.\n","authors":["Lydia Fischer","Patricia Wollstadt"],"pdf_url":"https://arxiv.org/pdf/2308.08381v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.08379v1","updated":"2023-08-16T14:04:50Z","published":"2023-08-16T14:04:50Z","title":"A distributed neural network architecture for dynamic sensor selection\n with application to bandwidth-constrained body-sensor networks","summary":" We propose a dynamic sensor selection approach for deep neural networks\n(DNNs), which is able to derive an optimal sensor subset selection for each\nspecific input sample instead of a fixed selection for the entire dataset. This\ndynamic selection is jointly learned with the task model in an end-to-end way,\nusing the Gumbel-Softmax trick to allow the discrete decisions to be learned\nthrough standard backpropagation. We then show how we can use this dynamic\nselection to increase the lifetime of a wireless sensor network (WSN) by\nimposing constraints on how often each node is allowed to transmit. We further\nimprove performance by including a dynamic spatial filter that makes the\ntask-DNN more robust against the fact that it now needs to be able to handle a\nmultitude of possible node subsets. Finally, we explain how the selection of\nthe optimal channels can be distributed across the different nodes in a WSN. We\nvalidate this method on a use case in the context of body-sensor networks,\nwhere we use real electroencephalography (EEG) sensor data to emulate an EEG\nsensor network. We analyze the resulting trade-offs between transmission load\nand task accuracy.\n","authors":["Thomas Strypsteen","Alexander Bertrand"],"pdf_url":"https://arxiv.org/pdf/2308.08379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11787v2","updated":"2023-08-16T14:03:03Z","published":"2023-07-20T16:22:36Z","title":"LLM Cognitive Judgements Differ From Human","summary":" Large Language Models (LLMs) have lately been in the spotlight of\nresearchers, businesses, and consumers alike. While the linguistic capabilities\nof such models have been studied extensively, there is growing interest in\ninvestigating them as cognitive subjects. In the present work I examine GPT-3\nand ChatGPT capabilities on a limited-data inductive reasoning task from the\ncognitive science literature. The results suggest that these models' cognitive\njudgements are not human-like.\n","authors":["Sotiris Lamprinidis"],"pdf_url":"https://arxiv.org/pdf/2307.11787v2.pdf","comment":"7 pages, 1 figure. License changed to CC BY-NC-SA"},{"id":"http://arxiv.org/abs/2303.13538v2","updated":"2023-08-16T13:59:00Z","published":"2023-03-15T13:32:11Z","title":"Bluetooth and WiFi Dataset for Real World RF Fingerprinting of\n Commercial Devices","summary":" RF fingerprinting is emerging as a physical layer security scheme to identify\nillegitimate and/or unauthorized emitters sharing the RF spectrum. However, due\nto the lack of publicly accessible real-world datasets, most research focuses\non generating synthetic waveforms with software-defined radios (SDRs) which are\nnot suited for practical deployment settings. On the other hand, the limited\ndatasets that are available focus only on chipsets that generate only one kind\nof waveform. 
Commercial off-the-shelf (COTS) combo chipsets that support two\nwireless standards (for example WiFi and Bluetooth) over a shared dual-band\nantenna such as those found in laptops, adapters, wireless chargers, Raspberry\nPis, among others are becoming ubiquitous in the IoT realm. Hence, to keep up\nwith the modern IoT environment, there is a pressing need for real-world open\ndatasets capturing emissions from these combo chipsets transmitting\nheterogeneous communication protocols. To this end, we capture the first known\nemissions from the COTS IoT chipsets transmitting WiFi and Bluetooth under two\ndifferent time frames. The different time frames are essential to rigorously\nevaluate the generalization capability of the models. To ensure widespread use,\neach capture within the comprehensive 72 GB dataset is long enough (40\nMSamples) to support diverse input tensor lengths and formats. Finally, the\ndataset also comprises emissions at varying signal powers to account for the\nfeeble to high signal strength emissions as encountered in a real-world\nsetting.\n","authors":["Anu Jagannath","Zackary Kane","Jithin Jagannath"],"pdf_url":"https://arxiv.org/pdf/2303.13538v2.pdf","comment":"Revision Under Review"},{"id":"http://arxiv.org/abs/2307.16120v2","updated":"2023-08-16T13:58:20Z","published":"2023-07-30T03:59:47Z","title":"Deep Unrolling Networks with Recurrent Momentum Acceleration for\n Nonlinear Inverse Problems","summary":" Combining the strengths of model-based iterative algorithms and data-driven\ndeep learning solutions, deep unrolling networks (DuNets) have become a popular\ntool to solve inverse imaging problems. While DuNets have been successfully\napplied to many linear inverse problems, nonlinear problems tend to impair the\nperformance of the method. Inspired by momentum acceleration techniques that\nare often used in optimization algorithms, we propose a recurrent momentum\nacceleration (RMA) framework that uses a long short-term memory recurrent\nneural network (LSTM-RNN) to simulate the momentum acceleration process. The\nRMA module leverages the ability of the LSTM-RNN to learn and retain knowledge\nfrom the previous gradients. We apply RMA to two popular DuNets -- the learned\nproximal gradient descent (LPGD) and the learned primal-dual (LPD) methods,\nresulting in LPGD-RMA and LPD-RMA respectively. We provide experimental results\non two nonlinear inverse problems: a nonlinear deconvolution problem, and an\nelectrical impedance tomography problem with limited boundary measurements. In\nthe first experiment we have observed that the improvement due to RMA largely\nincreases with respect to the nonlinearity of the problem. The results of the\nsecond example further demonstrate that the RMA schemes can significantly\nimprove the performance of DuNets in strongly ill-posed problems.\n","authors":["Qingping Zhou","Jiayu Qian","Junqi Tang","Jinglai Li"],"pdf_url":"https://arxiv.org/pdf/2307.16120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04043v2","updated":"2023-08-16T13:51:23Z","published":"2023-05-06T13:13:18Z","title":"Echoes: Unsupervised Debiasing via Pseudo-bias Labeling in an Echo\n Chamber","summary":" Neural networks often learn spurious correlations when exposed to biased\ntraining data, leading to poor performance on out-of-distribution data. A\nbiased dataset can be divided, according to biased features, into bias-aligned\nsamples (i.e., with biased features) and bias-conflicting samples (i.e.,\nwithout biased features). 
Recent debiasing works typically assume that no bias\nlabel is available during the training phase, as obtaining such information is\nchallenging and labor-intensive. Following this unsupervised assumption,\nexisting methods usually train two models: a biased model specialized to learn\nbiased features and a target model that uses information from the biased model\nfor debiasing. This paper first presents experimental analyses revealing that\nthe existing biased models overfit to bias-conflicting samples in the training\ndata, which negatively impacts the debiasing performance of the target models.\nTo address this issue, we propose a straightforward and effective method called\nEchoes, which trains a biased model and a target model with a different\nstrategy. We construct an \"echo chamber\" environment by reducing the weights of\nsamples which are misclassified by the biased model, to ensure the biased model\nfully learns the biased features without overfitting to the bias-conflicting\nsamples. The biased model then assigns lower weights on the bias-conflicting\nsamples. Subsequently, we use the inverse of the sample weights of the biased\nmodel for training the target model. Experiments show that our approach\nachieves superior debiasing results compared to the existing baselines on both\nsynthetic and real-world datasets. Our code is available at\nhttps://github.com/isruihu/Echoes.\n","authors":["Rui Hu","Yahan Tu","Jitao Sang"],"pdf_url":"https://arxiv.org/pdf/2305.04043v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2308.08371v1","updated":"2023-08-16T13:50:23Z","published":"2023-08-16T13:50:23Z","title":"PDPK: A Framework to Synthesise Process Data and Corresponding\n Procedural Knowledge for Manufacturing","summary":" Procedural knowledge describes how to accomplish tasks and mitigate problems.\nSuch knowledge is commonly held by domain experts, e.g. operators in\nmanufacturing who adjust parameters to achieve quality targets. To the best of\nour knowledge, no real-world datasets containing process data and corresponding\nprocedural knowledge are publicly available, possibly due to corporate\napprehensions regarding the loss of knowledge advances. Therefore, we provide a\nframework to generate synthetic datasets that can be adapted to different\ndomains. The design choices are inspired by two real-world datasets of\nprocedural knowledge we have access to. Apart from containing representations\nof procedural knowledge in Resource Description Framework (RDF)-compliant\nknowledge graphs, the framework simulates parametrisation processes and\nprovides consistent process data. We compare established embedding methods on\nthe resulting knowledge graphs, detailing which out-of-the-box methods have the\npotential to represent procedural knowledge. This provides a baseline which can\nbe used to increase the comparability of future work. Furthermore, we validate\nthe overall characteristics of a synthesised dataset by comparing the results\nto those achievable on a real-world dataset. 
The framework and evaluation code,\nas well as the dataset used in the evaluation, are available open source.\n","authors":["Richard Nordsieck","André Schweizer","Michael Heider","Jörg Hähner"],"pdf_url":"https://arxiv.org/pdf/2308.08371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08366v1","updated":"2023-08-16T13:40:58Z","published":"2023-08-16T13:40:58Z","title":"Dual-Branch Temperature Scaling Calibration for Long-Tailed Recognition","summary":" The calibration for deep neural networks is currently receiving widespread\nattention and research. Miscalibration usually leads to overconfidence of the\nmodel. While, under the condition of long-tailed distribution of data, the\nproblem of miscalibration is more prominent due to the different confidence\nlevels of samples in minority and majority categories, and it will result in\nmore serious overconfidence. To address this problem, some current research\nhave designed diverse temperature coefficients for different categories based\non temperature scaling (TS) method. However, in the case of rare samples in\nminority classes, the temperature coefficient is not generalizable, and there\nis a large difference between the temperature coefficients of the training set\nand the validation set. To solve this challenge, this paper proposes a\ndual-branch temperature scaling calibration model (Dual-TS), which considers\nthe diversities in temperature parameters of different categories and the\nnon-generalizability of temperature parameters for rare samples in minority\nclasses simultaneously. Moreover, we noticed that the traditional calibration\nevaluation metric, Excepted Calibration Error (ECE), gives a higher weight to\nlow-confidence samples in the minority classes, which leads to inaccurate\nevaluation of model calibration. Therefore, we also propose Equal Sample Bin\nExcepted Calibration Error (Esbin-ECE) as a new calibration evaluation metric.\nThrough experiments, we demonstrate that our model yields state-of-the-art in\nboth traditional ECE and Esbin-ECE metrics.\n","authors":["Jialin Guo","Zhenyu Wu","Zhiqiang Zhan","Yang Ji"],"pdf_url":"https://arxiv.org/pdf/2308.08366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08361v1","updated":"2023-08-16T13:35:09Z","published":"2023-08-16T13:35:09Z","title":"KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution","summary":" Dynamic convolution learns a linear mixture of $n$ static kernels weighted\nwith their sample-dependent attentions, demonstrating superior performance\ncompared to normal convolution. However, existing designs are\nparameter-inefficient: they increase the number of convolutional parameters by\n$n$ times. This and the optimization difficulty lead to no research progress in\ndynamic convolution that can allow us to use a significant large value of $n$\n(e.g., $n>100$ instead of typical setting $n<10$) to push forward the\nperformance boundary. In this paper, we propose $KernelWarehouse$, a more\ngeneral form of dynamic convolution, which can strike a favorable trade-off\nbetween parameter efficiency and representation power. Its key idea is to\nredefine the basic concepts of \"$kernels$\" and \"$assembling$ $kernels$\" in\ndynamic convolution from the perspective of reducing kernel dimension and\nincreasing kernel number significantly. 
In principle, KernelWarehouse enhances\nconvolutional parameter dependencies within the same layer and across\nsuccessive layers via tactful kernel partition and warehouse sharing, yielding\na high degree of freedom to fit a desired parameter budget. We validate our\nmethod on ImageNet and MS-COCO datasets with different ConvNet architectures,\nand show that it attains state-of-the-art results. For instance, the\nResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse\non ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. Thanks to its\nflexible design, KernelWarehouse can even reduce the model size of a ConvNet\nwhile improving the accuracy, e.g., our ResNet18 model with 36.45%|65.10%\nparameter reduction to the baseline shows 2.89%|2.29% absolute improvement to\ntop-1 accuracy.\n","authors":["Chao Li","Anbang Yao"],"pdf_url":"https://arxiv.org/pdf/2308.08361v1.pdf","comment":"This research work was completed and submitted in early May 2023.\n Code and pre-trained models are available at\n https://github.com/OSVAI/KernelWarehouse"},{"id":"http://arxiv.org/abs/2305.09781v2","updated":"2023-08-16T13:33:06Z","published":"2023-05-16T20:12:59Z","title":"SpecInfer: Accelerating Generative Large Language Model Serving with\n Speculative Inference and Token Tree Verification","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them quickly and cheaply. This paper\nintroduces SpecInfer, an LLM serving system that accelerates generative LLM\ninference with speculative inference and token tree verification. A key insight\nbehind Specinfer is to combine various collectively boost-tuned small language\nmodels to jointly predict the LLM's outputs; the predictions are organized as a\ntoken tree, whose nodes each represent a candidate token sequence. The\ncorrectness of all candidate token sequences represented by a token tree is\nverified against the LLM in parallel using a novel tree-based parallel decoding\nmechanism. SpecInfer uses an LLM as a token tree verifier instead of an\nincremental decoder, which significantly reduces the end-to-end latency and\ncomputational requirement for serving generative LLMs while provably preserving\nmodel quality. Our evaluation shows that SpecInfer outperforms existing LLM\nserving systems by 1.3-2.4x for distributed LLM inference and by 2.6-3.5x for\noffloading-based LLM inference, while preserving the same generative\nperformance. SpecInfer is publicly available at\nhttps://github.com/flexflow/FlexFlow/tree/inference.\n","authors":["Xupeng Miao","Gabriele Oliaro","Zhihao Zhang","Xinhao Cheng","Zeyu Wang","Rae Ying Yee Wong","Alan Zhu","Lijie Yang","Xiaoxiang Shi","Chunan Shi","Zhuoming Chen","Daiyaan Arfeen","Reyna Abhyankar","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2305.09781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08360v1","updated":"2023-08-16T13:32:43Z","published":"2023-08-16T13:32:43Z","title":"Independent Distribution Regularization for Private Graph Embedding","summary":" Learning graph embeddings is a crucial task in graph mining tasks. An\neffective graph embedding model can learn low-dimensional representations from\ngraph-structured data for data publishing benefiting various downstream\napplications such as node classification, link prediction, etc. 
However, recent\nstudies have revealed that graph embeddings are susceptible to attribute\ninference attacks, which allow attackers to infer private node attributes from\nthe learned graph embeddings. To address these concerns, privacy-preserving\ngraph embedding methods have emerged, aiming to simultaneously consider primary\nlearning and privacy protection through adversarial learning. However, most\nexisting methods assume that representation models have access to all sensitive\nattributes in advance during the training stage, which is not always the case\ndue to diverse privacy preferences. Furthermore, the commonly used adversarial\nlearning technique in privacy-preserving representation learning suffers from\nunstable training issues. In this paper, we propose a novel approach called\nPrivate Variational Graph AutoEncoders (PVGAE) with the aid of an independent\ndistribution penalty as a regularization term. Specifically, we split the\noriginal variational graph autoencoder (VGAE) to learn sensitive and\nnon-sensitive latent representations using two sets of encoders. Additionally,\nwe introduce a novel regularization to enforce the independence of the\nencoders. We prove the theoretical effectiveness of the regularization from the\nperspective of mutual information. Experimental results on three real-world\ndatasets demonstrate that PVGAE outperforms other baselines in private\nembedding learning regarding utility performance and privacy protection.\n","authors":["Qi Hu","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2308.08360v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.08358v1","updated":"2023-08-16T13:30:45Z","published":"2023-08-16T13:30:45Z","title":"Convergence of Two-Layer Regression with Nonlinear Units","summary":" Large language models (LLMs), such as ChatGPT and GPT4, have shown\noutstanding performance in many human life tasks. Attention computation plays an\nimportant role in training LLMs. The softmax unit and the ReLU unit are the key\nstructures in attention computation. Inspired by them, we put forward a softmax\nReLU regression problem. Generally speaking, our goal is to find an optimal\nsolution to the regression problem involving the ReLU unit. In this work, we\ncalculate a closed-form representation for the Hessian of the loss function.\nUnder certain assumptions, we prove the Lipschitz continuity and the PSDness of\nthe Hessian. Then, we introduce a greedy algorithm based on an approximate Newton\nmethod, which converges in the sense of the distance to the optimal solution. Lastly,\nwe relax the Lipschitz condition and prove the convergence in the sense of loss\nvalue.\n","authors":["Yichuan Deng","Zhao Song","Shenghao Xie"],"pdf_url":"https://arxiv.org/pdf/2308.08358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08354v1","updated":"2023-08-16T13:24:47Z","published":"2023-08-16T13:24:47Z","title":"Is Meta-Learning the Right Approach for the Cold-Start Problem in\n Recommender Systems?","summary":" Recommender systems have become fundamental building blocks of modern online\nproducts and services, and have a substantial impact on user experience. In the\npast few years, deep learning methods have attracted a lot of research, and are\nnow heavily used in modern real-world recommender systems. Nevertheless,\ndealing with recommendations in the cold-start setting, e.g., when a user has\nhad limited interactions with the system, is a problem that remains far from\nsolved. 
Meta-learning techniques, and in particular optimization-based\nmeta-learning, have recently become the most popular approaches in the academic\nresearch literature for tackling the cold-start problem in deep learning models\nfor recommender systems. However, current meta-learning approaches are not\npractical for real-world recommender systems, which have billions of users and\nitems, and strict latency requirements. In this paper we show that it is\npossible to obtaining similar, or higher, performance on commonly used\nbenchmarks for the cold-start problem without using meta-learning techniques.\nIn more detail, we show that, when tuned correctly, standard and widely adopted\ndeep learning models perform just as well as newer meta-learning models. We\nfurther show that an extremely simple modular approach using common\nrepresentation learning techniques, can perform comparably to meta-learning\ntechniques specifically designed for the cold-start setting while being much\nmore easily deployable in real-world applications.\n","authors":["Davide Buffelli","Ashish Gupta","Agnieszka Strzalka","Vassilis Plachouras"],"pdf_url":"https://arxiv.org/pdf/2308.08354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03589v2","updated":"2023-08-16T13:17:03Z","published":"2023-06-06T11:15:53Z","title":"How does over-squashing affect the power of GNNs?","summary":" Graph Neural Networks (GNNs) are the state-of-the-art model for machine\nlearning on graph-structured data. The most popular class of GNNs operate by\nexchanging information between adjacent nodes, and are known as Message Passing\nNeural Networks (MPNNs). Given their widespread use, understanding the\nexpressive power of MPNNs is a key question. However, existing results\ntypically consider settings with uninformative node features. In this paper, we\nprovide a rigorous analysis to determine which function classes of node\nfeatures can be learned by an MPNN of a given capacity. We do so by measuring\nthe level of pairwise interactions between nodes that MPNNs allow for. This\nmeasure provides a novel quantitative characterization of the so-called\nover-squashing effect, which is observed to occur when a large volume of\nmessages is aggregated into fixed-size vectors. Using our measure, we prove\nthat, to guarantee sufficient communication between pairs of nodes, the\ncapacity of the MPNN must be large enough, depending on properties of the input\ngraph structure, such as commute times. For many relevant scenarios, our\nanalysis results in impossibility statements in practice, showing that\nover-squashing hinders the expressive power of MPNNs. We validate our\ntheoretical findings through extensive controlled experiments and ablation\nstudies.\n","authors":["Francesco Di Giovanni","T. Konstantin Rusch","Michael M. Bronstein","Andreea Deac","Marc Lackenby","Siddhartha Mishra","Petar Veličković"],"pdf_url":"https://arxiv.org/pdf/2306.03589v2.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2308.08344v1","updated":"2023-08-16T13:10:27Z","published":"2023-08-16T13:10:27Z","title":"Graph Out-of-Distribution Generalization with Controllable Data\n Augmentation","summary":" Graph Neural Network (GNN) has demonstrated extraordinary performance in\nclassifying graph properties. However, due to the selection bias of training\nand testing data (e.g., training on small graphs and testing on large graphs,\nor training on dense graphs and testing on sparse graphs), distribution\ndeviation is widespread. 
More importantly, we often observe \\emph{hybrid\nstructure distribution shift} of both scale and density, despite of one-sided\nbiased data partition. The spurious correlations over hybrid distribution\ndeviation degrade the performance of previous GNN methods and show large\ninstability among different datasets. To alleviate this problem, we propose\n\\texttt{OOD-GMixup} to jointly manipulate the training distribution with\n\\emph{controllable data augmentation} in metric space. Specifically, we first\nextract the graph rationales to eliminate the spurious correlations due to\nirrelevant information. Secondly, we generate virtual samples with perturbation\non graph rationale representation domain to obtain potential OOD training\nsamples. Finally, we propose OOD calibration to measure the distribution\ndeviation of virtual samples by leveraging Extreme Value Theory, and further\nactively control the training distribution by emphasizing the impact of virtual\nOOD samples. Extensive studies on several real-world datasets on graph\nclassification demonstrate the superiority of our proposed method over\nstate-of-the-art baselines.\n","authors":["Bin Lu","Xiaoying Gan","Ze Zhao","Shiyu Liang","Luoyi Fu","Xinbing Wang","Chenghu Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.08344v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.08334v1","updated":"2023-08-16T12:50:10Z","published":"2023-08-16T12:50:10Z","title":"Learning Logic Programs by Discovering Higher-Order Abstractions","summary":" Discovering novel abstractions is important for human-level AI. We introduce\nan approach to discover higher-order abstractions, such as map, filter, and\nfold. We focus on inductive logic programming, which induces logic programs\nfrom examples and background knowledge. We introduce the higher-order\nrefactoring problem, where the goal is to compress a logic program by\nintroducing higher-order abstractions. We implement our approach in STEVIE,\nwhich formulates the higher-order refactoring problem as a constraint\noptimisation problem. Our experimental results on multiple domains, including\nprogram synthesis and visual reasoning, show that, compared to no refactoring,\nSTEVIE can improve predictive accuracies by 27% and reduce learning times by\n47%. We also show that STEVIE can discover abstractions that transfer to\ndifferent domains\n","authors":["Céline Hocquette","Sebastijan Dumančić","Andrew Cropper"],"pdf_url":"https://arxiv.org/pdf/2308.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01631v2","updated":"2023-08-16T12:30:27Z","published":"2023-06-02T15:49:45Z","title":"Bi-level Contrastive Learning for Knowledge-Enhanced Molecule\n Representations","summary":" Molecule representation learning underpins diverse downstream applications\nsuch as molecular property and side effect understanding and prediction. In\nthis paper, we recognize the two-level structure of individual molecule as\nhaving intrinsic graph structure as well as being a node in a large molecule\nknowledge graph, and present GODE, a new approach that seamlessly integrates\ngraph representations of individual molecules with multi-domain biomedical data\nfrom knowledge graphs. By pre-training two graph neural networks (GNNs) on\ndifferent graph structures, combined with contrastive learning, GODE adeptly\nfuses molecular structures with their corresponding knowledge graph\nsubstructures. 
This fusion results in a more robust and informative\nrepresentation, enhancing molecular property prediction by harnessing both\nchemical and biological information. Finetuned on 11 chemical property tasks,\nour model surpasses benchmarks, achieving an average ROC-AUC improvement of\n14.5%, 9.8%, and 7.3% on BBBP, SIDER, and Tox21 datasets. In regression tasks\non ESOL and QM7 datasets, we achieve average improvements of 21.0% and 29.6%\nimprovements in RMSE and MAE, setting a new field benchmark.\n","authors":["Pengcheng Jiang","Cao Xiao","Tianfan Fu","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2306.01631v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08305v1","updated":"2023-08-16T12:08:50Z","published":"2023-08-16T12:08:50Z","title":"Warped geometric information on the optimisation of Euclidean functions","summary":" We consider the fundamental task of optimizing a real-valued function defined\nin a potentially high-dimensional Euclidean space, such as the loss function in\nmany machine-learning tasks or the logarithm of the probability distribution in\nstatistical inference. We use the warped Riemannian geometry notions to\nredefine the optimisation problem of a function on Euclidean space to a\nRiemannian manifold with a warped metric, and then find the function's optimum\nalong this manifold. The warped metric chosen for the search domain induces a\ncomputational friendly metric-tensor for which optimal search directions\nassociate with geodesic curves on the manifold becomes easier to compute.\nPerforming optimization along geodesics is known to be generally infeasible,\nyet we show that in this specific manifold we can analytically derive Taylor\napproximations up to third-order. In general these approximations to the\ngeodesic curve will not lie on the manifold, however we construct suitable\nretraction maps to pull them back onto the manifold. Therefore, we can\nefficiently optimize along the approximate geodesic curves. We cover the\nrelated theory, describe a practical optimization algorithm and empirically\nevaluate it on a collection of challenging optimisation benchmarks. Our\nproposed algorithm, using third-order approximation of geodesics, outperforms\nstandard Euclidean gradient-based counterparts in term of number of iterations\nuntil convergence and an alternative method for Hessian-based optimisation\nroutines.\n","authors":["Marcelo Hartmann","Bernardo Williams","Hanlin Yu","Mark Girolami","Alessandro Barp","Arto Klami"],"pdf_url":"https://arxiv.org/pdf/2308.08305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14036v2","updated":"2023-08-16T12:03:47Z","published":"2023-02-27T18:47:55Z","title":"Text-only domain adaptation for end-to-end ASR using integrated\n text-to-mel-spectrogram generator","summary":" We propose an end-to-end Automatic Speech Recognition (ASR) system that can\nbe trained on transcribed speech data, text-only data, or a mixture of both.\nThe proposed model uses an integrated auxiliary block for text-based training.\nThis block combines a non-autoregressive multi-speaker text-to-mel-spectrogram\ngenerator with a GAN-based enhancer to improve the spectrogram quality. The\nproposed system can generate a mel-spectrogram dynamically during training. It\ncan be used to adapt the ASR model to a new domain by using text-only data from\nthis domain. We demonstrate that the proposed training method significantly\nimproves ASR accuracy compared to the system trained on transcribed speech\nonly. 
It also surpasses cascade TTS systems with the vocoder in the adaptation\nquality and training speed.\n","authors":["Vladimir Bataev","Roman Korostik","Evgeny Shabalin","Vitaly Lavrukhin","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2302.14036v2.pdf","comment":"Accepted to INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2308.08291v1","updated":"2023-08-16T11:31:18Z","published":"2023-08-16T11:31:18Z","title":"Robust Bayesian Satisficing","summary":" Distributional shifts pose a significant challenge to achieving robustness in\ncontemporary machine learning. To overcome this challenge, robust satisficing\n(RS) seeks a robust solution to an unspecified distributional shift while\nachieving a utility above a desired threshold. This paper focuses on the\nproblem of RS in contextual Bayesian optimization when there is a discrepancy\nbetween the true and reference distributions of the context. We propose a novel\nrobust Bayesian satisficing algorithm called RoBOS for noisy black-box\noptimization. Our algorithm guarantees sublinear lenient regret under certain\nassumptions on the amount of distribution shift. In addition, we define a\nweaker notion of regret called robust satisficing regret, in which our\nalgorithm achieves a sublinear upper bound independent of the amount of\ndistribution shift. To demonstrate the effectiveness of our method, we apply it\nto various learning problems and compare it to other approaches, such as\ndistributionally robust optimization.\n","authors":["Artun Saday","Yaşar Cahit Yıldırım","Cem Tekin"],"pdf_url":"https://arxiv.org/pdf/2308.08291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.12791v2","updated":"2023-08-16T11:23:05Z","published":"2022-11-23T09:12:17Z","title":"An ensemble of VisNet, Transformer-M, and pretraining models for\n molecular property prediction in OGB Large-Scale Challenge @ NeurIPS 2022","summary":" In the technical report, we provide our solution for OGB-LSC 2022 Graph\nRegression Task. The target of this task is to predict the quantum chemical\nproperty, HOMO-LUMO gap for a given molecule on PCQM4Mv2 dataset. In the\ncompetition, we designed two kinds of models: Transformer-M-ViSNet which is an\ngeometry-enhanced graph neural network for fully connected molecular graphs and\nPretrained-3D-ViSNet which is a pretrained ViSNet by distilling geomeotric\ninformation from optimized structures. With an ensemble of 22 models, ViSNet\nTeam achieved the MAE of 0.0723 eV on the test-challenge set, dramatically\nreducing the error by 39.75% compared with the best method in the last year\ncompetition.\n","authors":["Yusong Wang","Shaoning Li","Zun Wang","Xinheng He","Bin Shao","Tie-Yan Liu","Tong Wang"],"pdf_url":"https://arxiv.org/pdf/2211.12791v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08290v1","updated":"2023-08-16T11:22:36Z","published":"2023-08-16T11:22:36Z","title":"DFedADMM: Dual Constraints Controlled Model Inconsistency for\n Decentralized Federated Learning","summary":" To address the communication burden issues associated with federated learning\n(FL), decentralized federated learning (DFL) discards the central server and\nestablishes a decentralized communication network, where each client\ncommunicates only with neighboring clients. However, existing DFL methods still\nsuffer from two major challenges: local inconsistency and local heterogeneous\noverfitting, which have not been fundamentally addressed by existing DFL\nmethods. 
To tackle these issues, we propose novel DFL algorithms, DFedADMM and\nits enhanced version DFedADMM-SAM, to enhance the performance of DFL. The\nDFedADMM algorithm employs primal-dual optimization (ADMM) by utilizing dual\nvariables to control the model inconsistency raised from the decentralized\nheterogeneous data distributions. The DFedADMM-SAM algorithm further improves\non DFedADMM by employing a Sharpness-Aware Minimization (SAM) optimizer, which\nuses gradient perturbations to generate locally flat models and searches for\nmodels with uniformly low loss values to mitigate local heterogeneous\noverfitting. Theoretically, we derive convergence rates of $\\small\n\\mathcal{O}\\Big(\\frac{1}{\\sqrt{KT}}+\\frac{1}{KT(1-\\psi)^2}\\Big)$ and $\\small\n\\mathcal{O}\\Big(\\frac{1}{\\sqrt{KT}}+\\frac{1}{KT(1-\\psi)^2}+\n\\frac{1}{T^{3/2}K^{1/2}}\\Big)$ in the non-convex setting for DFedADMM and\nDFedADMM-SAM, respectively, where $1 - \\psi$ represents the spectral gap of the\ngossip matrix. Empirically, extensive experiments on MNIST, CIFAR10 and\nCIFAR100 datesets demonstrate that our algorithms exhibit superior performance\nin terms of both generalization and convergence speed compared to existing\nstate-of-the-art (SOTA) optimizers in DFL.\n","authors":["Qinglun Li","Li Shen","Guanghao Li","Quanjun Yin","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.08290v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2305.13873v2","updated":"2023-08-16T11:16:15Z","published":"2023-05-23T09:48:16Z","title":"Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes\n From Text-To-Image Models","summary":" State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\\cdot$2\nare revolutionizing how people generate visual content. At the same time,\nsociety has serious concerns about how adversaries can exploit such models to\ngenerate unsafe images. In this work, we focus on demystifying the generation\nof unsafe images and hateful memes from Text-to-Image models. We first\nconstruct a typology of unsafe images consisting of five categories (sexually\nexplicit, violent, disturbing, hateful, and political). Then, we assess the\nproportion of unsafe images generated by four advanced Text-to-Image models\nusing four prompt datasets. We find that these models can generate a\nsubstantial percentage of unsafe images; across four models and four prompt\ndatasets, 14.56% of all generated images are unsafe. When comparing the four\nmodels, we find different risk levels, with Stable Diffusion being the most\nprone to generating unsafe content (18.92% of all generated images are unsafe).\nGiven Stable Diffusion's tendency to generate more unsafe content, we evaluate\nits potential to generate hateful meme variants if exploited by an adversary to\nattack a specific individual or community. We employ three image editing\nmethods, DreamBooth, Textual Inversion, and SDEdit, which are supported by\nStable Diffusion. Our evaluation result shows that 24% of the generated images\nusing DreamBooth are hateful meme variants that present the features of the\noriginal hateful meme and the target individual/community; these generated\nimages are comparable to hateful meme variants collected from the real world.\nOverall, our results demonstrate that the danger of large-scale generation of\nunsafe images is imminent. 
We discuss several mitigating measures, such as\ncurating training data, regulating prompts, and implementing safety filters,\nand encourage better safeguard tools to be developed to prevent unsafe\ngeneration.\n","authors":["Yiting Qu","Xinyue Shen","Xinlei He","Michael Backes","Savvas Zannettou","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.13873v2.pdf","comment":"To Appear in the ACM Conference on Computer and Communications\n Security, November 26, 2023"},{"id":"http://arxiv.org/abs/2302.06608v3","updated":"2023-08-16T11:12:42Z","published":"2023-02-13T18:59:52Z","title":"3D-aware Blending with Generative NeRFs","summary":" Image blending aims to combine multiple images seamlessly. It remains\nchallenging for existing 2D-based methods, especially when input images are\nmisaligned due to differences in 3D camera poses and object shapes. To tackle\nthese issues, we propose a 3D-aware blending method using generative Neural\nRadiance Fields (NeRF), including two key components: 3D-aware alignment and\n3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of\nthe reference image with respect to generative NeRFs and then perform 3D local\nalignment for each part. To further leverage 3D information of the generative\nNeRF, we propose 3D-aware blending that directly blends images on the NeRF's\nlatent representation space, rather than raw pixel space. Collectively, our\nmethod outperforms existing 2D baselines, as validated by extensive\nquantitative and qualitative evaluations with FFHQ and AFHQ-Cat.\n","authors":["Hyunsu Kim","Gayoung Lee","Yunjey Choi","Jin-Hwa Kim","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06608v3.pdf","comment":"ICCV 2023, Project page: https://blandocs.github.io/blendnerf"},{"id":"http://arxiv.org/abs/2209.04744v2","updated":"2023-08-16T11:03:47Z","published":"2022-09-10T20:40:30Z","title":"Active Learning for Optimal Intervention Design in Causal Models","summary":" Sequential experimental design to discover interventions that achieve a\ndesired outcome is a key problem in various domains including science,\nengineering and public policy. When the space of possible interventions is\nlarge, making an exhaustive search infeasible, experimental design strategies\nare needed. In this context, encoding the causal relationships between the\nvariables, and thus the effect of interventions on the system, is critical for\nidentifying desirable interventions more efficiently. Here, we develop a causal\nactive learning strategy to identify interventions that are optimal, as\nmeasured by the discrepancy between the post-interventional mean of the\ndistribution and a desired target mean. The approach employs a Bayesian update\nfor the causal model and prioritizes interventions using a carefully designed,\ncausally informed acquisition function. This acquisition function is evaluated\nin closed form, allowing for fast optimization. The resulting algorithms are\ntheoretically grounded with information-theoretic bounds and provable\nconsistency results for linear causal models with known causal graph. We apply\nour approach to both synthetic data and single-cell transcriptomic data from\nPerturb-CITE-seq experiments to identify optimal perturbations that induce a\nspecific cell state transition. The causally informed acquisition function\ngenerally outperforms existing criteria allowing for optimal intervention\ndesign with fewer but carefully selected samples.\n","authors":["Jiaqi Zhang","Louis Cammarata","Chandler Squires","Themistoklis P. 
Sapsis","Caroline Uhler"],"pdf_url":"https://arxiv.org/pdf/2209.04744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10405v5","updated":"2023-08-16T10:57:58Z","published":"2023-01-25T04:45:06Z","title":"Editing Language Model-based Knowledge Graph Embeddings","summary":" Recent decades have witnessed the empirical success of framing Knowledge\nGraph (KG) embeddings via language models. However, language model-based KG\nembeddings are usually deployed as static artifacts, making them difficult to\nmodify after deployment without re-training. To address this\nissue, we propose a new task of editing language model-based KG embeddings in\nthis paper. This task is designed to facilitate rapid, data-efficient updates\nto KG embeddings without compromising the performance of other aspects. We\nbuild four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and\nevaluate several knowledge editing baselines, demonstrating the limited ability\nof previous models to handle the proposed challenging task. We further propose\na simple yet strong baseline dubbed KGEditor, which utilizes additional\nparametric layers of the hyper network to edit/add facts. Our comprehensive\nexperimental results reveal that KGEditor excels in updating specific facts\nwithout impacting the overall performance, even when faced with limited\ntraining resources. Code and datasets are available at\nhttps://github.com/zjunlp/PromptKG/tree/main/deltaKG.\n","authors":["Siyuan Cheng","Ningyu Zhang","Bozhong Tian","Xi Chen","Qingbing Liu","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2301.10405v5.pdf","comment":"Work in progress and the project website is\n https://zjunlp.github.io/project/KGE_Editing/"},{"id":"http://arxiv.org/abs/2308.08283v1","updated":"2023-08-16T10:51:27Z","published":"2023-08-16T10:51:27Z","title":"CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark\n Model for Rectal Cancer Segmentation","summary":" Rectal cancer segmentation of CT images plays a crucial role in timely\nclinical diagnosis, radiotherapy treatment, and follow-up. Although current\nsegmentation methods have shown promise in delineating cancerous tissues, they\nstill encounter challenges in achieving high segmentation precision. These\nobstacles arise from the intricate anatomical structures of the rectum and the\ndifficulties in performing differential diagnosis of rectal cancer.\nAdditionally, a major obstacle is the lack of a large-scale, finely annotated\nCT image dataset for rectal cancer segmentation. To address these issues, this\nwork introduces a novel large scale rectal cancer CT image dataset CARE with\npixel-level annotations for both normal and cancerous rectum, which serves as a\nvaluable resource for algorithm research and clinical application development.\nMoreover, we propose a novel medical cancer lesion segmentation benchmark model\nnamed U-SAM. The model is specifically designed to tackle the challenges posed\nby the intricate anatomical structures of abdominal organs by incorporating\nprompt information. U-SAM contains three key components: promptable information\n(e.g., points) to aid in target area localization, a convolution module for\ncapturing low-level lesion details, and skip-connections to preserve and\nrecover spatial information during the encoding-decoding process. To evaluate\nthe effectiveness of U-SAM, we systematically compare its performance with\nseveral popular segmentation methods on the CARE dataset. 
The generalization of\nthe model is further verified on the WORD dataset. Extensive experiments\ndemonstrate that the proposed U-SAM outperforms state-of-the-art methods on\nthese two datasets. These experiments can serve as the baseline for future\nresearch and clinical application development.\n","authors":["Hantao Zhang","Weidong Guo","Chenyang Qiu","Shouhong Wan","Bingbing Zou","Wanqin Wang","Peiquan Jin"],"pdf_url":"https://arxiv.org/pdf/2308.08283v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.07118v2","updated":"2023-08-16T10:46:35Z","published":"2023-08-14T12:57:12Z","title":"Neural radiance fields in the industrial and robotics domain:\n applications, research opportunities and use cases","summary":" The proliferation of technologies, such as extended reality (XR), has\nincreased the demand for high-quality three-dimensional (3D) graphical\nrepresentations. Industrial 3D applications encompass computer-aided design\n(CAD), finite element analysis (FEA), scanning, and robotics. However, current\nmethods employed for industrial 3D representations suffer from high\nimplementation costs and reliance on manual human input for accurate 3D\nmodeling. To address these challenges, neural radiance fields (NeRFs) have\nemerged as a promising approach for learning 3D scene representations based on\nprovided training 2D images. Despite a growing interest in NeRFs, their\npotential applications in various industrial subdomains are still unexplored.\nIn this paper, we deliver a comprehensive examination of NeRF industrial\napplications while also providing direction for future research endeavors. We\nalso present a series of proof-of-concept experiments that demonstrate the\npotential of NeRFs in the industrial domain. These experiments include\nNeRF-based video compression techniques and using NeRFs for 3D motion\nestimation in the context of collision avoidance. In the video compression\nexperiment, our results show compression savings up to 48\\% and 74\\% for\nresolutions of 1920x1080 and 300x168, respectively. The motion estimation\nexperiment used a 3D animation of a robotic arm to train Dynamic-NeRF (D-NeRF)\nand achieved an average peak signal-to-noise ratio (PSNR) of disparity map with\nthe value of 23 dB and an structural similarity index measure (SSIM) 0.97.\n","authors":["Eugen Šlapak","Enric Pardo","Matúš Dopiriak","Taras Maksymyuk","Juraj Gazda"],"pdf_url":"https://arxiv.org/pdf/2308.07118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08268v1","updated":"2023-08-16T10:09:42Z","published":"2023-08-16T10:09:42Z","title":"It Ain't That Bad: Understanding the Mysterious Performance Drop in OOD\n Generalization for Generative Transformer Models","summary":" Generative Transformer-based models have achieved remarkable proficiency on\nsolving diverse problems. However, their generalization ability is not fully\nunderstood and not always satisfying. Researchers take basic mathematical tasks\nlike n-digit addition or multiplication as important perspectives for\ninvestigating their generalization behaviors. Curiously, it is observed that\nwhen training on n-digit operations (e.g., additions) in which both input\noperands are n-digit in length, models generalize successfully on unseen\nn-digit inputs (in-distribution (ID) generalization), but fail miserably and\nmysteriously on longer, unseen cases (out-of-distribution (OOD)\ngeneralization). 
Studies try to bridge this gap with workarounds such as\nmodifying position embeddings, fine-tuning, and priming with more extensive or\ninstructive data. However, without addressing the essential mechanism, there is\nhardly any guarantee regarding the robustness of these solutions. We bring this\nunexplained performance drop to attention and ask whether it stems purely from\nrandom errors. Here we turn to the mechanistic line of research, which has\nnotable successes in model interpretability. We discover that the strong ID\ngeneralization stems from structured representations, while behind the\nunsatisfying OOD performance, the models still exhibit clear learned algebraic\nstructures. Specifically, these models map unseen OOD inputs to outputs with\nequivalence relations in the ID domain. These findings highlight the potential of the\nmodels to carry useful information for improved generalization.\n","authors":["Xingcheng Xu","Zihao Pan","Haipeng Zhang","Yanqing Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.03973v7","updated":"2023-08-16T10:06:53Z","published":"2021-02-08T02:51:34Z","title":"STS-GAN: Can We Synthesize Solid Texture with High Fidelity from\n Arbitrary 2D Exemplar?","summary":" Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a\n3D solid volume, exhibits advantages in computational photography. However,\nexisting methods generally fail to accurately learn arbitrary textures, which\nmay result in the failure to synthesize solid textures with high fidelity. In\nthis paper, we propose a novel generative adversarial network-based framework\n(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In\nSTS-GAN, multi-scale 2D texture discriminators evaluate the similarity between\nthe given 2D exemplar and slices from the generated 3D texture, promoting the\n3D texture generator to synthesize realistic solid textures. Finally,\nexperiments demonstrate that the proposed method can generate high-fidelity\nsolid textures with similar visual characteristics to the 2D exemplar.\n","authors":["Xin Zhao","Jifeng Guo","Lin Wang","Fanqi Li","Jiahao Li","Junteng Zheng","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2102.03973v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08259v1","updated":"2023-08-16T09:53:20Z","published":"2023-08-16T09:53:20Z","title":"Graph Relation Aware Continual Learning","summary":" Continual graph learning (CGL) studies the problem of learning from an\ninfinite stream of graph data, consolidating historical knowledge, and\ngeneralizing it to future tasks. At any point in time, only the current graph data are\navailable. Although some recent attempts have been made to handle this task, we\nstill face two potential challenges: 1) most existing works only manipulate\nthe intermediate graph embedding and ignore intrinsic properties of graphs.\nIt is non-trivial to differentiate the transferred information across graphs.\n2) recent attempts take a parameter-sharing policy to transfer knowledge across\ntime steps or progressively expand new architectures given shifted graph\ndistributions. Learning a single model could lose discriminative information for\neach graph task, while the model expansion scheme suffers from high model\ncomplexity. In this paper, we point out that latent relations behind graph\nedges can be regarded as an invariant factor for the evolving graphs, while the\nstatistical information of latent relations evolves. 
Motivated by this, we\ndesign a relation-aware adaptive model, dubbed RAM-CG, that consists of a\nrelation-discovery module to explore latent relations behind edges and a\ntask-awareness masking classifier to account for the shifted graph distribution. Extensive\nexperiments show that RAM-CG provides significant 2.2%, 6.9% and 6.6% accuracy\nimprovements over the state-of-the-art results on the CitationNet, OGBN-arxiv and\nTWITCH datasets, respectively.\n","authors":["Qinghua Shen","Weijieying Ren","Wei Qin"],"pdf_url":"https://arxiv.org/pdf/2308.08259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08247v1","updated":"2023-08-16T09:28:55Z","published":"2023-08-16T09:28:55Z","title":"Two Phases of Scaling Laws for Nearest Neighbor Classifiers","summary":" A scaling law refers to the observation that the test performance of a model\nimproves as the amount of training data increases. A fast scaling law implies\nthat one can solve machine learning problems by simply boosting the data and\nthe model sizes. Yet, in many cases, the benefit of adding more data can be\nnegligible. In this work, we study the rate of scaling laws of nearest neighbor\nclassifiers. We show that a scaling law can have two phases: in the first\nphase, the generalization error depends polynomially on the data dimension and\ndecreases fast; whereas in the second phase, the error depends exponentially on\nthe data dimension and decreases slowly. Our analysis highlights the complexity\nof the data distribution in determining the generalization error. When the data\ndistribution is benign, our result suggests that the nearest neighbor classifier can\nachieve a generalization error that depends polynomially, instead of\nexponentially, on the data dimension.\n","authors":["Pengkun Yang","Jingzhao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16296v2","updated":"2023-08-16T09:28:17Z","published":"2023-06-28T15:17:59Z","title":"Relevant Entity Selection: Knowledge Graph Bootstrapping via Zero-Shot\n Analogical Pruning","summary":" Knowledge Graph Construction (KGC) can be seen as an iterative process\nstarting from a high-quality nucleus that is refined by knowledge extraction\napproaches in a virtuous loop. Such a nucleus can be obtained from knowledge\nexisting in an open KG like Wikidata. However, due to the size of such generic\nKGs, integrating them as a whole may entail irrelevant content and scalability\nissues. We propose an analogy-based approach that starts from seed entities of\ninterest in a generic KG, and keeps or prunes their neighboring entities. We\nevaluate our approach on Wikidata through two manually labeled datasets that\ncontain either domain-homogeneous or -heterogeneous seed entities. We\nempirically show that our analogy-based approach outperforms LSTM, Random\nForest, SVM, and MLP, with a drastically lower number of parameters. We also\nevaluate its generalization potential in a transfer learning setting. 
These\nresults advocate for the further integration of analogy-based inference in\ntasks related to the KG lifecycle.\n","authors":["Lucas Jarnac","Miguel Couceiro","Pierre Monnin"],"pdf_url":"https://arxiv.org/pdf/2306.16296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18477v3","updated":"2023-08-16T09:23:37Z","published":"2023-05-29T11:05:20Z","title":"Beyond the Meta: Leveraging Game Design Parameters for Patch-Agnostic\n Esport Analytics","summary":" Esport games comprise a sizeable fraction of the global games market, and are\nthe fastest-growing segment in games. This has given rise to the domain of\nesports analytics, which uses telemetry data from games to inform players,\ncoaches, broadcasters and other stakeholders. Compared to traditional sports,\nesport titles change rapidly, in terms of mechanics as well as rules. Due to\nthese frequent changes to the parameters of the game, esport analytics models\ncan have a short life-span, a problem which is largely ignored within the\nliterature. This paper extracts information from game design (i.e., patch notes)\nand utilises clustering techniques to propose a new form of character\nrepresentation. As a case study, a neural network model is trained to predict\nthe number of kills in a Dota 2 match utilising this novel character\nrepresentation technique. The performance of this model is then evaluated\nagainst two distinct baselines, including conventional techniques. Not only did\nthe model significantly outperform the baselines in terms of accuracy (85%\nAUC), but it also maintained its accuracy in two newer iterations of the\ngame that introduced one new character and a brand new character type. These\nchanges introduced to the design of the game would typically break conventional\ntechniques that are commonly used within the literature. Therefore, the\nproposed methodology for representing characters can increase the life-span of\nmachine learning models as well as contribute to higher performance when\ncompared to traditional techniques typically employed within the literature.\n","authors":["Alan Pedrassoli Chitayat","Florian Block","James Walker","Anders Drachen"],"pdf_url":"https://arxiv.org/pdf/2305.18477v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15121v2","updated":"2023-08-16T09:19:20Z","published":"2023-05-24T13:13:26Z","title":"Beyond Individual Input for Deep Anomaly Detection on Tabular Data","summary":" Anomaly detection is crucial in various domains, such as finance, healthcare,\nand cybersecurity. In this paper, we propose a novel deep anomaly detection\nmethod for tabular data that leverages Non-Parametric Transformers (NPTs), a\nmodel initially proposed for supervised tasks, to capture both feature-feature\nand sample-sample dependencies. In a reconstruction-based framework, we train\nthe NPT to reconstruct masked features of normal samples. In a non-parametric\nfashion, we leverage the whole training set during inference and use the\nmodel's ability to reconstruct the masked features to generate an\nanomaly score. To the best of our knowledge, our proposed method is the first\nto successfully combine feature-feature and sample-sample dependencies for\nanomaly detection on tabular datasets. 
We evaluate our method on an extensive\nbenchmark of 31 tabular datasets and demonstrate that our approach outperforms\nexisting state-of-the-art methods based on the F1-score and AUROC by a\nsignificant margin.\n","authors":["Hugo Thimonier","Fabrice Popineau","Arpad Rimmel","Bich-Liên Doan"],"pdf_url":"https://arxiv.org/pdf/2305.15121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08235v1","updated":"2023-08-16T09:12:21Z","published":"2023-08-16T09:12:21Z","title":"The Expressive Power of Graph Neural Networks: A Survey","summary":" Graph neural networks (GNNs) are effective machine learning models for many\ngraph-related applications. Despite their empirical success, many research\nefforts focus on the theoretical limitations of GNNs, i.e., the GNNs expressive\npower. Early works in this domain mainly focus on studying the graph\nisomorphism recognition ability of GNNs, and recent works try to leverage the\nproperties such as subgraph counting and connectivity learning to characterize\nthe expressive power of GNNs, which are more practical and closer to\nreal-world. However, no survey papers and open-source repositories\ncomprehensively summarize and discuss models in this important direction. To\nfill the gap, we conduct a first survey for models for enhancing expressive\npower under different forms of definition. Concretely, the models are reviewed\nbased on three categories, i.e., Graph feature enhancement, Graph topology\nenhancement, and GNNs architecture enhancement.\n","authors":["Bingxu Zhang","Changjun Fan","Shixuan Liu","Kuihua Huang","Xiang Zhao","Jincai Huang","Zhong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08234v1","updated":"2023-08-16T09:11:00Z","published":"2023-08-16T09:11:00Z","title":"Challenges and Opportunities of Using Transformer-Based Multi-Task\n Learning in NLP Through ML Lifecycle: A Survey","summary":" The increasing adoption of natural language processing (NLP) models across\nindustries has led to practitioners' need for machine learning systems to\nhandle these models efficiently, from training to serving them in production.\nHowever, training, deploying, and updating multiple models can be complex,\ncostly, and time-consuming, mainly when using transformer-based pre-trained\nlanguage models. Multi-Task Learning (MTL) has emerged as a promising approach\nto improve efficiency and performance through joint training, rather than\ntraining separate models. Motivated by this, we first provide an overview of\ntransformer-based MTL approaches in NLP. Then, we discuss the challenges and\nopportunities of using MTL approaches throughout typical ML lifecycle phases,\nspecifically focusing on the challenges related to data engineering, model\ndevelopment, deployment, and monitoring phases. This survey focuses on\ntransformer-based MTL architectures and, to the best of our knowledge, is novel\nin that it systematically analyses how transformer-based MTL in NLP fits into\nML lifecycle phases. Furthermore, we motivate research on the connection\nbetween MTL and continual learning (CL), as this area remains unexplored. 
We\nbelieve it would be practical to have a model that can handle both MTL and CL,\nas this would make it easier to periodically re-train the model, update it due\nto distribution shifts, and add new capabilities to meet real-world\nrequirements.\n","authors":["Lovre Torbarina","Tin Ferkovic","Lukasz Roguski","Velimir Mihelcic","Bruno Sarlija","Zeljko Kraljevic"],"pdf_url":"https://arxiv.org/pdf/2308.08234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08232v1","updated":"2023-08-16T09:06:54Z","published":"2023-08-16T09:06:54Z","title":"SCQPTH: an efficient differentiable splitting method for convex\n quadratic programming","summary":" We present SCQPTH: a differentiable first-order splitting method for convex\nquadratic programs. The SCQPTH framework is based on the alternating direction\nmethod of multipliers (ADMM) and the software implementation is motivated by\nthe state-of-the art solver OSQP: an operating splitting solver for convex\nquadratic programs (QPs). The SCQPTH software is made available as an\nopen-source python package and contains many similar features including\nefficient reuse of matrix factorizations, infeasibility detection, automatic\nscaling and parameter selection. The forward pass algorithm performs operator\nsplitting in the dimension of the original problem space and is therefore\nsuitable for large scale QPs with $100-1000$ decision variables and thousands\nof constraints. Backpropagation is performed by implicit differentiation of the\nADMM fixed-point mapping. Experiments demonstrate that for large scale QPs,\nSCQPTH can provide a $1\\times - 10\\times$ improvement in computational\nefficiency in comparison to existing differentiable QP solvers.\n","authors":["Andrew Butler"],"pdf_url":"https://arxiv.org/pdf/2308.08232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03496v2","updated":"2023-08-16T09:05:42Z","published":"2023-04-07T06:36:41Z","title":"Architecture-Preserving Provable Repair of Deep Neural Networks","summary":" Deep neural networks (DNNs) are becoming increasingly important components of\nsoftware, and are considered the state-of-the-art solution for a number of\nproblems, such as image recognition. However, DNNs are far from infallible, and\nincorrect behavior of DNNs can have disastrous real-world consequences. This\npaper addresses the problem of architecture-preserving V-polytope provable\nrepair of DNNs. A V-polytope defines a convex bounded polytope using its vertex\nrepresentation. V-polytope provable repair guarantees that the repaired DNN\nsatisfies the given specification on the infinite set of points in the given\nV-polytope. An architecture-preserving repair only modifies the parameters of\nthe DNN, without modifying its architecture. The repair has the flexibility to\nmodify multiple layers of the DNN, and runs in polynomial time. It supports\nDNNs with activation functions that have some linear pieces, as well as\nfully-connected, convolutional, pooling and residual layers. To the best our\nknowledge, this is the first provable repair approach that has all of these\nfeatures. We implement our approach in a tool called APRNN. Using MNIST,\nImageNet, and ACAS Xu DNNs, we show that it has better efficiency, scalability,\nand generalization compared to PRDNN and REASSURE, prior provable repair\nmethods that are not architecture preserving.\n","authors":["Zhe Tao","Stephanie Nawas","Jacqueline Mitchell","Aditya V. Thakur"],"pdf_url":"https://arxiv.org/pdf/2304.03496v2.pdf","comment":"Accepted paper at PLDI 2023. 
Tool is available at\n https://github.com/95616ARG/APRNN/"},{"id":"http://arxiv.org/abs/2308.08230v1","updated":"2023-08-16T09:03:13Z","published":"2023-08-16T09:03:13Z","title":"Exploring Winograd Convolution for Cost-effective Neural Network Fault\n Tolerance","summary":" Winograd is generally utilized to optimize convolution performance and\ncomputational efficiency because of the reduced multiplication operations, but\nthe reliability issues brought by winograd are usually overlooked. In this\nwork, we observe the great potential of winograd convolution in improving\nneural network (NN) fault tolerance. Based on the observation, we evaluate\nwinograd convolution fault tolerance comprehensively from different\ngranularities ranging from models, layers, and operation types for the first\ntime. Then, we explore the use of inherent fault tolerance of winograd\nconvolution for cost-effective NN protection against soft errors. Specifically,\nwe mainly investigate how winograd convolution can be effectively incorporated\nwith classical fault-tolerant design approaches including triple modular\nredundancy (TMR), fault-aware retraining, and constrained activation functions.\nAccording to our experiments, winograd convolution can reduce the\nfault-tolerant design overhead by 55.77\\% on average without any accuracy loss\ncompared to standard convolution, and further reduce the computing overhead by\n17.24\\% when the inherent fault tolerance of winograd convolution is\nconsidered. When it is applied on fault-tolerant neural networks enhanced with\nfault-aware retraining and constrained activation functions, the resulting\nmodel accuracy generally shows significant improvement in presence of various\nfaults.\n","authors":["Xinghua Xue","Cheng Liu","Bo Liu","Haitong Huang","Ying Wang","Tao Luo","Lei Zhang","Huawei Li","Xiaowei Li"],"pdf_url":"https://arxiv.org/pdf/2308.08230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08227v1","updated":"2023-08-16T08:58:25Z","published":"2023-08-16T08:58:25Z","title":"Inherent Redundancy in Spiking Neural Networks","summary":" Spiking Neural Networks (SNNs) are well known as a promising energy-efficient\nalternative to conventional artificial neural networks. Subject to the\npreconceived impression that SNNs are sparse firing, the analysis and\noptimization of inherent redundancy in SNNs have been largely overlooked, thus\nthe potential advantages of spike-based neuromorphic computing in accuracy and\nenergy efficiency are interfered. In this work, we pose and focus on three key\nquestions regarding the inherent redundancy in SNNs. We argue that the\nredundancy is induced by the spatio-temporal invariance of SNNs, which enhances\nthe efficiency of parameter utilization but also invites lots of noise spikes.\nFurther, we analyze the effect of spatio-temporal invariance on the\nspatio-temporal dynamics and spike firing of SNNs. Then, motivated by these\nanalyses, we propose an Advance Spatial Attention (ASA) module to harness SNNs'\nredundancy, which can adaptively optimize their membrane potential distribution\nby a pair of individual spatial attention sub-modules. In this way, noise spike\nfeatures are accurately regulated. Experimental results demonstrate that the\nproposed method can significantly drop the spike firing with better performance\nthan state-of-the-art SNN baselines. 
Our code is available in\n\\url{https://github.com/BICLab/ASA-SNN}.\n","authors":["Man Yao","Jiakui Hu","Guangshe Zhao","Yaoyuan Wang","Ziyang Zhang","Bo Xu","Guoqi Li"],"pdf_url":"https://arxiv.org/pdf/2308.08227v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.08224v1","updated":"2023-08-16T08:52:49Z","published":"2023-08-16T08:52:49Z","title":"How To Overcome Confirmation Bias in Semi-Supervised Image\n Classification By Active Learning","summary":" Do we need active learning? The rise of strong deep semi-supervised methods\nraises doubt about the usability of active learning in limited labeled data\nsettings. This is caused by results showing that combining semi-supervised\nlearning (SSL) methods with a random selection for labeling can outperform\nexisting active learning (AL) techniques. However, these results are obtained\nfrom experiments on well-established benchmark datasets that can overestimate\nthe external validity. However, the literature lacks sufficient research on the\nperformance of active semi-supervised learning methods in realistic data\nscenarios, leaving a notable gap in our understanding. Therefore we present\nthree data challenges common in real-world applications: between-class\nimbalance, within-class imbalance, and between-class similarity. These\nchallenges can hurt SSL performance due to confirmation bias. We conduct\nexperiments with SSL and AL on simulated data challenges and find that random\nsampling does not mitigate confirmation bias and, in some cases, leads to worse\nperformance than supervised learning. In contrast, we demonstrate that AL can\novercome confirmation bias in SSL in these realistic settings. Our results\nprovide insights into the potential of combining active and semi-supervised\nlearning in the presence of common real-world challenges, which is a promising\ndirection for robust methods when learning with limited labeled data in\nreal-world applications.\n","authors":["Sandra Gilhuber","Rasmus Hvingelby","Mang Ling Ada Fok","Thomas Seidl"],"pdf_url":"https://arxiv.org/pdf/2308.08224v1.pdf","comment":"Accepted @ ECML PKDD 2023. This is the author's version of the work.\n The definitive Version of Record will be published in the Proceedings of ECML\n PKDD 2023"},{"id":"http://arxiv.org/abs/2305.01397v2","updated":"2023-08-16T08:50:47Z","published":"2023-05-02T13:16:04Z","title":"Are demographically invariant models and representations in medical\n imaging fair?","summary":" Medical imaging models have been shown to encode information about patient\ndemographics such as age, race, and sex in their latent representation, raising\nconcerns about their potential for discrimination. Here, we ask whether\nrequiring models not to encode demographic attributes is desirable. We point\nout that marginal and class-conditional representation invariance imply the\nstandard group fairness notions of demographic parity and equalized odds,\nrespectively, while additionally requiring risk distribution matching, thus\npotentially equalizing away important group differences. Enforcing the\ntraditional fairness notions directly instead does not entail these strong\nconstraints. Moreover, representationally invariant models may still take\ndemographic attributes into account for deriving predictions. The latter can be\nprevented using counterfactual notions of (individual) fairness or invariance.\nWe caution, however, that properly defining medical image counterfactuals with\nrespect to demographic attributes is highly challenging. 
Finally, we posit that\nencoding demographic attributes may even be advantageous if it enables learning\na task-specific encoding of demographic features that does not rely on social\nconstructs such as 'race' and 'gender.' We conclude that demographically\ninvariant representations are neither necessary nor sufficient for fairness in\nmedical imaging. Models may need to encode demographic attributes, lending\nfurther urgency to calls for comprehensive model fairness assessments in terms\nof predictive performance across diverse patient groups.\n","authors":["Eike Petersen","Enzo Ferrante","Melanie Ganz","Aasa Feragen"],"pdf_url":"https://arxiv.org/pdf/2305.01397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08222v1","updated":"2023-08-16T08:48:04Z","published":"2023-08-16T08:48:04Z","title":"HyperSNN: A new efficient and robust deep learning model for resource\n constrained control applications","summary":" In light of the increasing adoption of edge computing in areas such as\nintelligent furniture, robotics, and smart homes, this paper introduces\nHyperSNN, an innovative method for control tasks that uses spiking neural\nnetworks (SNNs) in combination with hyperdimensional computing. HyperSNN\nsubstitutes expensive 32-bit floating point multiplications with 8-bit integer\nadditions, resulting in reduced energy consumption while enhancing robustness\nand potentially improving accuracy. Our model was tested on AI Gym benchmarks,\nincluding Cartpole, Acrobot, MountainCar, and Lunar Lander. HyperSNN achieves\ncontrol accuracies that are on par with conventional machine learning methods\nbut with only 1.36% to 9.96% of the energy expenditure. Furthermore, our\nexperiments showed increased robustness when using HyperSNN. We believe that\nHyperSNN is especially suitable for interactive, mobile, and wearable devices,\npromoting energy-efficient and robust system design. Furthermore, it paves the\nway for the practical implementation of complex algorithms like model\npredictive control (MPC) in real-world industrial scenarios.\n","authors":["Zhanglu Yan","Shida Wang","Kaiwen Tang","Wong-Fai Wong"],"pdf_url":"https://arxiv.org/pdf/2308.08222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08203v1","updated":"2023-08-16T08:08:54Z","published":"2023-08-16T08:08:54Z","title":"Epicure: Distilling Sequence Model Predictions into Patterns","summary":" Most machine learning models predict a probability distribution over concrete\noutputs and struggle to accurately predict names over high entropy sequence\ndistributions. Here, we explore finding abstract, high-precision patterns\nintrinsic to these predictions in order to make abstract predictions that\nusefully capture rare sequences. In this short paper, we present Epicure, a\nmethod that distils the predictions of a sequence model, such as the output of\nbeam search, into simple patterns. Epicure maps a model's predictions into a\nlattice that represents increasingly more general patterns that subsume the\nconcrete model predictions.\n On the tasks of predicting a descriptive name of a function given the source\ncode of its body and detecting anomalous names given a function, we show that\nEpicure yields accurate naming patterns that match the ground truth more often\ncompared to just the highest probability model prediction. 
For a false alarm\nrate of 10%, Epicure predicts patterns that match 61% more ground-truth names\ncompared to the best model prediction, making Epicure well-suited for scenarios\nthat require high precision.\n","authors":["Miltiadis Allamanis","Earl T. Barr"],"pdf_url":"https://arxiv.org/pdf/2308.08203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07896v2","updated":"2023-08-16T08:00:58Z","published":"2023-08-15T17:37:44Z","title":"SciRE-Solver: Efficient Sampling of Diffusion Probabilistic Models by\n Score-integrand Solver with Recursive Derivative Estimation","summary":" Diffusion probabilistic models (DPMs) are a powerful class of generative\nmodels known for their ability to generate high-fidelity image samples. A major\nchallenge in the implementation of DPMs is the slow sampling process. In this\nwork, we bring a high-efficiency sampler for DPMs. Specifically, we propose a\nscore-based exact solution paradigm for the diffusion ODEs corresponding to the\nsampling process of DPMs, which introduces a new perspective on developing\nnumerical algorithms for solving diffusion ODEs. To achieve an efficient\nsampler, we propose a recursive derivative estimation (RDE) method to reduce\nthe estimation error. With our proposed solution paradigm and RDE method, we\npropose the score-integrand solver with the convergence order guarantee as\nefficient solver (SciRE-Solver) for solving diffusion ODEs. The SciRE-Solver\nattains state-of-the-art (SOTA) sampling performance with a limited number of\nscore function evaluations (NFE) on both discrete-time and continuous-time DPMs\nin comparison to existing training-free sampling algorithms. Such as, we\nachieve $3.48$ FID with $12$ NFE and $2.42$ FID with $20$ NFE for\ncontinuous-time DPMs on CIFAR10, respectively. Different from other samplers,\nSciRE-Solver has the promising potential to surpass the FIDs achieved in the\noriginal papers of some pre-trained models with a small NFEs. For example, we\nreach SOTA value of $2.40$ FID with $100$ NFE for continuous-time DPM and of\n$3.15$ FID with $84$ NFE for discrete-time DPM on CIFAR-10, as well as of\n$2.17$ ($2.02$) FID with $18$ ($50$) NFE for discrete-time DPM on CelebA\n64$\\times$64.\n","authors":["Shigui Li","Wei Chen","Delu Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.07896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08198v1","updated":"2023-08-16T07:58:02Z","published":"2023-08-16T07:58:02Z","title":"DeSCo: Towards Generalizable and Scalable Deep Subgraph Counting","summary":" Subgraph counting is the problem of counting the occurrences of a given query\ngraph in a large target graph. Large-scale subgraph counting is useful in\nvarious domains, such as motif counting for social network analysis and loop\ncounting for money laundering detection on transaction networks. Recently, to\naddress the exponential runtime complexity of scalable subgraph counting,\nneural methods are proposed. However, existing neural counting approaches fall\nshort in three aspects. Firstly, the counts of the same query can vary from\nzero to millions on different target graphs, posing a much larger challenge\nthan most graph regression tasks. Secondly, current scalable graph neural\nnetworks have limited expressive power and fail to efficiently distinguish\ngraphs in count prediction. 
Furthermore, existing neural approaches cannot\npredict the occurrence position of queries in the target graph.\n Here we design DeSCo, a scalable neural deep subgraph counting pipeline,\nwhich aims to accurately predict the query count and occurrence position on any\ntarget graph after one-time training. Firstly, DeSCo uses a novel canonical\npartition and divides the large target graph into small neighborhood graphs.\nThe technique greatly reduces the count variation while guaranteeing no missing\nor double-counting. Secondly, neighborhood counting uses an expressive\nsubgraph-based heterogeneous graph neural network to accurately perform\ncounting in each neighborhood. Finally, gossip propagation propagates\nneighborhood counts with learnable gates to harness the inductive biases of\nmotif counts. DeSCo is evaluated on eight real-world datasets from various\ndomains. It outperforms state-of-the-art neural methods with 137x improvement\nin the mean squared error of count prediction, while maintaining the polynomial\nruntime complexity.\n","authors":["Tianyu Fu","Chiyue Wei","Yu Wang","Rex Ying"],"pdf_url":"https://arxiv.org/pdf/2308.08198v1.pdf","comment":"8 pages main text, 10 pages appendix"},{"id":"http://arxiv.org/abs/2308.08187v1","updated":"2023-08-16T07:36:58Z","published":"2023-08-16T07:36:58Z","title":"Endogenous Macrodynamics in Algorithmic Recourse","summary":" Existing work on Counterfactual Explanations (CE) and Algorithmic Recourse\n(AR) has largely focused on single individuals in a static environment: given\nsome estimated model, the goal is to find valid counterfactuals for an\nindividual instance that fulfill various desiderata. The ability of such\ncounterfactuals to handle dynamics like data and model drift remains a largely\nunexplored research challenge. There has also been surprisingly little work on\nthe related question of how the actual implementation of recourse by one\nindividual may affect other individuals. Through this work, we aim to close\nthat gap. We first show that many of the existing methodologies can be\ncollectively described by a generalized framework. We then argue that the\nexisting framework does not account for a hidden external cost of recourse,\nthat only reveals itself when studying the endogenous dynamics of recourse at\nthe group level. Through simulation experiments involving various state-of\nthe-art counterfactual generators and several benchmark datasets, we generate\nlarge numbers of counterfactuals and study the resulting domain and model\nshifts. We find that the induced shifts are substantial enough to likely impede\nthe applicability of Algorithmic Recourse in some situations. Fortunately, we\nfind various strategies to mitigate these concerns. Our simulation framework\nfor studying recourse dynamics is fast and opensourced.\n","authors":["Patrick Altmeyer","Giovan Angela","Aleksander Buszydlik","Karol Dobiczek","Arie van Deursen","Cynthia C. S. Liem"],"pdf_url":"https://arxiv.org/pdf/2308.08187v1.pdf","comment":"12 pages, 11 figures. Originally published at the 2023 IEEE\n Conference on Secure and Trustworthy Machine Learning (SaTML). 
IEEE holds the\n copyright"},{"id":"http://arxiv.org/abs/2210.05740v2","updated":"2023-08-16T07:06:58Z","published":"2022-10-11T19:11:19Z","title":"Stochastic Constrained DRO with a Complexity Independent of Sample Size","summary":" Distributionally Robust Optimization (DRO), as a popular method to train\nrobust models against distribution shift between training and test sets, has\nreceived tremendous attention in recent years. In this paper, we propose and\nanalyze stochastic algorithms that apply to both non-convex and convex losses\nfor solving Kullback Leibler divergence constrained DRO problem. Compared with\nexisting methods solving this problem, our stochastic algorithms not only enjoy\ncompetitive if not better complexity independent of sample size but also just\nrequire a constant batch size at every iteration, which is more practical for\nbroad applications. We establish a nearly optimal complexity bound for finding\nan $\\epsilon$ stationary solution for non-convex losses and an optimal\ncomplexity for finding an $\\epsilon$ optimal solution for convex losses.\nEmpirical studies demonstrate the effectiveness of the proposed algorithms for\nsolving non-convex and convex constrained DRO problems.\n","authors":["Qi Qi","Jiameng Lyu","Kung sik Chan","Er Wei Bai","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2210.05740v2.pdf","comment":"37 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.08174v1","updated":"2023-08-16T07:05:47Z","published":"2023-08-16T07:05:47Z","title":"Accelerating Generic Graph Neural Networks via Architecture, Compiler,\n Partition Method Co-Design","summary":" Graph neural networks (GNNs) have shown significant accuracy improvements in\na variety of graph learning domains, sparking considerable research interest.\nTo translate these accuracy improvements into practical applications, it is\nessential to develop high-performance and efficient hardware acceleration for\nGNN models. However, designing GNN accelerators faces two fundamental\nchallenges: the high bandwidth requirement of GNN models and the diversity of\nGNN models. Previous works have addressed the first challenge by using more\nexpensive memory interfaces to achieve higher bandwidth. For the second\nchallenge, existing works either support specific GNN models or have generic\ndesigns with poor hardware utilization.\n In this work, we tackle both challenges simultaneously. First, we identify a\nnew type of partition-level operator fusion, which we utilize to internally\nreduce the high bandwidth requirement of GNNs. Next, we introduce\npartition-level multi-threading to schedule the concurrent processing of graph\npartitions, utilizing different hardware resources. To further reduce the extra\non-chip memory required by multi-threading, we propose fine-grained graph\npartitioning to generate denser graph partitions. Importantly, these three\nmethods make no assumptions about the targeted GNN models, addressing the\nchallenge of model variety. We implement these methods in a framework called\nSwitchBlade, consisting of a compiler, a graph partitioner, and a hardware\naccelerator. Our evaluation demonstrates that SwitchBlade achieves an average\nspeedup of $1.85\\times$ and energy savings of $19.03\\times$ compared to the\nNVIDIA V100 GPU. 
Additionally, SwitchBlade delivers performance comparable to\nstate-of-the-art specialized accelerators.\n","authors":["Shuwen Lu","Zhihui Zhang","Cong Guo","Jingwen Leng","Yangjie Zhou","Minyi Guo"],"pdf_url":"https://arxiv.org/pdf/2308.08174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08173v1","updated":"2023-08-16T07:05:41Z","published":"2023-08-16T07:05:41Z","title":"Expressivity of Graph Neural Networks Through the Lens of Adversarial\n Robustness","summary":" We perform the first adversarial robustness study of Graph Neural Networks\n(GNNs) that are provably more powerful than traditional Message Passing Neural\nNetworks (MPNNs). In particular, we use adversarial robustness as a tool to\nuncover a significant gap between their theoretically possible and empirically\nachieved expressive power. To do so, we focus on the ability of GNNs to count\nspecific subgraph patterns, which is an established measure of expressivity,\nand extend the concept of adversarial robustness to this task. Based on this,\nwe develop efficient adversarial attacks for subgraph counting and show that\nmore powerful GNNs fail to generalize even to small perturbations to the\ngraph's structure. Expanding on this, we show that such architectures also fail\nto count substructures on out-of-distribution graphs.\n","authors":["Francesco Campi","Lukas Gosch","Tom Wollschläger","Yan Scholten","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2308.08173v1.pdf","comment":"Published in ${2}^{nd}$ AdvML Frontiers workshop at ${40}^{th}$\n International Conference on Machine Learning"},{"id":"http://arxiv.org/abs/2308.08172v1","updated":"2023-08-16T07:02:02Z","published":"2023-08-16T07:02:02Z","title":"AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for\n Image Denoising, Semantic Segmentation, and Radiomics Evaluation","summary":" Methods: In this study, a benchmark \\emph{Abdominal Adipose Tissue CT Image\nDataset} (AATTCT-IDS) containing 300 subjects is prepared and published.\nAATTCT-IDS publishes 13,732 raw CT slices, and the researchers individually\nannotate the subcutaneous and visceral adipose tissue regions of 3,213 of those\nslices that have the same slice distance to validate denoising methods, train\nsemantic segmentation models, and study radiomics. For different tasks, this\npaper compares and analyzes the performance of various methods on AATTCT-IDS by\ncombining the visualization results and evaluation data. Thus, we verify the\nresearch potential of this dataset in the above three types of tasks.\n Results: In the comparative study of image denoising, algorithms using a\nsmoothing strategy suppress mixed noise at the expense of image details and\nobtain better evaluation data. Methods such as BM3D preserve the original image\nstructure better, although the evaluation data are slightly lower. The results\nshow significant differences among them. In the comparative study of semantic\nsegmentation of abdominal adipose tissue, the segmentation results of adipose\ntissue by each model show different structural characteristics. Among them,\nBiSeNet obtains segmentation results only slightly inferior to U-Net with the\nshortest training time and effectively separates small and isolated adipose\ntissue. In addition, the radiomics study based on AATTCT-IDS reveals three\nadipose distributions in the subject population.\n Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions in\nabdominal CT slices. 
This open-source dataset can attract researchers to\nexplore the multi-dimensional characteristics of abdominal adipose tissue and\nthus help physicians and patients in clinical practice. AATCT-IDS is freely\npublished for non-commercial purpose at:\n\\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.\n","authors":["Zhiyu Ma","Chen Li","Tianming Du","Le Zhang","Dechao Tang","Deguo Ma","Shanchuan Huang","Yan Liu","Yihao Sun","Zhihao Chen","Jin Yuan","Qianqing Nie","Marcin Grzegorzek","Hongzan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.08172v1.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.08167v1","updated":"2023-08-16T06:46:37Z","published":"2023-08-16T06:46:37Z","title":"A Quantum Approximation Scheme for k-Means","summary":" We give a quantum approximation scheme (i.e., $(1 +\n\\varepsilon)$-approximation for every $\\varepsilon > 0$) for the classical\n$k$-means clustering problem in the QRAM model with a running time that has\nonly polylogarithmic dependence on the number of data points. More\nspecifically, given a dataset $V$ with $N$ points in $\\mathbb{R}^d$ stored in\nQRAM data structure, our quantum algorithm runs in time $\\tilde{O} \\left(\n2^{\\tilde{O}(\\frac{k}{\\varepsilon})} \\eta^2 d\\right)$ and with high probability\noutputs a set $C$ of $k$ centers such that $cost(V, C) \\leq (1+\\varepsilon)\n\\cdot cost(V, C_{OPT})$. Here $C_{OPT}$ denotes the optimal $k$-centers,\n$cost(.)$ denotes the standard $k$-means cost function (i.e., the sum of the\nsquared distance of points to the closest center), and $\\eta$ is the aspect\nratio (i.e., the ratio of maximum distance to minimum distance). This is the\nfirst quantum algorithm with a polylogarithmic running time that gives a\nprovable approximation guarantee of $(1+\\varepsilon)$ for the $k$-means\nproblem. Also, unlike previous works on unsupervised learning, our quantum\nalgorithm does not require quantum linear algebra subroutines and has a running\ntime independent of parameters (e.g., condition number) that appear in such\nprocedures.\n","authors":["Ragesh Jaiswal"],"pdf_url":"https://arxiv.org/pdf/2308.08167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08163v1","updated":"2023-08-16T06:11:27Z","published":"2023-08-16T06:11:27Z","title":"Characteristics of networks generated by kernel growing neural gas","summary":" This research aims to develop kernel GNG, a kernelized version of the growing\nneural gas (GNG) algorithm, and to investigate the features of the networks\ngenerated by the kernel GNG. The GNG is an unsupervised artificial neural\nnetwork that can transform a dataset into an undirected graph, thereby\nextracting the features of the dataset as a graph. The GNG is widely used in\nvector quantization, clustering, and 3D graphics. Kernel methods are often used\nto map a dataset to feature space, with support vector machines being the most\nprominent application. This paper introduces the kernel GNG approach and\nexplores the characteristics of the networks generated by kernel GNG. 
Five\nkernels, including Gaussian, Laplacian, Cauchy, inverse multiquadric, and log\nkernels, are used in this study.\n","authors":["Kazuhisa Fujita"],"pdf_url":"https://arxiv.org/pdf/2308.08163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08162v1","updated":"2023-08-16T06:09:51Z","published":"2023-08-16T06:09:51Z","title":"Interpretability Benchmark for Evaluating Spatial Misalignment of\n Prototypical Parts Explanations","summary":" Prototypical parts-based networks are becoming increasingly popular due to\ntheir faithful self-explanations. However, their similarity maps are calculated\nin the penultimate network layer. Therefore, the receptive field of the\nprototype activation region often depends on parts of the image outside this\nregion, which can lead to misleading interpretations. We name this undesired\nbehavior a spatial explanation misalignment and introduce an interpretability\nbenchmark with a set of dedicated metrics for quantifying this phenomenon. In\naddition, we propose a method for misalignment compensation and apply it to\nexisting state-of-the-art models. We show the expressiveness of our benchmark\nand the effectiveness of the proposed compensation methodology through\nextensive empirical studies.\n","authors":["Mikołaj Sacha","Bartosz Jura","Dawid Rymarczyk","Łukasz Struski","Jacek Tabor","Bartosz Zieliński"],"pdf_url":"https://arxiv.org/pdf/2308.08162v1.pdf","comment":"Under review. Code will be release upon acceptance"},{"id":"http://arxiv.org/abs/2308.08160v1","updated":"2023-08-16T06:06:56Z","published":"2023-08-16T06:06:56Z","title":"Benchmarking Adversarial Robustness of Compressed Deep Learning Models","summary":" The increasing size of Deep Neural Networks (DNNs) poses a pressing need for\nmodel compression, particularly when employed on resource constrained devices.\nConcurrently, the susceptibility of DNNs to adversarial attacks presents\nanother significant hurdle. Despite substantial research on both model\ncompression and adversarial robustness, their joint examination remains\nunderexplored. Our study bridges this gap, seeking to understand the effect of\nadversarial inputs crafted for base models on their pruned versions. To examine\nthis relationship, we have developed a comprehensive benchmark across diverse\nadversarial attacks and popular DNN models. We uniquely focus on models not\npreviously exposed to adversarial training and apply pruning schemes optimized\nfor accuracy and performance. Our findings reveal that while the benefits of\npruning enhanced generalizability, compression, and faster inference times are\npreserved, adversarial robustness remains comparable to the base model. This\nsuggests that model compression while offering its unique advantages, does not\nundermine adversarial robustness.\n","authors":["Brijesh Vora","Kartik Patwari","Syed Mahbub Hafiz","Zubair Shafiq","Chen-Nee Chuah"],"pdf_url":"https://arxiv.org/pdf/2308.08160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08158v1","updated":"2023-08-16T06:01:12Z","published":"2023-08-16T06:01:12Z","title":"Deep Generative Imputation Model for Missing Not At Random Data","summary":" Data analysis usually suffers from the Missing Not At Random (MNAR) problem,\nwhere the cause of the value missing is not fully observed. Compared to the\nnaive Missing Completely At Random (MCAR) problem, it is more in line with the\nrealistic scenario whereas more complex and challenging. 
Existing statistical\nmethods model the MNAR mechanism by different decompositions of the joint\ndistribution of the complete data and the missing mask. But we empirically find\nthat directly incorporating these statistical methods into deep generative\nmodels is sub-optimal. Specifically, it would neglect the confidence of the\nreconstructed mask during the MNAR imputation process, which leads to\ninsufficient information extraction and less-guaranteed imputation quality. In\nthis paper, we revisit the MNAR problem from a novel perspective: the\ncomplete data and the missing mask are two modalities of incomplete data on an\nequal footing. Along this line, we put forward a generative-model-specific\njoint probability decomposition method, the conjunction model, to represent the\ndistributions of two modalities in parallel and extract sufficient information\nfrom both the complete data and the missing mask. Taking a step further, we exploit a\ndeep generative imputation model, namely GNR, to process the real-world missing\nmechanism in the latent space and concurrently impute the incomplete data and\nreconstruct the missing mask. The experimental results show that our GNR\nsurpasses state-of-the-art MNAR baselines with significant margins (average\nRMSE improvements of 9.9% to 18.8%) and consistently gives better mask\nreconstruction accuracy, which makes the imputation more principled.\n","authors":["Jialei Chen","Yuanbo Xu","Pengyang Wang","Yongjian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08156v1","updated":"2023-08-16T05:58:12Z","published":"2023-08-16T05:58:12Z","title":"Sarcasm Detection in a Disaster Context","summary":" During natural disasters, people often use social media platforms such as\nTwitter to ask for help, to provide information about the disaster situation,\nor to express contempt about the unfolding event or public policies and\nguidelines. This contempt is in some cases expressed as sarcasm or irony.\nUnderstanding this form of speech in a disaster-centric context is essential to\nimproving natural language understanding of disaster-related tweets. In this\npaper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for\nintended sarcasm, and provide a comprehensive investigation of sarcasm\ndetection using pre-trained language models. Our best model is able to obtain\nas much as 0.70 F1 on our dataset. We also demonstrate that the performance on\nHurricaneSARC can be improved by leveraging intermediate task transfer\nlearning. We release our data and code at\nhttps://github.com/tsosea2/HurricaneSarc.\n","authors":["Tiberiu Sosea","Junyi Jessy Li","Cornelia Caragea"],"pdf_url":"https://arxiv.org/pdf/2308.08156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11434v3","updated":"2023-08-16T05:39:43Z","published":"2022-03-22T03:13:39Z","title":"Non-linear Embeddings in Hilbert Simplex Geometry","summary":" A key technique of machine learning and computer vision is to embed discrete\nweighted graphs into continuous spaces for further downstream processing.\nEmbedding discrete hierarchical structures in hyperbolic geometry has proven\nvery successful since it was shown that any weighted tree can be embedded in\nthat geometry with arbitrarily low distortion. Various optimization methods for\nhyperbolic embeddings based on common models of hyperbolic geometry have been\nstudied. 
In this paper, we consider Hilbert geometry for the standard simplex\nwhich is isometric to a vector space equipped with the variation polytope norm.\nWe study the representation power of this Hilbert simplex geometry by embedding\ndistance matrices of graphs. Our findings demonstrate that Hilbert simplex\ngeometry is competitive to alternative geometries such as the Poincar\\'e\nhyperbolic ball or the Euclidean geometry for embedding tasks while being fast\nand numerically robust.\n","authors":["Frank Nielsen","Ke Sun"],"pdf_url":"https://arxiv.org/pdf/2203.11434v3.pdf","comment":"21 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.07687v2","updated":"2023-08-16T05:24:46Z","published":"2023-08-15T10:37:04Z","title":"DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using\n Pre-trained Diffusion Models","summary":" Given a classifier, the inherent property of semantic Out-of-Distribution\n(OOD) samples is that their contents differ from all legal classes in terms of\nsemantics, namely semantic mismatch. There is a recent work that directly\napplies it to OOD detection, which employs a conditional Generative Adversarial\nNetwork (cGAN) to enlarge semantic mismatch in the image space. While achieving\nremarkable OOD detection performance on small datasets, it is not applicable to\nImageNet-scale datasets due to the difficulty in training cGANs with both input\nimages and labels as conditions. As diffusion models are much easier to train\nand amenable to various conditions compared to cGANs, in this work, we propose\nto directly use pre-trained diffusion models for semantic mismatch-guided OOD\ndetection, named DiffGuard. Specifically, given an OOD input image and the\npredicted label from the classifier, we try to enlarge the semantic difference\nbetween the reconstructed OOD image under these conditions and the original\ninput image. We also present several test-time techniques to further strengthen\nsuch differences. Experimental results show that DiffGuard is effective on both\nCifar-10 and hard cases of the large-scale ImageNet, and it can be easily\ncombined with existing OOD detection techniques to achieve state-of-the-art OOD\ndetection results.\n","authors":["Ruiyuan Gao","Chenchen Zhao","Lanqing Hong","Qiang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.07687v2.pdf","comment":"Accepted by ICCV2023, with supplementary materials"},{"id":"http://arxiv.org/abs/2210.14184v2","updated":"2023-08-16T05:08:00Z","published":"2022-10-25T17:22:31Z","title":"Learning Ability of Interpolating Deep Convolutional Neural Networks","summary":" It is frequently observed that overparameterized neural networks generalize\nwell. Regarding such phenomena, existing theoretical work mainly devotes to\nlinear settings or fully-connected neural networks. This paper studies the\nlearning ability of an important family of deep neural networks, deep\nconvolutional neural networks (DCNNs), under both underparameterized and\noverparameterized settings. We establish the first learning rates of\nunderparameterized DCNNs without parameter or function variable structure\nrestrictions presented in the literature. We also show that by adding\nwell-defined layers to a non-interpolating DCNN, we can obtain some\ninterpolating DCNNs that maintain the good learning rates of the\nnon-interpolating DCNN. This result is achieved by a novel network deepening\nscheme designed for DCNNs. 
Our work provides theoretical verification of how\noverfitted DCNNs generalize well.\n","authors":["Tian-Yi Zhou","Xiaoming Huo"],"pdf_url":"https://arxiv.org/pdf/2210.14184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08148v1","updated":"2023-08-16T05:01:33Z","published":"2023-08-16T05:01:33Z","title":"Hierarchical Topological Ordering with Conditional Independence Test for\n Limited Time Series","summary":" Learning directed acyclic graphs (DAGs) to identify causal relations\nunderlying observational data is crucial but also poses significant challenges.\nRecently, topology-based methods have emerged as a two-step approach to\ndiscovering DAGs by first learning the topological ordering of variables and\nthen eliminating redundant edges, while ensuring that the graph remains\nacyclic. However, one limitation is that these methods would generate numerous\nspurious edges that require subsequent pruning. To overcome this limitation, in\nthis paper, we propose an improvement to topology-based methods by introducing\nlimited time series data, consisting of only two cross-sectional records that\nneed not be adjacent in time and are subject to flexible timing. By\nincorporating conditional instrumental variables as exogenous interventions, we\naim to identify descendant nodes for each variable. Following this line, we\npropose a hierarchical topological ordering algorithm with conditional\nindependence test (HT-CIT), which enables the efficient learning of sparse DAGs\nwith a smaller search space compared to other popular approaches. The HT-CIT\nalgorithm greatly reduces the number of edges that need to be pruned. Empirical\nresults from synthetic and real-world datasets demonstrate the superiority of\nthe proposed HT-CIT algorithm.\n","authors":["Anpeng Wu","Haoxuan Li","Kun Kuang","Keli Zhang","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08503v3","updated":"2023-08-16T04:59:32Z","published":"2023-04-17T06:48:07Z","title":"A Scalable Test Problem Generator for Sequential Transfer Optimization","summary":" Sequential transfer optimization (STO), which aims to improve the\noptimization performance on a task at hand by exploiting the knowledge captured\nfrom several previously-solved optimization tasks stored in a database, has\nbeen gaining increasing research attention over the years. However, despite\nremarkable advances in algorithm design, the development of a systematic\nbenchmark suite for comprehensive comparisons of STO algorithms received far\nless attention. Existing test problems are either simply generated by\nassembling other benchmark functions or extended from specific practical\nproblems with limited variations. The relationships between the optimal\nsolutions of the source and target tasks in these problems are always manually\nconfigured, limiting their ability to model different relationships presented\nin real-world problems. Consequently, the good performance achieved by an\nalgorithm on these problems might be biased and could not be generalized to\nother problems. In light of the above, in this study, we first introduce four\nrudimentary concepts for characterizing STO problems (STOPs) and present an\nimportant problem feature, namely similarity distribution, which quantitatively\ndelineates the relationship between the optima of the source and target tasks.\nThen, we propose the general design guidelines and a problem generator with\nsuperior scalability. 
Specifically, the similarity distribution of an STOP can\nbe easily customized, enabling a continuous spectrum of representation of the\ndiverse similarity relationships of real-world problems. Lastly, a benchmark\nsuite with 12 STOPs featured by a variety of customized similarity\nrelationships is developed using the proposed generator, which would serve as\nan arena for STO algorithms and provide more comprehensive evaluation results.\nThe source code of the problem generator is available at\nhttps://github.com/XmingHsueh/STOP-G.\n","authors":["Xiaoming Xue","Cuie Yang","Liang Feng","Kai Zhang","Linqi Song","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2304.08503v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15891v2","updated":"2023-08-16T04:53:15Z","published":"2023-06-28T03:16:45Z","title":"Capturing the Diffusive Behavior of the Multiscale Linear Transport\n Equations by Asymptotic-Preserving Convolutional DeepONets","summary":" In this paper, we introduce two types of novel Asymptotic-Preserving\nConvolutional Deep Operator Networks (APCONs) designed to address the\nmultiscale time-dependent linear transport problem. We observe that the vanilla\nphysics-informed DeepONets with modified MLP may exhibit instability in\nmaintaining the desired limiting macroscopic behavior. Therefore, this\nnecessitates the utilization of an asymptotic-preserving loss function. Drawing\ninspiration from the heat kernel in the diffusion equation, we propose a new\narchitecture called Convolutional Deep Operator Networks, which employ multiple\nlocal convolution operations instead of a global heat kernel, along with\npooling and activation operations in each filter layer. Our APCON methods\npossess a parameter count that is independent of the grid size and are capable\nof capturing the diffusive behavior of the linear transport problem. Finally,\nwe validate the effectiveness of our methods through several numerical\nexamples.\n","authors":["Keke Wu","Xiong-bin Yan","Shi Jin","Zheng Ma"],"pdf_url":"https://arxiv.org/pdf/2306.15891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05890v2","updated":"2023-08-16T04:15:14Z","published":"2023-05-10T04:20:36Z","title":"CUTS+: High-dimensional Causal Discovery from Irregular Time-series","summary":" Causal discovery in time-series is a fundamental problem in the machine\nlearning community, enabling causal reasoning and decision-making in complex\nscenarios. Recently, researchers successfully discover causality by combining\nneural networks with Granger causality, but their performances degrade largely\nwhen encountering high-dimensional data because of the highly redundant network\ndesign and huge causal graphs. Moreover, the missing entries in the\nobservations further hamper the causal structural learning. To overcome these\nlimitations, We propose CUTS+, which is built on the Granger-causality-based\ncausal discovery method CUTS and raises the scalability by introducing a\ntechnique called Coarse-to-fine-discovery (C2FD) and leveraging a\nmessage-passing-based graph neural network (MPGNN). 
Compared to previous\nmethods on simulated, quasi-real, and real datasets, we show that CUTS+ largely\nimproves the causal discovery performance on high-dimensional data with\ndifferent types of irregular sampling.\n","authors":["Yuxiao Cheng","Lianglong Li","Tingxiong Xiao","Zongren Li","Qin Zhong","Jinli Suo","Kunlun He"],"pdf_url":"https://arxiv.org/pdf/2305.05890v2.pdf","comment":"Submit to AAAI-24"},{"id":"http://arxiv.org/abs/2308.08138v1","updated":"2023-08-16T04:05:22Z","published":"2023-08-16T04:05:22Z","title":"Online Control for Linear Dynamics: A Data-Driven Approach","summary":" This paper considers an online control problem over a linear time-invariant\nsystem with unknown dynamics, bounded disturbance, and adversarial cost. We\npropose a data-driven strategy to reduce the regret of the controller. Unlike\nmodel-based methods, our algorithm does not identify the system model, instead,\nit leverages a single noise-free trajectory to calculate the accumulation of\ndisturbance and makes decisions using the accumulated disturbance action\ncontroller we design, whose parameters are updated by online gradient descent.\nWe prove that the regret of our algorithm is $\\mathcal{O}(\\sqrt{T})$ under mild\nassumptions, suggesting that its performance is on par with model-based\nmethods.\n","authors":["Zishun Liu","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09043v2","updated":"2023-08-16T04:01:14Z","published":"2022-10-14T01:51:33Z","title":"ST-former for short-term passenger flow prediction during COVID-19 in\n urban rail transit system","summary":" Accurate passenger flow prediction of urban rail transit is essential for\nimproving the performance of intelligent transportation systems, especially\nduring the epidemic. How to dynamically model the complex spatiotemporal\ndependencies of passenger flow is the main issue in achieving accurate\npassenger flow prediction during the epidemic. To solve this issue, this paper\nproposes a brand-new transformer-based architecture called STformer under the\nencoder-decoder framework specifically for COVID-19. Concretely, we develop a\nmodified self-attention mechanism named Causal-Convolution ProbSparse\nSelf-Attention (CPSA) to model the multiple temporal dependencies of passenger\nflow with low computational costs. To capture the complex and dynamic spatial\ndependencies, we introduce a novel Adaptive Multi-Graph Convolution Network\n(AMGCN) by leveraging multiple graphs in a self-adaptive manner. Additionally,\nthe Multi-source Data Fusion block fuses the passenger flow data, COVID-19\nconfirmed case data, and the relevant social media data to study the impact of\nCOVID-19 to passenger flow. Experiments on real-world passenger flow datasets\ndemonstrate the superiority of ST-former over the other eleven state-of-the-art\nmethods. Several ablation studies are carried out to verify the effectiveness\nand reliability of our model structure. 
Results can provide critical insights\nfor the operation of URT systems.\n","authors":["Shuxin Zhang","Jinlei Zhang","Lixing Yang","Chengcheng Wang","Ziyou Gao"],"pdf_url":"https://arxiv.org/pdf/2210.09043v2.pdf","comment":"There are some errors that might mislead readers for this version.\n There is no new version right now"},{"id":"http://arxiv.org/abs/2203.00007v4","updated":"2023-08-16T04:00:48Z","published":"2022-02-27T01:06:24Z","title":"Spatial-Temporal Attention Fusion Network for short-term passenger flow\n prediction on holidays in urban rail transit systems","summary":" The short term passenger flow prediction of the urban rail transit system is\nof great significance for traffic operation and management. The emerging deep\nlearning-based models provide effective methods to improve prediction accuracy.\nHowever, most of the existing models mainly predict the passenger flow on\ngeneral weekdays or weekends. There are only few studies focusing on predicting\nthe passenger flow on holidays, which is a significantly challenging task for\ntraffic management because of its suddenness and irregularity. To this end, we\npropose a deep learning-based model named Spatial Temporal Attention Fusion\nNetwork comprising a novel Multi-Graph Attention Network, a Conv-Attention\nBlock, and Feature Fusion Block for short-term passenger flow prediction on\nholidays. The multi-graph attention network is applied to extract the complex\nspatial dependencies of passenger flow dynamically and the conv-attention block\nis applied to extract the temporal dependencies of passenger flow from global\nand local perspectives. Moreover, in addition to the historical passenger flow\ndata, the social media data, which has been proven that they can effectively\nreflect the evolution trend of passenger flow under events, are also fused into\nthe feature fusion block of STAFN. The STAFN is tested on two large-scale urban\nrail transit AFC datasets from China on the New Year holiday, and the\nprediction performance of the model are compared with that of several\nconventional prediction models. Results demonstrate its better robustness and\nadvantages among benchmark methods, which can provide overwhelming support for\npractical applications of short term passenger flow prediction on holidays.\n","authors":["Shuxin Zhang","Jinlei Zhang","Lixing Yang","Jiateng Yin","Ziyou Gao"],"pdf_url":"https://arxiv.org/pdf/2203.00007v4.pdf","comment":"There are some errors that might mislead readers for this version.\n There is no new version right now"},{"id":"http://arxiv.org/abs/2202.06727v3","updated":"2023-08-16T04:00:33Z","published":"2022-02-10T13:18:11Z","title":"STG-GAN: A spatiotemporal graph generative adversarial networks for\n short-term passenger flow prediction in urban rail transit systems","summary":" Short-term passenger flow prediction is an important but challenging task for\nbetter managing urban rail transit (URT) systems. Some emerging deep learning\nmodels provide good insights to improve short-term prediction accuracy.\nHowever, there exist many complex spatiotemporal dependencies in URT systems.\nMost previous methods only consider the absolute error between ground truth and\npredictions as the optimization objective, which fails to account for spatial\nand temporal constraints on the predictions. 
Furthermore, a large number of\nexisting prediction models introduce complex neural network layers to improve\naccuracy while ignoring their training efficiency and memory occupancy,\ndecreasing the chances to be applied to the real world. To overcome these\nlimitations, we propose a novel deep learning-based spatiotemporal graph\ngenerative adversarial network (STG-GAN) model with higher prediction accuracy,\nhigher efficiency, and lower memory occupancy to predict short-term passenger\nflows of the URT network. Our model consists of two major parts, which are\noptimized in an adversarial learning manner: (1) a generator network including\ngated temporal conventional networks (TCN) and weight sharing graph convolution\nnetworks (GCN) to capture structural spatiotemporal dependencies and generate\npredictions with a relatively small computational burden; (2) a discriminator\nnetwork including a spatial discriminator and a temporal discriminator to\nenhance the spatial and temporal constraints of the predictions. The STG-GAN is\nevaluated on two large-scale real-world datasets from Beijing Subway. A\ncomparison with those of several state-of-the-art models illustrates its\nsuperiority and robustness. This study can provide critical experience in\nconducting short-term passenger flow predictions, especially from the\nperspective of real-world applications.\n","authors":["Jinlei Zhang","Hua Li","Lixing Yang","Guangyin Jin","Jianguo Qi","Ziyou Gao"],"pdf_url":"https://arxiv.org/pdf/2202.06727v3.pdf","comment":"There are some errors that might mislead readers for this version.\n There is no new version right now"},{"id":"http://arxiv.org/abs/2308.08135v1","updated":"2023-08-16T03:56:58Z","published":"2023-08-16T03:56:58Z","title":"Microstructure-Empowered Stock Factor Extraction and Utilization","summary":" High-frequency quantitative investment is a crucial aspect of stock\ninvestment. Notably, order flow data plays a critical role as it provides the\nmost detailed level of information among high-frequency trading data, including\ncomprehensive data from the order book and transaction records at the tick\nlevel. The order flow data is extremely valuable for market analysis as it\nequips traders with essential insights for making informed decisions. However,\nextracting and effectively utilizing order flow data present challenges due to\nthe large volume of data involved and the limitations of traditional factor\nmining techniques, which are primarily designed for coarser-level stock data.\nTo address these challenges, we propose a novel framework that aims to\neffectively extract essential factors from order flow data for diverse\ndownstream tasks across different granularities and scenarios. Our method\nconsists of a Context Encoder and an Factor Extractor. The Context Encoder\nlearns an embedding for the current order flow data segment's context by\nconsidering both the expected and actual market state. In addition, the Factor\nExtractor uses unsupervised learning methods to select such important signals\nthat are most distinct from the majority within the given context. The\nextracted factors are then utilized for downstream tasks. In empirical studies,\nour proposed framework efficiently handles an entire year of stock order flow\ndata across diverse scenarios, offering a broader range of applications\ncompared to existing tick-level approaches that are limited to only a few days\nof stock data. 
We demonstrate that our method extracts superior factors from\norder flow data, enabling significant improvement for stock trend prediction\nand order execution tasks at the second and minute level.\n","authors":["Xianfeng Jiao","Zizhong Li","Chang Xu","Yang Liu","Weiqing Liu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2308.08135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08129v1","updated":"2023-08-16T03:38:43Z","published":"2023-08-16T03:38:43Z","title":"Is Self-Supervised Pretraining Good for Extrapolation in Molecular\n Property Prediction?","summary":" The prediction of material properties plays a crucial role in the development\nand discovery of materials in diverse applications, such as batteries,\nsemiconductors, catalysts, and pharmaceuticals. Recently, there has been a\ngrowing interest in employing data-driven approaches by using machine learning\ntechnologies, in combination with conventional theoretical calculations. In\nmaterial science, the prediction of unobserved values, commonly referred to as\nextrapolation, is particularly critical for property prediction as it enables\nresearchers to gain insight into materials beyond the limits of available data.\nHowever, even with the recent advancements in powerful machine learning models,\naccurate extrapolation is still widely recognized as a significantly\nchallenging problem. On the other hand, self-supervised pretraining is a\nmachine learning technique where a model is first trained on unlabeled data\nusing relatively simple pretext tasks before being trained on labeled data for\ntarget tasks. As self-supervised pretraining can effectively utilize material\ndata without observed property values, it has the potential to improve the\nmodel's extrapolation ability. In this paper, we clarify how such\nself-supervised pretraining can enhance extrapolation performance.We propose an\nexperimental framework for the demonstration and empirically reveal that while\nmodels were unable to accurately extrapolate absolute property values,\nself-supervised pretraining enables them to learn relative tendencies of\nunobserved property values and improve extrapolation performance.\n","authors":["Shun Takashige","Masatoshi Hanai","Toyotaro Suzumura","Limin Wang","Kenjiro Taura"],"pdf_url":"https://arxiv.org/pdf/2308.08129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08128v1","updated":"2023-08-16T03:35:52Z","published":"2023-08-16T03:35:52Z","title":"How to Mask in Error Correction Code Transformer: Systematic and Double\n Masking","summary":" In communication and storage systems, error correction codes (ECCs) are\npivotal in ensuring data reliability. As deep learning's applicability has\nbroadened across diverse domains, there is a growing research focus on neural\nnetwork-based decoders that outperform traditional decoding algorithms. Among\nthese neural decoders, Error Correction Code Transformer (ECCT) has achieved\nthe state-of-the-art performance, outperforming other methods by large margins.\nTo further enhance the performance of ECCT, we propose two novel methods.\nFirst, leveraging the systematic encoding technique of ECCs, we introduce a new\nmasking matrix for ECCT, aiming to improve the performance and reduce the\ncomputational complexity. Second, we propose a novel transformer architecture\nof ECCT called a double-masked ECCT. 
This architecture employs two different\nmask matrices in a parallel manner to learn more diverse features of the\nrelationship between codeword bits in the masked self-attention blocks.\nExtensive simulation results show that the proposed double-masked ECCT\noutperforms the conventional ECCT, achieving the state-of-the-art decoding\nperformance with significant margins.\n","authors":["Seong-Joon Park","Hee-Youl Kwak","Sang-Hyo Kim","Sunghwan Kim","Yongjune Kim","Jong-Seon No"],"pdf_url":"https://arxiv.org/pdf/2308.08128v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2306.10698v4","updated":"2023-08-16T02:11:16Z","published":"2023-06-19T04:48:36Z","title":"Deep Reinforcement Learning with Multitask Episodic Memory Based on\n Task-Conditioned Hypernetwork","summary":" Deep reinforcement learning algorithms are usually impeded by sampling\ninefficiency, heavily depending on multiple interactions with the environment\nto acquire accurate decision-making capabilities. In contrast, humans rely on\ntheir hippocampus to retrieve relevant information from past experiences of\nrelevant tasks, which guides their decision-making when learning a new task,\nrather than exclusively depending on environmental interactions. Nevertheless,\ndesigning a hippocampus-like module for an agent to incorporate past\nexperiences into established reinforcement learning algorithms presents two\nchallenges. The first challenge involves selecting the most relevant past\nexperiences for the current task, and the second challenge is integrating such\nexperiences into the decision network. To address these challenges, we propose\na novel method that utilizes a retrieval network based on task-conditioned\nhypernetwork, which adapts the retrieval network's parameters depending on the\ntask. At the same time, a dynamic modification mechanism enhances the\ncollaborative efforts between the retrieval and decision networks. We evaluate\nthe proposed method on the MiniGrid environment.The experimental results\ndemonstrate that our proposed method significantly outperforms strong\nbaselines.\n","authors":["Yonggang Jin","Chenxu Wang","Liuyu Xiang","Yaodong Yang","Junge Zhang","Jie Fu","Zhaofeng He"],"pdf_url":"https://arxiv.org/pdf/2306.10698v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08097v1","updated":"2023-08-16T02:08:46Z","published":"2023-08-16T02:08:46Z","title":"S-Mixup: Structural Mixup for Graph Neural Networks","summary":" Existing studies for applying the mixup technique on graphs mainly focus on\ngraph classification tasks, while the research in node classification is still\nunder-explored. In this paper, we propose a novel mixup augmentation for node\nclassification called Structural Mixup (S-Mixup). The core idea is to take into\naccount the structural information while mixing nodes. Specifically, S-Mixup\nobtains pseudo-labels for unlabeled nodes in a graph along with their\nprediction confidence via a Graph Neural Network (GNN) classifier. These serve\nas the criteria for the composition of the mixup pool for both inter and\nintra-class mixups. Furthermore, we utilize the edge gradient obtained from the\nGNN training and propose a gradient-based edge selection strategy for selecting\nedges to be attached to the nodes generated by the mixup. Through extensive\nexperiments on real-world benchmark datasets, we demonstrate the effectiveness\nof S-Mixup evaluated on the node classification task. 
We observe that S-Mixup\nenhances the robustness and generalization performance of GNNs, especially in\nheterophilous situations. The source code of S-Mixup can be found at\n\\url{https://github.com/SukwonYun/S-Mixup}\n","authors":["Junghurn Kim","Sukwon Yun","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2308.08097v1.pdf","comment":"CIKM 2023 (Short Paper)"},{"id":"http://arxiv.org/abs/2308.08086v1","updated":"2023-08-16T01:30:13Z","published":"2023-08-16T01:30:13Z","title":"Safety Filter Design for Neural Network Systems via Convex Optimization","summary":" With the increase in data availability, it has been widely demonstrated that\nneural networks (NN) can capture complex system dynamics precisely in a\ndata-driven manner. However, the architectural complexity and nonlinearity of\nthe NNs make it challenging to synthesize a provably safe controller. In this\nwork, we propose a novel safety filter that relies on convex optimization to\nensure safety for a NN system, subject to additive disturbances that are\ncapable of capturing modeling errors. Our approach leverages tools from NN\nverification to over-approximate NN dynamics with a set of linear bounds,\nfollowed by an application of robust linear MPC to search for controllers that\ncan guarantee robust constraint satisfaction. We demonstrate the efficacy of\nthe proposed framework numerically on a nonlinear pendulum system.\n","authors":["Shaoru Chen","Kong Yao Chee","Nikolai Matni","M. Ani Hsieh","George J. Pappas"],"pdf_url":"https://arxiv.org/pdf/2308.08086v1.pdf","comment":"This paper has been accepted to the 2023 62nd IEEE Conference on\n Decision and Control (CDC)"},{"id":"http://arxiv.org/abs/2308.07439v2","updated":"2023-08-16T01:29:39Z","published":"2023-08-14T20:20:26Z","title":"Interaction-Aware Personalized Vehicle Trajectory Prediction Using\n Temporal Graph Neural Networks","summary":" Accurate prediction of vehicle trajectories is vital for advanced driver\nassistance systems and autonomous vehicles. Existing methods mainly rely on\ngeneric trajectory predictions derived from large datasets, overlooking the\npersonalized driving patterns of individual drivers. To address this gap, we\npropose an approach for interaction-aware personalized vehicle trajectory\nprediction that incorporates temporal graph neural networks. Our method\nutilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to\nmodel the spatio-temporal interactions between target vehicles and their\nsurrounding traffic. To personalize the predictions, we establish a pipeline\nthat leverages transfer learning: the model is initially pre-trained on a\nlarge-scale trajectory dataset and then fine-tuned for each driver using their\nspecific driving data. We employ human-in-the-loop simulation to collect\npersonalized naturalistic driving trajectories and corresponding surrounding\nvehicle trajectories. Experimental results demonstrate the superior performance\nof our personalized GCN-LSTM model, particularly for longer prediction\nhorizons, compared to its generic counterpart. Moreover, the personalized model\noutperforms individual models created without pre-training, emphasizing the\nsignificance of pre-training on a large dataset to avoid overfitting. 
By\nincorporating personalization, our approach enhances trajectory prediction\naccuracy.\n","authors":["Amr Abdelraouf","Rohit Gupta","Kyungtae Han"],"pdf_url":"https://arxiv.org/pdf/2308.07439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.12288v2","updated":"2023-08-16T01:05:13Z","published":"2022-08-25T18:25:27Z","title":"Neuro-Dynamic State Estimation for Networked Microgrids","summary":" We devise neuro-dynamic state estimation (Neuro-DSE), a learning-based\ndynamic state estimation (DSE) algorithm for networked microgrids (NMs) under\nunknown subsystems. Our contributions include: 1) a data-driven Neuro-DSE\nalgorithm for NMs DSE with partially unidentified dynamic models, which\nincorporates the neural-ordinary-differential-equations (ODE-Net) into Kalman\nfilters; 2) a self-refining Neuro-DSE algorithm (Neuro-DSE+) which enables\ndata-driven DSE under limited and noisy measurements by establishing an\nautomatic filtering, augmenting and correcting framework; 3) a\nNeuro-KalmanNet-DSE algorithm which further integrates KalmanNet with Neuro-DSE\nto relieve the model mismatch of both neural- and physics-based dynamic models;\nand 4) an augmented Neuro-DSE for joint estimation of NMs states and unknown\nparameters (e.g., inertia). Extensive case studies demonstrate the efficacy of\nNeuro-DSE and its variants under different noise levels, control modes, power\nsources, observabilities and model knowledge, respectively.\n","authors":["Fei Feng","Yifan Zhou","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.12288v2.pdf","comment":"This paper needs to be withdrawn by the author. In Section II, Part\n C, there is lack of procedure to achieve parameter estimation using the\n proposed model. In Section V, Part E, experiment parameter setting is missed.\n Noise for estimating inertia case needs to be reset for simulation.\n Additional tests need to be added. These two parts need to be rewritten"},{"id":"http://arxiv.org/abs/2210.10592v2","updated":"2023-08-16T01:01:50Z","published":"2022-10-19T14:34:12Z","title":"DyTed: Disentangled Representation Learning for Discrete-time Dynamic\n Graph","summary":" Unsupervised representation learning for dynamic graphs has attracted a lot\nof research attention in recent years. Compared with static graph, the dynamic\ngraph is a comprehensive embodiment of both the intrinsic stable\ncharacteristics of nodes and the time-related dynamic preference. However,\nexisting methods generally mix these two types of information into a single\nrepresentation space, which may lead to poor explanation, less robustness, and\na limited ability when applied to different downstream tasks. To solve the\nabove problems, in this paper, we propose a novel disenTangled representation\nlearning framework for discrete-time Dynamic graphs, namely DyTed. We specially\ndesign a temporal-clips contrastive learning task together with a structure\ncontrastive learning to effectively identify the time-invariant and\ntime-varying representations respectively. To further enhance the\ndisentanglement of these two types of representation, we propose a\ndisentanglement-aware discriminator under an adversarial learning framework\nfrom the perspective of information theory. 
Extensive experiments on Tencent\nand five commonly used public datasets demonstrate that DyTed, as a general\nframework that can be applied to existing methods, achieves state-of-the-art\nperformance on various downstream tasks, as well as be more robust against\nnoise.\n","authors":["Kaike Zhang","Qi Cao","Gaolin Fang","Bingbing Xu","Hongjian Zou","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2210.10592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18437v2","updated":"2023-08-16T00:58:55Z","published":"2023-05-29T00:41:32Z","title":"Explainable Machine Learning for Categorical and Mixed Data with\n Lossless Visualization","summary":" Building accurate and interpretable Machine Learning (ML) models for\nheterogeneous/mixed data is a long-standing challenge for algorithms designed\nfor numeric data. This work focuses on developing numeric coding schemes for\nnon-numeric attributes for ML algorithms to support accurate and explainable ML\nmodels, methods for lossless visualization of n-D non-numeric categorical data\nwith visual rule discovery in these visualizations, and accurate and\nexplainable ML models for categorical data. This study proposes a\nclassification of mixed data types and analyzes their important role in Machine\nLearning. It presents a toolkit for enforcing interpretability of all internal\noperations of ML algorithms on mixed data with a visual data exploration on\nmixed data. A new Sequential Rule Generation (SRG) algorithm for explainable\nrule generation with categorical data is proposed and successfully evaluated in\nmultiple computational experiments. This work is one of the steps to the full\nscope ML algorithms for mixed data supported by lossless visualization of n-D\ndata in General Line Coordinates beyond Parallel Coordinates.\n","authors":["Boris Kovalerchuk","Elijah McCoy"],"pdf_url":"https://arxiv.org/pdf/2305.18437v2.pdf","comment":"46 pages, 32 figures, 29 tables. arXiv admin note: substantial text\n overlap with arXiv:2206.06476"},{"id":"http://arxiv.org/abs/2305.11095v3","updated":"2023-08-16T00:57:34Z","published":"2023-05-18T16:32:58Z","title":"Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot\n Task Generalization","summary":" We investigate the emergent abilities of the recently proposed web-scale\nspeech model Whisper, by adapting it to unseen tasks with prompt engineering.\nWe selected three tasks: audio-visual speech recognition (AVSR), code-switched\nspeech recognition (CS-ASR), and speech translation (ST) on unseen language\npairs. We design task-specific prompts, by either leveraging another\nlarge-scale model, or simply manipulating the special tokens in the default\nprompts. Experiments show that compared to the default prompts, our proposed\nprompts improve performance by 10% to 45% on the three zero-shot tasks, and\neven outperform SotA supervised models on some datasets. In addition, our\nexperiments reveal many interesting properties of Whisper, including its\nrobustness to prompts, bias on accents, and the multilingual understanding in\nits latent space. 
Code is available at\nhttps://github.com/jasonppy/PromptingWhisper\n","authors":["Puyuan Peng","Brian Yan","Shinji Watanabe","David Harwath"],"pdf_url":"https://arxiv.org/pdf/2305.11095v3.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.08079v1","updated":"2023-08-16T00:19:52Z","published":"2023-08-16T00:19:52Z","title":"Rigid Transformations for Stabilized Lower Dimensional Space to Support\n Subsurface Uncertainty Quantification and Interpretation","summary":" Subsurface datasets inherently possess big data characteristics such as vast\nvolume, diverse features, and high sampling speeds, further compounded by the\ncurse of dimensionality from various physical, engineering, and geological\ninputs. Among the existing dimensionality reduction (DR) methods, nonlinear\ndimensionality reduction (NDR) methods, especially Metric-multidimensional\nscaling (MDS), are preferred for subsurface datasets due to their inherent\ncomplexity. While MDS retains intrinsic data structure and quantifies\nuncertainty, its limitations include unstabilized unique solutions invariant to\nEuclidean transformations and an absence of out-of-sample points (OOSP)\nextension. To enhance subsurface inferential and machine learning workflows,\ndatasets must be transformed into stable, reduced-dimension representations\nthat accommodate OOSP.\n Our solution employs rigid transformations for a stabilized Euclidean\ninvariant representation for LDS. By computing an MDS input dissimilarity\nmatrix, and applying rigid transformations on multiple realizations, we ensure\ntransformation invariance and integrate OOSP. This process leverages a convex\nhull algorithm and incorporates loss function and normalized stress for\ndistortion quantification. We validate our approach with synthetic data,\nvarying distance metrics, and real-world wells from the Duvernay Formation.\nResults confirm our method's efficacy in achieving consistent LDS\nrepresentations. Furthermore, our proposed \"stress ratio\" (SR) metric provides\ninsight into uncertainty, beneficial for model adjustments and inferential\nanalysis. Consequently, our workflow promises enhanced repeatability and\ncomparability in NDR for subsurface energy resource engineering and associated\nbig data workflows.\n","authors":["Ademide O. Mabadeje","Michael J. Pyrcz"],"pdf_url":"https://arxiv.org/pdf/2308.08079v1.pdf","comment":"30 pages, 17 figures, Submitted to Computational Geosciences Journal"},{"id":"http://arxiv.org/abs/2303.13516v3","updated":"2023-08-16T00:00:47Z","published":"2023-03-23T17:59:42Z","title":"Ablating Concepts in Text-to-Image Diffusion Models","summary":" Large-scale text-to-image diffusion models can generate high-fidelity images\nwith powerful compositional ability. However, these models are typically\ntrained on an enormous amount of Internet data, often containing copyrighted\nmaterial, licensed images, and personal photos. Furthermore, they have been\nfound to replicate the style of various living artists or memorize exact\ntraining samples. How can we remove such copyrighted concepts or images without\nretraining the model from scratch? To achieve this goal, we propose an\nefficient method of ablating concepts in the pretrained model, i.e., preventing\nthe generation of a target concept. Our algorithm learns to match the image\ndistribution for a target style, instance, or text prompt we wish to ablate to\nthe distribution corresponding to an anchor concept. 
This prevents the model\nfrom generating target concepts given its text condition. Extensive experiments\nshow that our method can successfully prevent the generation of the ablated\nconcept while preserving closely related concepts in the model.\n","authors":["Nupur Kumari","Bingliang Zhang","Sheng-Yu Wang","Eli Shechtman","Richard Zhang","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2303.13516v3.pdf","comment":"ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.07578v2","updated":"2023-08-16T14:12:43Z","published":"2023-08-15T05:33:48Z","title":"Understanding User Behavior in Volumetric Video Watching: Dataset,\n Analysis and Prediction","summary":" Volumetric video emerges as a new attractive video paradigm in recent years\nsince it provides an immersive and interactive 3D viewing experience with six\ndegree-of-freedom (DoF). Unlike traditional 2D or panoramic videos, volumetric\nvideos require dense point clouds, voxels, meshes, or huge neural models to\ndepict volumetric scenes, which results in a prohibitively high bandwidth\nburden for video delivery. Users' behavior analysis, especially the viewport\nand gaze analysis, then plays a significant role in prioritizing the content\nstreaming within users' viewport and degrading the remaining content to\nmaximize user QoE with limited bandwidth. Although understanding user behavior\nis crucial, to the best of our best knowledge, there are no available 3D\nvolumetric video viewing datasets containing fine-grained user interactivity\nfeatures, not to mention further analysis and behavior prediction. In this\npaper, we for the first time release a volumetric video viewing behavior\ndataset, with a large scale, multiple dimensions, and diverse conditions. We\nconduct an in-depth analysis to understand user behaviors when viewing\nvolumetric videos. Interesting findings on user viewport, gaze, and motion\npreference related to different videos and users are revealed. We finally\ndesign a transformer-based viewport prediction model that fuses the features of\nboth gaze and motion, which is able to achieve high accuracy at various\nconditions. Our prediction model is expected to further benefit volumetric\nvideo streaming optimization. Our dataset, along with the corresponding\nvisualization tools is accessible at\nhttps://cuhksz-inml.github.io/user-behavior-in-vv-watching/\n","authors":["Kaiyuan Hu","Haowen Yang","Yili Jin","Junhua Liu","Yongting Chen","Miao Zhang","Fangxin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07578v2.pdf","comment":"Accepted by ACM MM'23"},{"id":"http://arxiv.org/abs/2308.02510v2","updated":"2023-08-16T09:59:40Z","published":"2023-07-27T12:54:16Z","title":"Seeing through the Brain: Image Reconstruction of Visual Perception from\n Human Brain Signals","summary":" Seeing is believing, however, the underlying mechanism of how human visual\nperceptions are intertwined with our cognitions is still a mystery. Thanks to\nthe recent advances in both neuroscience and artificial intelligence, we have\nbeen able to record the visually evoked brain activities and mimic the visual\nperception ability through computational approaches. In this paper, we pay\nattention to visual stimuli reconstruction by reconstructing the observed\nimages based on portably accessible brain signals, i.e., electroencephalography\n(EEG) data. 
Since EEG signals are dynamic in the time-series format and are\nnotorious to be noisy, processing and extracting useful information requires\nmore dedicated efforts; In this paper, we propose a comprehensive pipeline,\nnamed NeuroImagen, for reconstructing visual stimuli images from EEG signals.\nSpecifically, we incorporate a novel multi-level perceptual information\ndecoding to draw multi-grained outputs from the given EEG data. A latent\ndiffusion model will then leverage the extracted information to reconstruct the\nhigh-resolution visual stimuli images. The experimental results have\nillustrated the effectiveness of image reconstruction and superior quantitative\nperformance of our proposed method.\n","authors":["Yu-Ting Lan","Kan Ren","Yansen Wang","Wei-Long Zheng","Dongsheng Li","Bao-Liang Lu","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2308.02510v2.pdf","comment":"A preprint version of an ongoing work"},{"id":"http://arxiv.org/abs/2308.08143v1","updated":"2023-08-16T04:31:33Z","published":"2023-08-16T04:31:33Z","title":"SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech\n Separation","summary":" The integration of different modalities, such as audio and visual\ninformation, plays a crucial role in human perception of the surrounding\nenvironment. Recent research has made significant progress in designing fusion\nmodules for audio-visual speech separation. However, they predominantly focus\non multi-modal fusion architectures situated either at the top or bottom\npositions, rather than comprehensively considering multi-modal fusion at\nvarious hierarchical positions within the network. In this paper, we propose a\nnovel model called self- and cross-attention network (SCANet), which leverages\nthe attention mechanism for efficient audio-visual feature fusion. SCANet\nconsists of two types of attention blocks: self-attention (SA) and\ncross-attention (CA) blocks, where the CA blocks are distributed at the top\n(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the\nability to learn modality-specific features and enable the extraction of\ndifferent semantics from audio-visual features. Comprehensive experiments on\nthree standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2)\ndemonstrate the effectiveness of SCANet, outperforming existing\nstate-of-the-art (SOTA) methods while maintaining comparable inference time.\n","authors":["Kai Li","Runxuan Yang","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2308.08143v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.07056v2","updated":"2023-08-16T01:58:26Z","published":"2023-08-14T10:31:29Z","title":"VoxBlink: X-Large Speaker Verification Dataset on Camera","summary":" In this paper, we contribute a novel and extensive dataset for speaker\nverification, which contains noisy 38k identities/1.45M utterances (VoxBlink)\nand relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for\ntraining. Firstly, we collect a 60K+ users' list as well as their avatar and\ndownload their SHORT videos on the YouTube. Then, an automatically pipeline is\ndevised to extract target user's speech segments and videos, which is efficient\nand scalable. To the best of our knowledge, the VoxBlink dataset is the largest\nspeaker recognition dataset. Secondly, we develop a series of experiments based\non VoxBlink-clean together with VoxCeleb2. 
Our findings highlight a notable\nimprovement in performance, ranging from 15% to 30%, across different backbone\narchitectures, upon integrating our dataset for training. The dataset will be\nreleased SOON~.\n","authors":["Yuke Lin","Xiaoyi Qin","Ming Cheng","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v2.pdf","comment":"submit to ICASSP2023"},{"id":"http://arxiv.org/abs/2308.08088v1","updated":"2023-08-16T01:38:49Z","published":"2023-08-16T01:38:49Z","title":"Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme\n Detection","summary":" Hateful meme detection is a challenging multimodal task that requires\ncomprehension of both vision and language, as well as cross-modal interactions.\nRecent studies have tried to fine-tune pre-trained vision-language models\n(PVLMs) for this task. However, with increasing model sizes, it becomes\nimportant to leverage powerful PVLMs more efficiently, rather than simply\nfine-tuning them. Recently, researchers have attempted to convert meme images\ninto textual captions and prompt language models for predictions. This approach\nhas shown good performance but suffers from non-informative image captions.\nConsidering the two factors mentioned above, we propose a probing-based\ncaptioning approach to leverage PVLMs in a zero-shot visual question answering\n(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful\ncontent-related questions and use the answers as image captions (which we call\nPro-Cap), so that the captions contain information critical for hateful content\ndetection. The good performance of models with Pro-Cap on three benchmarks\nvalidates the effectiveness and generalization of the proposed method.\n","authors":["Rui Cao","Ming Shan Hee","Adriel Kuek","Wen-Haw Chong","Roy Ka-Wei Lee","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08088v1.pdf","comment":"Camera-ready for 23, ACM MM"},{"id":"http://arxiv.org/abs/2308.08696v1","updated":"2023-08-16T22:54:49Z","published":"2023-08-16T22:54:49Z","title":"Improving Anomaly Segmentation with Multi-Granularity Cross-Domain\n Alignment","summary":" Anomaly segmentation plays a crucial role in identifying anomalous objects\nwithin images, which facilitates the detection of road anomalies for autonomous\ndriving. Although existing methods have shown impressive results in anomaly\nsegmentation using synthetic training data, the domain discrepancies between\nsynthetic training data and real test data are often neglected. To address this\nissue, the Multi-Granularity Cross-Domain Alignment (MGCDA) framework is\nproposed for anomaly segmentation in complex driving environments. It uniquely\ncombines a new Multi-source Domain Adversarial Training (MDAT) module and a\nnovel Cross-domain Anomaly-aware Contrastive Learning (CACL) method to boost\nthe generality of the model, seamlessly integrating multi-domain data at both\nscene and sample levels. Multi-source domain adversarial loss and a dynamic\nlabel smoothing strategy are integrated into the MDAT module to facilitate the\nacquisition of domain-invariant features at the scene level, through\nadversarial training across multiple stages. 
CACL aligns sample-level\nrepresentations with contrastive loss on cross-domain data, which utilizes an\nanomaly-aware sampling strategy to efficiently sample hard samples and anchors.\nThe proposed framework has decent properties of parameter-free during the\ninference stage and is compatible with other anomaly segmentation networks.\nExperimental conducted on Fishyscapes and RoadAnomaly datasets demonstrate that\nthe proposed framework achieves state-of-the-art performance.\n","authors":["Ji Zhang","Xiao Wu","Zhi-Qi Cheng","Qi He","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2308.08696v1.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2306.06979v2","updated":"2023-08-16T22:27:45Z","published":"2023-06-12T09:21:34Z","title":"A Weakly Supervised Approach to Emotion-change Prediction and Improved\n Mood Inference","summary":" Whilst a majority of affective computing research focuses on inferring\nemotions, examining mood or understanding the \\textit{mood-emotion interplay}\nhas received significantly less attention. Building on prior work, we (a)\ndeduce and incorporate emotion-change ($\\Delta$) information for inferring\nmood, without resorting to annotated labels, and (b) attempt mood prediction\nfor long duration video clips, in alignment with the characterisation of mood.\nWe generate the emotion-change ($\\Delta$) labels via metric learning from a\npre-trained Siamese Network, and use these in addition to mood labels for mood\nclassification. Experiments evaluating \\textit{unimodal} (training only using\nmood labels) vs \\textit{multimodal} (training using mood plus $\\Delta$ labels)\nmodels show that mood prediction benefits from the incorporation of\nemotion-change information, emphasising the importance of modelling the\nmood-emotion interplay for effective mood inference.\n","authors":["Soujanya Narayana","Ibrahim Radwan","Ravikiran Parameshwara","Iman Abbasnejad","Akshay Asthana","Ramanathan Subramanian","Roland Goecke"],"pdf_url":"https://arxiv.org/pdf/2306.06979v2.pdf","comment":"9 pages, 3 figures, 6 tables, published in IEEE International\n Conference on Affective Computing and Intelligent Interaction"}]},"2023-08-18T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2306.07622v2","updated":"2023-08-18T17:33:15Z","published":"2023-06-13T08:43:13Z","title":"Human-Like Intuitive Behavior and Reasoning Biases Emerged in Language\n Models -- and Disappeared in GPT-4","summary":" Large language models (LLMs) are currently at the forefront of intertwining\nAI systems with human communication and everyday life. Therefore, it is of\ngreat importance to evaluate their emerging abilities. In this study, we show\nthat LLMs, most notably GPT-3, exhibit behavior that strikingly resembles\nhuman-like intuition -- and the cognitive errors that come with it. However,\nLLMs with higher cognitive capabilities, in particular ChatGPT and GPT-4,\nlearned to avoid succumbing to these errors and perform in a hyperrational\nmanner. For our experiments, we probe LLMs with the Cognitive Reflection Test\n(CRT) as well as semantic illusions that were originally designed to\ninvestigate intuitive decision-making in humans. Moreover, we probe how sturdy\nthe inclination for intuitive-like decision-making is. 
Our study demonstrates\nthat investigating LLMs with methods from psychology has the potential to\nreveal otherwise unknown emergent traits.\n","authors":["Thilo Hagendorff","Sarah Fabi"],"pdf_url":"https://arxiv.org/pdf/2306.07622v2.pdf","comment":"Overlap with arXiv:2212.05206"},{"id":"http://arxiv.org/abs/2308.09687v1","updated":"2023-08-18T17:29:23Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-ofThought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Michal Podstawski","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01175v2","updated":"2023-08-18T16:56:31Z","published":"2022-04-03T22:53:36Z","title":"A Part-of-Speech Tagger for Yiddish","summary":" We describe the construction and evaluation of a part-of-speech tagger for\nYiddish. This is the first step in a larger project of automatically assigning\npart-of-speech tags and syntactic structure to Yiddish text for purposes of\nlinguistic research. We combine two resources for the current work - an\n80K-word subset of the Penn Parsed Corpus of Historical Yiddish (PPCHY) and 650\nmillion words of OCR'd Yiddish text from the Yiddish Book Center (YBC). Yiddish\northography in the YBC corpus has many spelling inconsistencies, and we present\nsome evidence that even simple non-contextualized embeddings trained on YBC are\nable to capture the relationships among spelling variants without the need to\nfirst \"standardize\" the corpus. We also use YBC for continued pretraining of\ncontexualized embeddings, which are then integrated into a tagger model trained\nand evaluated on the PPCHY. We evaluate the tagger performance on a 10-fold\ncross-validation split, showing that the use of the YBC text for the\ncontextualized embeddings improves tagger performance. 
We conclude by\ndiscussing some next steps, including the need for additional annotated\ntraining and test data.\n","authors":["Seth Kulick","Neville Ryant","Beatrice Santorini","Joel Wallenberg","Assaf Urieli"],"pdf_url":"https://arxiv.org/pdf/2204.01175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09671v1","updated":"2023-08-18T16:46:11Z","published":"2023-08-18T16:46:11Z","title":"OCR Language Models with Custom Vocabularies","summary":" Language models are useful adjuncts to optical models for producing accurate\noptical character recognition (OCR) results. One factor which limits the power\nof language models in this context is the existence of many specialized domains\nwith language statistics very different from those implied by a general\nlanguage model - think of checks, medical prescriptions, and many other\nspecialized document classes. This paper introduces an algorithm for\nefficiently generating and attaching a domain specific word based language\nmodel at run time to a general language model in an OCR system. In order to\nbest use this model the paper also introduces a modified CTC beam search\ndecoder which effectively allows hypotheses to remain in contention based on\npossible future completion of vocabulary words. The result is a substantial\nreduction in word error rate in recognizing material from specialized domains.\n","authors":["Peter Garst","Reeve Ingle","Yasuhisa Fujii"],"pdf_url":"https://arxiv.org/pdf/2308.09671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09662v1","updated":"2023-08-18T16:27:04Z","published":"2023-08-18T16:27:04Z","title":"Red-Teaming Large Language Models using Chain of Utterances for\n Safety-Alignment","summary":" Larger language models (LLMs) have taken the world by storm with their\nmassive multi-tasking capabilities simply by optimizing over a next-word\nprediction objective. With the emergence of their properties and encoded\nknowledge, the risk of LLMs producing harmful outputs increases, making them\nunfit for scalable deployment for the public. In this work, we propose a new\nsafety evaluation benchmark RED-EVAL that carries out red-teaming. We show that\neven widely deployed models are susceptible to the Chain of Utterances-based\n(CoU) prompting, jailbreaking closed source LLM-based systems such as GPT-4 and\nChatGPT to unethically respond to more than 65% and 73% of harmful queries. We\nalso demonstrate the consistency of the RED-EVAL across 8 open-source LLMs in\ngenerating harmful responses in more than 86% of the red-teaming attempts.\nNext, we propose RED-INSTRUCT--An approach for the safety alignment of LLMs. It\nconstitutes two phases: 1) HARMFULQA data collection: Leveraging CoU prompting,\nwe collect a dataset that consists of 1.9K harmful questions covering a wide\nrange of topics, 9.5K safe and 7.3K harmful conversations from ChatGPT; 2)\nSAFE-ALIGN: We demonstrate how the conversational dataset can be used for the\nsafety alignment of LLMs by minimizing the negative log-likelihood over helpful\nresponses and penalizing over harmful responses by gradient accent over sample\nloss. 
Our model STARLING, a fine-tuned Vicuna-7B, is observed to be more safely\naligned when evaluated on RED-EVAL and HHH benchmarks while preserving the\nutility of the baseline models (TruthfulQA, MMLU, and BBH).\n","authors":["Rishabh Bhardwaj","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2308.09662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09658v1","updated":"2023-08-18T16:21:40Z","published":"2023-08-18T16:21:40Z","title":"Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop\n Visual Reasoning","summary":" There emerges a promising trend of using large language models (LLMs) to\ngenerate code-like plans for complex inference tasks such as visual reasoning.\nThis paradigm, known as LLM-based planning, provides flexibility in problem\nsolving and endows better interpretability. However, current research is mostly\nlimited to basic scenarios of simple questions that can be straightforward\nanswered in a few inference steps. Planning for the more challenging multi-hop\nvisual reasoning tasks remains under-explored. Specifically, under multi-hop\nreasoning situations, the trade-off between accuracy and the complexity of\nplan-searching becomes prominent. The prevailing algorithms either address the\nefficiency issue by employing the fast one-stop generation or adopt a complex\niterative generation method to improve accuracy. Both fail to balance the need\nfor efficiency and performance. Drawing inspiration from the dual system of\ncognition in the human brain, the fast and the slow think processes, we propose\na hierarchical plan-searching algorithm that integrates the one-stop reasoning\n(fast) and the Tree-of-thought (slow). Our approach succeeds in performance\nwhile significantly saving inference steps. Moreover, we repurpose the PTR and\nthe CLEVER datasets, developing a systematic framework for evaluating the\nperformance and efficiency of LLMs-based plan-search algorithms under reasoning\ntasks at different levels of difficulty. Extensive experiments demonstrate the\nsuperiority of our proposed algorithm in terms of performance and efficiency.\nThe dataset and code will be release soon.\n","authors":["Pengbo Hu","Ji Qi","Xingyu Li","Hong Li","Xinqi Wang","Bing Quan","Ruiyu Wang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09658v1.pdf","comment":"16 pages,1 figures, under review"},{"id":"http://arxiv.org/abs/2308.09597v1","updated":"2023-08-18T14:50:25Z","published":"2023-08-18T14:50:25Z","title":"ChatHaruhi: Reviving Anime Character in Reality via Large Language Model","summary":" Role-playing chatbots built on large language models have drawn interest, but\nbetter techniques are needed to enable mimicking specific fictional characters.\nWe propose an algorithm that controls language models via an improved prompt\nand memories of the character extracted from scripts. We construct ChatHaruhi,\na dataset covering 32 Chinese / English TV / anime characters with over 54k\nsimulated dialogues. Both automatic and human evaluations show our approach\nimproves role-playing ability over baselines. 
Code and data are available at\nhttps://github.com/LC1332/Chat-Haruhi-Suzumiya .\n","authors":["Cheng Li","Ziang Leng","Chenxi Yan","Junyi Shen","Hao Wang","Weishi MI","Yaying Fei","Xiaoyang Feng","Song Yan","HaoSheng Wang","Linkang Zhan","Yaokai Jia","Pingyu Wu","Haozhen Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09597v1.pdf","comment":"v1 - First version of techique report"},{"id":"http://arxiv.org/abs/2308.09583v1","updated":"2023-08-18T14:23:21Z","published":"2023-08-18T14:23:21Z","title":"WizardMath: Empowering Mathematical Reasoning for Large Language Models\n via Reinforced Evol-Instruct","summary":" Large language models (LLMs), such as GPT-4, have shown remarkable\nperformance in natural language processing (NLP) tasks, including challenging\nmathematical reasoning. However, most existing open-source models are only\npre-trained on large-scale internet data and without math-related optimization.\nIn this paper, we present WizardMath, which enhances the mathematical reasoning\nabilities of Llama-2, by applying our proposed Reinforcement Learning from\nEvol-Instruct Feedback (RLEIF) method to the domain of math. Through extensive\nexperiments on two mathematical reasoning benchmarks, namely GSM8k and MATH, we\nreveal the extraordinary capabilities of our model. WizardMath surpasses all\nother open-source LLMs by a substantial margin. Furthermore, our model even\noutperforms ChatGPT-3.5, Claude Instant-1, PaLM-2 and Minerva on GSM8k,\nsimultaneously surpasses Text-davinci-002, PaLM-1 and GPT-3 on MATH. More\ndetails and model weights are public at https://github.com/nlpxucan/WizardLM\nand https://huggingface.co/WizardLM.\n","authors":["Haipeng Luo","Qingfeng Sun","Can Xu","Pu Zhao","Jianguang Lou","Chongyang Tao","Xiubo Geng","Qingwei Lin","Shifeng Chen","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09583v1.pdf","comment":"LLM, Mathematical Reasoning"},{"id":"http://arxiv.org/abs/2308.09568v1","updated":"2023-08-18T14:01:37Z","published":"2023-08-18T14:01:37Z","title":"PUMGPT: A Large Vision-Language Model for Product Understanding","summary":" Recent developments of multi-modal large language models have demonstrated\nits strong ability in solving vision-language tasks. In this paper, we focus on\nthe product understanding task, which plays an essential role in enhancing\nonline shopping experience. Product understanding task includes a variety of\nsub-tasks, which require models to respond diverse queries based on multi-modal\nproduct information. Traditional methods design distinct model architectures\nfor each sub-task. On the contrary, we present PUMGPT, a large vision-language\nmodel aims at unifying all product understanding tasks under a singular model\nstructure. To bridge the gap between vision and text representations, we\npropose Layer-wise Adapters (LA), an approach that provides enhanced alignment\nwith fewer visual tokens and enables parameter-efficient fine-tuning. Moreover,\nthe inherent parameter-efficient fine-tuning ability allows PUMGPT to be\nreadily adapted to new product understanding tasks and emerging products. We\ndesign instruction templates to generate diverse product instruction datasets.\nSimultaneously, we utilize open-domain datasets during training to improve the\nperformance of PUMGPT and its generalization ability. 
Through extensive\nevaluations, PUMGPT demonstrates its superior performance across multiple\nproduct understanding tasks, including product captioning, category\nquestion-answering, attribute extraction, attribute question-answering, and\neven free-form question-answering about products.\n","authors":["Shuhui Wu","Zengming Tang","Zongyi Guo","Weiwei Zhang","Baoliang Cui","Haihong Tang","Weiming Lu"],"pdf_url":"https://arxiv.org/pdf/2308.09568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06435v2","updated":"2023-08-18T13:53:06Z","published":"2023-07-12T20:01:52Z","title":"A Comprehensive Overview of Large Language Models","summary":" Large Language Models (LLMs) have recently demonstrated remarkable\ncapabilities in natural language processing tasks and beyond. This success of\nLLMs has led to a large influx of research contributions in this direction.\nThese works encompass diverse topics such as architectural innovations of the\nunderlying neural networks, context length improvements, model alignment,\ntraining datasets, benchmarking, efficiency and more. With the rapid\ndevelopment of techniques and regular breakthroughs in LLM research, it has\nbecome considerably challenging to perceive the bigger picture of the advances\nin this direction. Considering the rapidly emerging plethora of literature on\nLLMs, it is imperative that the research community is able to benefit from a\nconcise yet comprehensive overview of the recent developments in this field.\nThis article provides that overview to the research community. It not only\nfocuses on a systematic treatment of the existing literature on a broad range\nof LLM related concept, but also pays special attention to providing\ncomprehensive summaries with extensive details about the individual existing\nmodels, datasets and major insights. We also pay heed to aligning our overview\nwith the emerging outlook of this research direction by accounting for the\nother recently materializing reviews of the broader research direction of LLMs.\nOur self-contained comprehensive overview of LLMs discusses relevant background\nconcepts along with covering the advanced topics at the frontier of this\nresearch direction. This review article is intended to not only provide a\nsystematic survey, but also a quick comprehensive reference for the researchers\nand practitioners to draw insights from extensive informative summaries of the\nexisting works to advance the LLM research direction.\n","authors":["Humza Naveed","Asad Ullah Khan","Shi Qiu","Muhammad Saqib","Saeed Anwar","Muhammad Usman","Naveed Akhtar","Nick Barnes","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2307.06435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03582v2","updated":"2023-08-18T12:31:52Z","published":"2023-08-07T13:38:54Z","title":"WIKITIDE: A Wikipedia-Based Timestamped Definition Pairs Dataset","summary":" A fundamental challenge in the current NLP context, dominated by language\nmodels, comes from the inflexibility of current architectures to 'learn' new\ninformation. While model-centric solutions like continual learning or\nparameter-efficient fine tuning are available, the question still remains of\nhow to reliably identify changes in language or in the world. In this paper, we\npropose WikiTiDe, a dataset derived from pairs of timestamped definitions\nextracted from Wikipedia. 
We argue that such a resource can be helpful for\naccelerating diachronic NLP, specifically, for training models able to scan\nknowledge resources for core updates concerning a concept, an event, or a named\nentity. Our proposed end-to-end method is fully automatic, and leverages a\nbootstrapping algorithm for gradually creating a high-quality dataset. Our\nresults suggest that bootstrapping the seed version of WikiTiDe leads to better\nfine-tuned models. We also leverage fine-tuned models in a number of downstream\ntasks, showing promising results with respect to competitive baselines.\n","authors":["Hsuvas Borkakoty","Luis Espinosa-Anke"],"pdf_url":"https://arxiv.org/pdf/2308.03582v2.pdf","comment":"Accepted by RANLP 2023 main conference"},{"id":"http://arxiv.org/abs/2308.09502v1","updated":"2023-08-18T12:26:10Z","published":"2023-08-18T12:26:10Z","title":"Semantic relatedness in DBpedia: A comparative and experimental\n assessment","summary":" Evaluating semantic relatedness of Web resources is still an open challenge.\nThis paper focuses on knowledge-based methods, which represent an alternative\nto corpus-based approaches, and rely in general on the availability of\nknowledge graphs. In particular, we have selected 10 methods from the existing\nliterature, which have been organized into adjacent resources, triple\npatterns, and triple weights-based methods. They have been implemented and\nevaluated by using DBpedia as the reference RDF knowledge graph. Since DBpedia is\ncontinuously evolving, the experimental results provided by these methods in\nthe literature are not comparable. For this reason, in this work, such methods\nhave been evaluated by running them all at once on the same DBpedia release\nand against 14 well-known golden datasets. On the basis of the correlation\nvalues with human judgment obtained according to the experimental results,\nweighting the RDF triples in combination with evaluating all the directed paths\nlinking the compared resources is the best strategy in order to compute\nsemantic relatedness in DBpedia.\n","authors":["Anna Formica","Francesco Taglino"],"pdf_url":"https://arxiv.org/pdf/2308.09502v1.pdf","comment":"37 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.09497v1","updated":"2023-08-18T12:14:25Z","published":"2023-08-18T12:14:25Z","title":"Predictive Authoring for Brazilian Portuguese Augmentative and\n Alternative Communication","summary":" Individuals with complex communication needs (CCN) often rely on augmentative\nand alternative communication (AAC) systems to have conversations and\ncommunicate their wants. Such systems allow message authoring by arranging\npictograms in sequence. However, the difficulty of finding the desired item to\ncomplete a sentence can increase as the user's vocabulary increases. This paper\nproposes using BERTimbau, a Brazilian Portuguese version of BERT, for pictogram\nprediction in AAC systems. To finetune BERTimbau, we constructed an AAC corpus\nfor Brazilian Portuguese to use as a training corpus. We tested different\napproaches to representing a pictogram for prediction: as a word (using\npictogram captions), as a concept (using a dictionary definition), and as a set\nof synonyms (using related terms). We also evaluated the usage of images for\npictogram prediction. The results demonstrate that using embeddings computed\nfrom the pictograms' captions, synonyms, or definitions yields similar\nperformance. Using synonyms leads to lower perplexity, but using captions leads\nto the highest accuracies. 
This paper provides insight into how to represent a\npictogram for prediction using a BERT-like model and the potential of using\nimages for pictogram prediction.\n","authors":["Jayr Pereira","Rodrigo Nogueira","Cleber Zanchettin","Robson Fidalgo"],"pdf_url":"https://arxiv.org/pdf/2308.09497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09455v1","updated":"2023-08-18T10:40:25Z","published":"2023-08-18T10:40:25Z","title":"Artificial-Spiking Hierarchical Networks for Vision-Language\n Representation Learning","summary":" With the success of self-supervised learning, multimodal foundation models\nhave rapidly adapted a wide range of downstream tasks driven by vision and\nlanguage (VL) pretraining. State-of-the-art methods achieve impressive\nperformance by pre-training on large-scale datasets. However, bridging the\nsemantic gap between the two modalities remains a nonnegligible challenge for\nVL tasks. In this work, we propose an efficient computation framework for\nmultimodal alignment by introducing a novel visual semantic module to further\nimprove the performance of the VL tasks. Specifically, we propose a flexible\nmodel, namely Artificial-Spiking Hierarchical Networks (ASH-Nets), which\ncombines the complementary advantages of Artificial neural networks (ANNs) and\nSpiking neural networks (SNNs) to enrich visual semantic representations. In\nparticular, a visual concrete encoder and a semantic abstract encoder are\nconstructed to learn continuous and discrete latent variables to enhance the\nflexibility of semantic encoding. Considering the spatio-temporal properties of\nSNNs modeling, we introduce a contrastive learning method to optimize the\ninputs of similar samples. This can improve the computational efficiency of the\nhierarchical network, while the augmentation of hard samples is beneficial to\nthe learning of visual representations. Furthermore, the Spiking to Text\nUni-Alignment Learning (STUA) pre-training method is proposed, which only\nrelies on text features to enhance the encoding ability of abstract semantics.\nWe validate the performance on multiple well-established downstream VL tasks.\nExperiments show that the proposed ASH-Nets achieve competitive results.\n","authors":["Yeming Chen","Siyu Zhang","Yaoru Sun","Weijian Liang","Haoran Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09454v1","updated":"2023-08-18T10:34:46Z","published":"2023-08-18T10:34:46Z","title":"Exploring Sampling Techniques for Generating Melodies with a Transformer\n Language Model","summary":" Research in natural language processing has demonstrated that the quality of\ngenerations from trained autoregressive language models is significantly\ninfluenced by the used sampling strategy. In this study, we investigate the\nimpact of different sampling techniques on musical qualities such as diversity\nand structure. To accomplish this, we train a high-capacity transformer model\non a vast collection of highly-structured Irish folk melodies and analyze the\nmusical qualities of the samples generated using distribution truncation\nsampling techniques. Specifically, we use nucleus sampling, the recently\nproposed \"typical sampling\", and conventional ancestral sampling. We evaluate\nthe effect of these sampling strategies in two scenarios: optimal circumstances\nwith a well-calibrated model and suboptimal circumstances where we\nsystematically degrade the model's performance. 
We assess the generated samples\nusing objective and subjective evaluations. We discover that probability\ntruncation techniques may restrict diversity and structural patterns in optimal\ncircumstances, but may also produce more musical samples in suboptimal\ncircumstances.\n","authors":["Mathias Rose Bjare","Stefan Lattner","Gerhard Widmer"],"pdf_url":"https://arxiv.org/pdf/2308.09454v1.pdf","comment":"7 pages, 5 figures, 1 table, accepted at the 24th Int. Society for\n Music Information Retrieval Conf., Milan, Italy, 2023"},{"id":"http://arxiv.org/abs/2308.09440v1","updated":"2023-08-18T10:12:03Z","published":"2023-08-18T10:12:03Z","title":"Scope is all you need: Transforming LLMs for HPC Code","summary":" With easier access to powerful compute resources, there is a growing trend in\nthe field of AI for software development to develop larger and larger language\nmodels (LLMs) to address a variety of programming tasks. Even LLMs applied to\ntasks from the high-performance computing (HPC) domain are huge in size (e.g.,\nbillions of parameters) and demand expensive compute resources for training. We\nfound this design choice confusing - why do we need large LLMs trained on\nnatural languages and programming languages unrelated to HPC for HPC-specific\ntasks? In this line of work, we aim to question design choices made by existing\nLLMs by developing smaller LLMs for specific domains - we call them\ndomain-specific LLMs. Specifically, we start off with HPC as a domain and\npropose a novel tokenizer named Tokompiler, designed specifically for\npreprocessing code in HPC and compilation-centric tasks. Tokompiler leverages\nknowledge of language primitives to generate language-oriented tokens,\nproviding a context-aware understanding of code structure while avoiding human\nsemantics attributed to code structures completely. We applied Tokompiler to\npre-train two state-of-the-art models, SPT-Code and Polycoder, for a Fortran\ncode corpus mined from GitHub. We evaluate the performance of these models\nagainst the conventional LLMs. Results demonstrate that Tokompiler\nsignificantly enhances code completion accuracy and semantic understanding\ncompared to traditional tokenizers in normalized-perplexity tests, down to ~1\nperplexity score. This research opens avenues for further advancements in\ndomain-specific LLMs, catering to the unique demands of HPC and compilation\ntasks.\n","authors":["Tal Kadosh","Niranjan Hasabnis","Vy A. Vo","Nadav Schneider","Neva Krien","Abdul Wasay","Nesreen Ahmed","Ted Willke","Guy Tamir","Yuval Pinter","Timothy Mattson","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2308.09440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09435v1","updated":"2023-08-18T10:07:28Z","published":"2023-08-18T10:07:28Z","title":"A Methodology for Generative Spelling Correction via Natural Spelling\n Errors Emulation across Multiple Domains and Languages","summary":" Modern large language models demonstrate impressive capabilities in text\ngeneration and generalization. However, they often struggle with solving text\nediting tasks, particularly when it comes to correcting spelling errors and\nmistypings. In this paper, we present a methodology for generative spelling\ncorrection (SC), which was tested on English and Russian languages and\npotentially can be extended to any language with minor changes. 
Our research\nmainly focuses on exploring natural spelling errors and mistypings in texts and\nstudying the ways those errors can be emulated in correct sentences to\neffectively enrich generative models' pre-training procedure. We investigate the\nimpact of such emulations and the models' abilities across different text\ndomains. In this work, we investigate two spelling corruption techniques: 1)\nthe first mimics human behavior when making a mistake by leveraging\nstatistics of errors from a particular dataset, and 2) the second adds the most common\nspelling errors, keyboard misclicks, and some heuristics within the texts. We\nconducted experiments employing various corruption strategies, model\narchitectures, and sizes at the pre-training and fine-tuning stages and\nevaluated the models using single-domain and multi-domain test sets. As a\npractical outcome of our work, we introduce SAGE (Spell checking via\nAugmentation and Generative distribution Emulation), a library for automatic\ngenerative SC that includes a family of pre-trained generative models and\nbuilt-in augmentation algorithms.\n","authors":["Nikita Martynov","Mark Baushenko","Anastasia Kozlova","Katerina Kolomeytseva","Aleksandr Abramov","Alena Fenogenova"],"pdf_url":"https://arxiv.org/pdf/2308.09435v1.pdf","comment":"to appear in EACL 2023"},{"id":"http://arxiv.org/abs/2305.05189v3","updated":"2023-08-18T09:13:46Z","published":"2023-05-09T05:48:38Z","title":"SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with\n Large Language Models","summary":" Diffusion models, which have emerged to become popular text-to-image\ngeneration models, can produce high-quality and content-rich images guided by\ntextual prompts. However, there are limitations to semantic understanding and\ncommonsense reasoning in existing models when the input prompts are concise\nnarratives, resulting in low-quality image generation. To improve the capacities\nfor narrative prompts, we propose a simple-yet-effective parameter-efficient\nfine-tuning approach called the Semantic Understanding and Reasoning adapter\n(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first\ncollect and annotate a new dataset SURD which consists of more than 57,000\nsemantically corrected multi-modal samples. Each sample contains a simple\nnarrative prompt, a complex keyword-based prompt, and a high-quality image.\nThen, we align the semantic representation of narrative prompts to the complex\nprompts and transfer knowledge of large language models (LLMs) to our\nSUR-adapter via knowledge distillation so that it can acquire the powerful\nsemantic understanding and reasoning capabilities to build a high-quality\ntextual semantic representation for text-to-image generation. We conduct\nexperiments by integrating multiple LLMs and popular pre-trained diffusion\nmodels to show the effectiveness of our approach in enabling diffusion models\nto understand and reason concise natural language without image quality\ndegradation. Our approach can make text-to-image diffusion models easier to use\nwith better user experience, which demonstrates that our approach has the potential\nfor further advancing the development of user-friendly text-to-image generation\nmodels by bridging the semantic gap between simple narrative prompts and\ncomplex keyword-based prompts. 
The code is released at\nhttps://github.com/Qrange-group/SUR-adapter.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Wushao Wen","Jinghui Qin","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.05189v3.pdf","comment":"accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.09376v1","updated":"2023-08-18T08:13:23Z","published":"2023-08-18T08:13:23Z","title":"Leveraging Large Language Models for DRL-Based Anti-Jamming Strategies\n in Zero Touch Networks","summary":" As the dawn of sixth-generation (6G) networking approaches, it promises\nunprecedented advancements in communication and automation. Among the leading\ninnovations of 6G is the concept of Zero Touch Networks (ZTNs), aiming to\nachieve fully automated, self-optimizing networks with minimal human\nintervention. Despite the advantages ZTNs offer in terms of efficiency and\nscalability, challenges surrounding transparency, adaptability, and human trust\nremain prevalent. Concurrently, the advent of Large Language Models (LLMs)\npresents an opportunity to elevate the ZTN framework by bridging the gap\nbetween automated processes and human-centric interfaces. This paper explores\nthe integration of LLMs into ZTNs, highlighting their potential to enhance\nnetwork transparency and improve user interactions. Through a comprehensive\ncase study on deep reinforcement learning (DRL)-based anti-jamming technique,\nwe demonstrate how LLMs can distill intricate network operations into\nintuitive, human-readable reports. Additionally, we address the technical and\nethical intricacies of melding LLMs with ZTNs, with an emphasis on data\nprivacy, transparency, and bias reduction. Looking ahead, we identify emerging\nresearch avenues at the nexus of LLMs and ZTNs, advocating for sustained\ninnovation and interdisciplinary synergy in the domain of automated networks.\n","authors":["Abubakar S. Ali","Dimitrios Michael Manias","Abdallah Shami","Sami Muhaidat"],"pdf_url":"https://arxiv.org/pdf/2308.09376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09370v1","updated":"2023-08-18T08:06:27Z","published":"2023-08-18T08:06:27Z","title":"TrOMR:Transformer-Based Polyphonic Optical Music Recognition","summary":" Optical Music Recognition (OMR) is an important technology in music and has\nbeen researched for a long time. Previous approaches for OMR are usually based\non CNN for image understanding and RNN for music symbol classification. In this\npaper, we propose a transformer-based approach with excellent global perceptual\ncapability for end-to-end polyphonic OMR, called TrOMR. We also introduce a\nnovel consistency loss function and a reasonable approach for data annotation\nto improve recognition accuracy for complex music scores. Extensive experiments\ndemonstrate that TrOMR outperforms current OMR methods, especially in\nreal-world scenarios. We also develop a TrOMR system and build a camera scene\ndataset for full-page music scores in real-world. The code and datasets will be\nmade available for reproducibility.\n","authors":["Yixuan Li","Huaping Liu","Qiang Jin","Miaomiao Cai","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2308.09370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09368v1","updated":"2023-08-18T08:02:52Z","published":"2023-08-18T08:02:52Z","title":"A tailored Handwritten-Text-Recognition System for Medieval Latin","summary":" The Bavarian Academy of Sciences and Humanities aims to digitize its Medieval\nLatin Dictionary. 
This dictionary entails record cards referring to lemmas in\nmedieval Latin, a low-resource language. A crucial step of the digitization\nprocess is the Handwritten Text Recognition (HTR) of the handwritten lemmas\nfound on these record cards. In our work, we introduce an end-to-end pipeline,\ntailored to the medieval Latin dictionary, for locating, extracting, and\ntranscribing the lemmas. We employ two state-of-the-art (SOTA) image\nsegmentation models to prepare the initial data set for the HTR task.\nFurthermore, we experiment with different transformer-based models and conduct\na set of experiments to explore the capabilities of different combinations of\nvision encoders with a GPT-2 decoder. Additionally, we also apply extensive\ndata augmentation resulting in a highly competitive model. The best-performing\nsetup achieved a Character Error Rate (CER) of 0.015, which is even superior to\nthe commercial Google Cloud Vision model, and shows more stable performance.\n","authors":["Philipp Koch","Gilary Vera Nuñez","Esteban Garces Arias","Christian Heumann","Matthias Schöffel","Alexander Häberlin","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2308.09368v1.pdf","comment":"This paper has been accepted at the First Workshop on Ancient\n Language Processing, co-located with RANLP 2023. This is the author's version\n of the work. The definite version of record will be published in the\n proceedings"},{"id":"http://arxiv.org/abs/2307.08487v2","updated":"2023-08-18T07:52:53Z","published":"2023-07-17T13:49:52Z","title":"Latent Jailbreak: A Test Suite for Evaluating Both Text Safety and\n Output Robustness of Large Language Models","summary":" Considerable research efforts have been devoted to ensuring that large\nlanguage models (LLMs) align with human values and generate safe text. However,\nan excessive focus on sensitivity to certain topics can compromise the model's\nrobustness in following instructions, thereby impacting its overall performance\nin completing tasks. Previous benchmarks for jailbreaking LLMs have primarily\nfocused on evaluating the safety of the models without considering their\nrobustness. In this paper, we propose a benchmark that assesses both the safety\nand robustness of LLMs, emphasizing the need for a balanced approach. To\ncomprehensively study text safety and output robustness, we introduce a latent\njailbreak prompt dataset, each involving malicious instruction embedding.\nSpecifically, we instruct the model to complete a regular task, such as\ntranslation, with the text to be translated containing malicious instructions.\nTo further analyze safety and robustness, we design a hierarchical annotation\nframework. We present a systematic analysis of the safety and robustness of\nLLMs regarding the position of explicit normal instructions, word replacements\n(verbs in explicit normal instructions, target groups in malicious\ninstructions, cue words for explicit normal instructions), and instruction\nreplacements (different explicit normal instructions). Our results demonstrate\nthat current LLMs not only prioritize certain instruction verbs but also\nexhibit varying jailbreak rates for different instruction verbs in explicit\nnormal instructions. 
Code and data are available at\nhttps://github.com/qiuhuachuan/latent-jailbreak.\n","authors":["Huachuan Qiu","Shuai Zhang","Anqi Li","Hongliang He","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2307.08487v2.pdf","comment":"Code and data are available at\n https://github.com/qiuhuachuan/latent-jailbreak"},{"id":"http://arxiv.org/abs/2308.09354v1","updated":"2023-08-18T07:31:13Z","published":"2023-08-18T07:31:13Z","title":"Accelerated materials language processing enabled by GPT","summary":" Materials language processing (MLP) is one of the key facilitators of\nmaterials science research, as it enables the extraction of structured\ninformation from massive materials science literature. Prior works suggested\nhigh-performance MLP models for text classification, named entity recognition\n(NER), and extractive question answering (QA), which require complex model\narchitectures, exhaustive fine-tuning, and a large number of human-labelled\ndatasets. In this study, we develop generative pretrained transformer\n(GPT)-enabled pipelines where the complex architectures of prior MLP models are\nreplaced with strategic designs of prompt engineering. First, we develop a\nGPT-enabled document classification method for screening relevant documents,\nachieving accuracy and reliability comparable to prior models, with\nonly a small dataset. Secondly, for the NER task, we design entity-centric\nprompts, and few-shot learning with them improved the performance on most\nentities in three open datasets. Finally, we develop a GPT-enabled extractive\nQA model, which provides improved performance and shows the possibility of\nautomatically correcting annotations. While our findings confirm the potential\nof GPT-enabled MLP models as well as their value in terms of reliability and\npracticability, our scientific methods and systematic approach are applicable\nto any materials science domain to accelerate information extraction from\nscientific literature.\n","authors":["Jaewoong Choi","Byungju Lee"],"pdf_url":"https://arxiv.org/pdf/2308.09354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09341v1","updated":"2023-08-18T06:59:55Z","published":"2023-08-18T06:59:55Z","title":"Document Automation Architectures: Updated Survey in Light of Large\n Language Models","summary":" This paper surveys the current state of the art in document automation (DA).\nThe objective of DA is to reduce the manual effort during the generation of\ndocuments by automatically creating and integrating input from different\nsources and assembling documents conforming to defined templates. There have\nbeen reviews of commercial solutions of DA, particularly in the legal domain,\nbut to date there has been no comprehensive review of the academic research on\nDA architectures and technologies. The current survey of DA reviews the\nacademic literature and provides a clearer definition and characterization of\nDA and its features, identifies state-of-the-art DA architectures and\ntechnologies in academic research, and provides ideas that can lead to new\nresearch opportunities within the DA field in light of recent advances in\ngenerative AI and large language models.\n","authors":["Mohammad Ahmadi Achachlouei","Omkar Patil","Tarun Joshi","Vijayan N. Nair"],"pdf_url":"https://arxiv.org/pdf/2308.09341v1.pdf","comment":"The current paper is the updated version of an earlier survey on\n document automation [Ahmadi Achachlouei et al. 2021]. 
Updates in the current\n paper are as follows: We shortened almost all sections to reduce the size of\n the main paper (without references) from 28 pages to 10 pages, added a review\n of selected papers on large language models, removed certain sections and\n most of diagrams. arXiv admin note: substantial text overlap with\n arXiv:2109.11603"},{"id":"http://arxiv.org/abs/2308.09329v1","updated":"2023-08-18T06:10:11Z","published":"2023-08-18T06:10:11Z","title":"KESDT: knowledge enhanced shallow and deep Transformer for detecting\n adverse drug reactions","summary":" Adverse drug reaction (ADR) detection is an essential task in the medical\nfield, as ADRs have a gravely detrimental impact on patients' health and the\nhealthcare system. Due to a large number of people sharing information on\nsocial media platforms, an increasing number of efforts focus on social media\ndata to carry out effective ADR detection. Despite having achieved impressive\nperformance, the existing methods of ADR detection still suffer from three main\nchallenges. Firstly, researchers have consistently ignored the interaction\nbetween domain keywords and other words in the sentence. Secondly, social media\ndatasets suffer from the challenges of low annotated data. Thirdly, the issue\nof sample imbalance is commonly observed in social media datasets. To solve\nthese challenges, we propose the Knowledge Enhanced Shallow and Deep\nTransformer(KESDT) model for ADR detection. Specifically, to cope with the\nfirst issue, we incorporate the domain keywords into the Transformer model\nthrough a shallow fusion manner, which enables the model to fully exploit the\ninteractive relationships between domain keywords and other words in the\nsentence. To overcome the low annotated data, we integrate the synonym sets\ninto the Transformer model through a deep fusion manner, which expands the size\nof the samples. To mitigate the impact of sample imbalance, we replace the\nstandard cross entropy loss function with the focal loss function for effective\nmodel training. We conduct extensive experiments on three public datasets\nincluding TwiMed, Twitter, and CADEC. The proposed KESDT outperforms\nstate-of-the-art baselines on F1 values, with relative improvements of 4.87%,\n47.83%, and 5.73% respectively, which demonstrates the effectiveness of our\nproposed KESDT.\n","authors":["Yunzhi Qiu","Xiaokun Zhang","Weiwei Wang","Tongxuan Zhang","Bo Xu","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.09329v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09311v1","updated":"2023-08-18T05:19:03Z","published":"2023-08-18T05:19:03Z","title":"Lip Reading for Low-resource Languages by Learning and Combining General\n Speech Knowledge and Language-specific Knowledge","summary":" This paper proposes a novel lip reading framework, especially for\nlow-resource languages, which has not been well addressed in the previous\nliterature. Since low-resource languages do not have enough video-text paired\ndata to train the model to have sufficient power to model lip movements and\nlanguage, it is regarded as challenging to develop lip reading models for\nlow-resource languages. In order to mitigate the challenge, we try to learn\ngeneral speech knowledge, the ability to model lip movements, from a\nhigh-resource language through the prediction of speech units. It is known that\ndifferent languages partially share common phonemes, thus general speech\nknowledge learned from one language can be extended to other languages. 
Then,\nwe try to learn language-specific knowledge, the ability to model language, by\nproposing Language-specific Memory-augmented Decoder (LMDecoder). LMDecoder\nsaves language-specific audio features into memory banks and can be trained on\naudio-text paired data which is more easily accessible than video-text paired\ndata. Therefore, with LMDecoder, we can transform the input speech units into\nlanguage-specific audio features and translate them into texts by utilizing the\nlearned rich language knowledge. Finally, by combining general speech knowledge\nand language-specific knowledge, we can efficiently develop lip reading models\neven for low-resource languages. Through extensive experiments using five\nlanguages, English, Spanish, French, Italian, and Portuguese, the effectiveness\nof the proposed method is evaluated.\n","authors":["Minsu Kim","Jeong Hun Yeo","Jeongsoo Choi","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.09311v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.12725v2","updated":"2023-08-18T05:12:12Z","published":"2023-06-22T07:57:19Z","title":"Generative Multimodal Entity Linking","summary":" Multimodal Entity Linking (MEL) is the task of mapping mentions with\nmultimodal contexts to the referent entities from a knowledge base (e.g.\nWikipedia). Existing MEL methods mainly focus on designing complex multimodal\ninteraction mechanisms and require fine-tuning all model parameters, which can\nbe prohibitively costly and difficult to scale in the era of Large Language\nModels (LLMs). In this work, we propose GEMEL, a simple yet effective\nGenerative Multimodal Entity Linking framework based on LLMs, which directly\ngenerates target entity names. We keep the vision and language model frozen and\nonly train a feature mapper to enable cross-modality interactions. To adapt\nLLMs to the MEL task, we take advantage of the emergent in-context learning\ncapability of LLMs by retrieving multimodal instances as demonstrations.\nExtensive experiments show that, with only ~0.3% of the model parameters\nfine-tuned, GEMEL achieves state-of-the-art results on two well-established MEL\ndatasets (7.7% accuracy gains on WikiDiverse and 8.8% accuracy gains on\nWikiMEL). The performance gain stems from mitigating the popularity bias of LLM\npredictions and disambiguating less common entities effectively. Further\nanalysis verifies the generality and scalability of GEMEL. Our approach is\ncompatible with any off-the-shelf language model, paving the way towards an\nefficient and general solution for utilizing LLMs in the MEL task.\n","authors":["Senbao Shi","Zhenran Xu","Baotian Hu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.12725v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09308v1","updated":"2023-08-18T05:05:35Z","published":"2023-08-18T05:05:35Z","title":"Differentiable Retrieval Augmentation via Generative Language Modeling\n for E-commerce Query Intent Classification","summary":" Retrieval augmentation, which enhances downstream models by a knowledge\nretriever and an external corpus instead of by merely increasing the number of\nmodel parameters, has been successfully applied to many natural language\nprocessing (NLP) tasks such as text classification, question answering and so\non. 
However, existing methods train the\nretriever and downstream model separately or asynchronously, mainly due to the non-differentiability between\nthe two parts, which usually leads to degraded performance compared to end-to-end\njoint training.\n","authors":["Chenyu Zhao","Yunjiang Jiang","Yiming Qiu","Han Zhang","Wen-Yun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09308v1.pdf","comment":"5 pages, 2 figures; accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2307.12507v2","updated":"2023-08-18T03:07:02Z","published":"2023-07-24T03:44:17Z","title":"Gradient-Based Word Substitution for Obstinate Adversarial Examples\n Generation in Language Models","summary":" In this paper, we study the problem of generating obstinate (over-stability)\nadversarial examples by word substitution in NLP, where input text is\nmeaningfully changed but the model's prediction does not, even though it\nshould. Previous word substitution approaches have predominantly focused on\nmanually designed antonym-based strategies for generating obstinate adversarial\nexamples, which hinders their application, as these strategies can only find a\nsubset of obstinate adversarial examples and require human effort. To address\nthis issue, in this paper, we introduce a novel word substitution method named\nGradObstinate, a gradient-based approach that automatically generates obstinate\nadversarial examples without any constraints on the search space or the need\nfor manual design principles. To empirically evaluate the efficacy of\nGradObstinate, we conduct comprehensive experiments on five representative\nmodels (Electra, ALBERT, Roberta, DistillBERT, and CLIP) finetuned on four NLP\nbenchmarks (SST-2, MRPC, SNLI, and SQuAD) and a language-grounding benchmark\n(MSCOCO). Extensive experiments show that our proposed GradObstinate generates\nmore powerful obstinate adversarial examples, exhibiting a higher attack\nsuccess rate compared to antonym-based methods. Furthermore, to show the\ntransferability of obstinate word substitutions found by GradObstinate, we\nreplace the words in four representative NLP benchmarks with their obstinate\nsubstitutions. Notably, obstinate substitutions exhibit a high success rate\nwhen transferred to other models in black-box settings, including even GPT-3\nand ChatGPT. Examples of obstinate adversarial examples found by GradObstinate\nare available at https://huggingface.co/spaces/anonauthors/SecretLanguage.\n","authors":["Yimu Wang","Peng Shi","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.12507v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.09217v1","updated":"2023-08-18T00:26:05Z","published":"2023-08-18T00:26:05Z","title":"Conversational Ontology Alignment with ChatGPT","summary":" This study evaluates the applicability and efficiency of ChatGPT for ontology\nalignment using a naive approach. ChatGPT's output is compared to the results\nof the Ontology Alignment Evaluation Initiative 2022 campaign using conference\ntrack ontologies. This comparison is intended to provide insights into the\ncapabilities of a conversational large language model when used in a naive way\nfor ontology matching, and to investigate the potential advantages and\ndisadvantages of this approach.\n","authors":["Sanaz Saki Norouzi","Mohammad Saeid Mahdavinejad","Pascal Hitzler"],"pdf_url":"https://arxiv.org/pdf/2308.09217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09778v4","updated":"2023-08-18T23:43:42Z","published":"2022-11-17T18:52:19Z","title":"I Can't Believe There's No Images! 
Learning Visual Tasks Using only\n Language Supervision","summary":" Many high-level skills that are required for computer vision tasks, such as\nparsing questions, comparing and contrasting semantics, and writing\ndescriptions, are also required in other domains such as natural language\nprocessing. In this paper, we ask whether it is possible to learn those skills\nfrom text data and then transfer them to vision tasks without ever training on\nvisual training data. Key to our approach is exploiting the joint embedding\nspace of contrastively trained vision and language encoders. In practice, there\ncan be systematic differences between embedding spaces for different modalities\nin contrastive models, and we analyze how these differences affect our approach\nand study strategies to mitigate this concern. We produce models using only\ntext training data on four representative tasks: image captioning, visual\nentailment, visual question answering and visual news captioning, and evaluate\nthem on standard benchmarks using images. We find these models perform close to\nmodels trained on images, while surpassing prior work for captioning and visual\nentailment in this text-only setting by over 9 points, and outperforming all\nprior work on visual news by over 30 points. We also showcase a variety of\nstylistic image captioning models that are trained using no image data and no\nhuman-curated language data, but instead using readily-available text data from\nbooks, the web, or language models.\n","authors":["Sophia Gu","Christopher Clark","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2211.09778v4.pdf","comment":"website (https://prior.allenai.org/projects/close), code\n (https://github.com/allenai/close)"},{"id":"http://arxiv.org/abs/2308.09853v1","updated":"2023-08-18T23:07:29Z","published":"2023-08-18T23:07:29Z","title":"How susceptible are LLMs to Logical Fallacies?","summary":" This paper investigates the rational thinking capability of Large Language\nModels (LLMs) in multi-round argumentative debates by exploring the impact of\nfallacious arguments on their logical reasoning performance. More specifically,\nwe present Logic Competence Measurement Benchmark (LOGICOM), a diagnostic\nbenchmark to assess the robustness of LLMs against logical fallacies. LOGICOM\ninvolves two agents: a persuader and a debater engaging in a multi-round debate\non a controversial topic, where the persuader tries to convince the debater of\nthe correctness of its claim. First, LOGICOM assesses the potential of LLMs to\nchange their opinions through reasoning. Then, it evaluates the debater's\nperformance in logical reasoning by contrasting the scenario where the\npersuader employs logical fallacies against one where logical reasoning is\nused. We use this benchmark to evaluate the performance of GPT-3.5 and GPT-4\nusing a dataset containing controversial topics, claims, and reasons supporting\nthem. Our findings indicate that both GPT-3.5 and GPT-4 can adjust their\nopinion through reasoning. However, when presented with logical fallacies,\nGPT-3.5 and GPT-4 are erroneously convinced 41% and 69% more often,\nrespectively, compared to when logical reasoning is used. Finally, we introduce\na new dataset containing over 5k pairs of logical vs. fallacious arguments. The\nsource code and dataset of this work are made publicly available.\n","authors":["Amirreza Payandeh","Dan Pluth","Jordan Hosier","Xuesu Xiao","Vijay K. 
Gurbani"],"pdf_url":"https://arxiv.org/pdf/2308.09853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12950v2","updated":"2023-08-18T22:16:51Z","published":"2023-07-24T17:23:22Z","title":"RLCD: Reinforcement Learning from Contrast Distillation for Language\n Model Alignment","summary":" We propose Reinforcement Learning from Contrast Distillation (RLCD), a method\nfor aligning language models to follow natural language principles without\nusing human feedback. RLCD trains a preference model using simulated preference\npairs that contain both a high-quality and low-quality example, generated using\ncontrasting positive and negative prompts. The preference model is then used to\nimprove a base unaligned language model via reinforcement learning.\nEmpirically, RLCD outperforms RLAIF (Bai et al., 2022b) and context\ndistillation (Huang et al., 2022) baselines across three diverse alignment\ntasks--harmlessness, helpfulness, and story outline generation--and on both 7B\nand 30B model scales for preference data simulation.\n","authors":["Kevin Yang","Dan Klein","Asli Celikyilmaz","Nanyun Peng","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2307.12950v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.13243v6","updated":"2023-08-18T21:14:43Z","published":"2022-07-27T01:59:13Z","title":"Toward Transparent AI: A Survey on Interpreting the Inner Structures of\n Deep Neural Networks","summary":" The last decade of machine learning has seen drastic increases in scale and\ncapabilities. Deep neural networks (DNNs) are increasingly being deployed in\nthe real world. However, they are difficult to analyze, raising concerns about\nusing them without a rigorous understanding of how they function. Effective\ntools for interpreting them will be important for building more trustworthy AI\nby helping to identify problems, fix bugs, and improve basic understanding. In\nparticular, \"inner\" interpretability techniques, which focus on explaining the\ninternal components of DNNs, are well-suited for developing a mechanistic\nunderstanding, guiding manual modifications, and reverse engineering solutions.\n Much recent work has focused on DNN interpretability, and rapid progress has\nthus far made a thorough systematization of methods difficult. In this survey,\nwe review over 300 works with a focus on inner interpretability tools. We\nintroduce a taxonomy that classifies methods by what part of the network they\nhelp to explain (weights, neurons, subnetworks, or latent representations) and\nwhether they are implemented during (intrinsic) or after (post hoc) training.\nTo our knowledge, we are also the first to survey a number of connections\nbetween interpretability research and work in adversarial robustness, continual\nlearning, modularity, network compression, and studying the human visual\nsystem. We discuss key challenges and argue that the status quo in\ninterpretability research is largely unproductive. 
Finally, we highlight the\nimportance of future work that emphasizes diagnostics, debugging, adversaries,\nand benchmarking in order to make interpretability tools more useful to\nengineers in practical applications.\n","authors":["Tilman Räuker","Anson Ho","Stephen Casper","Dylan Hadfield-Menell"],"pdf_url":"https://arxiv.org/pdf/2207.13243v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09810v1","updated":"2023-08-18T20:33:06Z","published":"2023-08-18T20:33:06Z","title":"An Image is Worth a Thousand Toxic Words: A Metamorphic Testing\n Framework for Content Moderation Software","summary":" The exponential growth of social media platforms has brought about a\nrevolution in communication and content dissemination in human society.\nNevertheless, these platforms are being increasingly misused to spread toxic\ncontent, including hate speech, malicious advertising, and pornography, leading\nto severe negative consequences such as harm to teenagers' mental health.\nDespite tremendous efforts in developing and deploying textual and image\ncontent moderation methods, malicious users can evade moderation by embedding\ntexts into images, such as screenshots of the text, usually with some\ninterference. We find that modern content moderation software's performance\nagainst such malicious inputs remains underexplored. In this work, we propose\nOASIS, a metamorphic testing framework for content moderation software. OASIS\nemploys 21 transform rules summarized from our pilot study on 5,000 real-world\ntoxic contents collected from 4 popular social media applications, including\nTwitter, Instagram, Sina Weibo, and Baidu Tieba. Given toxic textual contents,\nOASIS can generate image test cases, which preserve the toxicity yet are likely\nto bypass moderation. In the evaluation, we employ OASIS to test five\ncommercial textual content moderation software from famous companies (i.e.,\nGoogle Cloud, Microsoft Azure, Baidu Cloud, Alibaba Cloud and Tencent Cloud),\nas well as a state-of-the-art moderation research model. The results show that\nOASIS achieves up to 100% error finding rates. Moreover, through retraining the\nmodels with the test cases generated by OASIS, the robustness of the moderation\nmodel can be improved without performance degradation.\n","authors":["Wenxuan Wang","Jingyuan Huang","Jen-tse Huang","Chang Chen","Jiazhen Gu","Pinjia He","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2308.09810v1.pdf","comment":"Accepted by ASE 2023. arXiv admin note: substantial text overlap with\n arXiv:2302.05706"},{"id":"http://arxiv.org/abs/2308.09804v1","updated":"2023-08-18T20:18:30Z","published":"2023-08-18T20:18:30Z","title":"VL-PET: Vision-and-Language Parameter-Efficient Tuning via Granularity\n Control","summary":" As the model size of pre-trained language models (PLMs) grows rapidly, full\nfine-tuning becomes prohibitively expensive for model training and storage. In\nvision-and-language (VL), parameter-efficient tuning (PET) techniques are\nproposed to integrate modular modifications (e.g., Adapter and LoRA) into\nencoder-decoder PLMs. By tuning a small set of trainable parameters, these\ntechniques perform on par with full fine-tuning. However, excessive modular\nmodifications and neglecting the functionality gap between the encoders and\ndecoders can lead to performance degradation, while existing PET techniques\n(e.g., VL-Adapter) overlook these critical issues. 
In this paper, we propose a\nVision-and-Language Parameter-Efficient Tuning (VL-PET) framework to impose\neffective control over modular modifications via a novel granularity-controlled\nmechanism. Considering different granularity-controlled matrices generated by\nthis mechanism, a variety of model-agnostic VL-PET modules can be instantiated\nfrom our framework for better efficiency and effectiveness trade-offs. We\nfurther propose lightweight PET module designs to enhance VL alignment and\nmodeling for the encoders and maintain text generation for the decoders.\nExtensive experiments conducted on four image-text tasks and four video-text\ntasks demonstrate the efficiency, effectiveness and transferability of our\nVL-PET framework. In particular, our VL-PET-large with lightweight PET module\ndesigns significantly outperforms VL-Adapter by 2.92% (3.41%) and LoRA by 3.37%\n(7.03%) with BART-base (T5-base) on image-text tasks. Furthermore, we validate\nthe enhanced effect of employing our VL-PET designs on existing PET techniques,\nenabling them to achieve significant performance improvements. Our code is\navailable at https://github.com/HenryHZY/VL-PET.\n","authors":["Zi-Yuan Hu","Yanyang Li","Michael R. Lyu","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09804v1.pdf","comment":"ICCV 2023 (17 pages, 6 figures, 22 tables)"},{"id":"http://arxiv.org/abs/2304.09991v2","updated":"2023-08-18T20:09:46Z","published":"2023-04-19T21:59:04Z","title":"Supporting Human-AI Collaboration in Auditing LLMs with LLMs","summary":" Large language models are becoming increasingly pervasive and ubiquitous in\nsociety via deployment in sociotechnical systems. Yet these language models, be\nit for classification or generation, have been shown to be biased and behave\nirresponsibly, causing harm to people at scale. It is crucial to audit these\nlanguage models rigorously. Existing auditing tools leverage either or both\nhumans and AI to find failures. In this work, we draw upon literature in\nhuman-AI collaboration and sensemaking, and conduct interviews with research\nexperts in safe and fair AI, to build upon the auditing tool: AdaTest (Ribeiro\nand Lundberg, 2022), which is powered by a generative large language model\n(LLM). Through the design process we highlight the importance of sensemaking\nand human-AI communication to leverage complementary strengths of humans and\ngenerative models in collaborative auditing. To evaluate the effectiveness of\nthe augmented tool, AdaTest++, we conduct user studies with participants\nauditing two commercial language models: OpenAI's GPT-3 and Azure's sentiment\nanalysis model. Qualitative analysis shows that AdaTest++ effectively leverages\nhuman strengths such as schematization, hypothesis formation and testing.\nFurther, with our tool, participants identified a variety of failures modes,\ncovering 26 different topics over 2 tasks, that have been shown before in\nformal audits and also those previously under-reported.\n","authors":["Charvi Rastogi","Marco Tulio Ribeiro","Nicholas King","Saleema Amershi"],"pdf_url":"https://arxiv.org/pdf/2304.09991v2.pdf","comment":"21 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.02180v3","updated":"2023-08-18T20:05:45Z","published":"2023-08-04T07:51:15Z","title":"Scaling Clinical Trial Matching Using Large Language Models: A Case\n Study in Oncology","summary":" Clinical trial matching is a key process in health delivery and discovery. 
In\npractice, it is plagued by overwhelming unstructured data and unscalable manual\nprocessing. In this paper, we conduct a systematic study on scaling clinical\ntrial matching using large language models (LLMs), with oncology as the focus\narea. Our study is grounded in a clinical trial matching system currently in\ntest deployment at a large U.S. health network. Initial findings are promising:\nout of the box, cutting-edge LLMs, such as GPT-4, can already structure elaborate\neligibility criteria of clinical trials and extract complex matching logic\n(e.g., nested AND/OR/NOT). While still far from perfect, LLMs substantially\noutperform prior strong baselines and may serve as a preliminary solution to\nhelp triage patient-trial candidates with humans in the loop. Our study also\nreveals a few significant growth areas for applying LLMs to end-to-end clinical\ntrial matching, such as context limitation and accuracy, especially in\nstructuring patient information from longitudinal medical records.\n","authors":["Cliff Wong","Sheng Zhang","Yu Gu","Christine Moung","Jacob Abel","Naoto Usuyama","Roshanthi Weerasinghe","Brian Piening","Tristan Naumann","Carlo Bifulco","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.02180v3.pdf","comment":"24 pages, 5 figures, accepted at Machine Learning for Healthcare\n (MLHC) 2023"},{"id":"http://arxiv.org/abs/2308.09778v1","updated":"2023-08-18T18:58:54Z","published":"2023-08-18T18:58:54Z","title":"Towards Grounded Visual Spatial Reasoning in Multi-Modal Vision Language\n Models","summary":" With the advances in large-scale vision-and-language models (VLMs), it is of\ninterest to assess their performance on various visual reasoning tasks such as\ncounting, referring expressions and general visual question answering. The\nfocus of this work is to study the ability of these models to understand\nspatial relations. Previously, this has been tackled using image-text matching\n(Liu, Emerson, and Collier 2022) or the visual question answering task, both\nshowing poor performance and a large gap compared to human performance. To\nbetter understand the gap, we present fine-grained compositional grounding of\nspatial relationships and propose a bottom-up approach for ranking spatial\nclauses and evaluating the performance of the spatial relationship reasoning task.\nWe propose to combine the evidence from grounding noun phrases corresponding to\nobjects and their locations to compute the final rank of the spatial clause. We\ndemonstrate the approach on representative vision-language models (Tan and\nBansal 2019; Gupta et al. 2022; Kamath et al. 2021) and compare and highlight\ntheir abilities to reason about spatial relationships.\n","authors":["Navid Rajabi","Jana Kosecka"],"pdf_url":"https://arxiv.org/pdf/2308.09778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09768v1","updated":"2023-08-18T18:46:47Z","published":"2023-08-18T18:46:47Z","title":"YORC: Yoruba Reading Comprehension dataset","summary":" In this paper, we create YORC: a new multi-choice Yoruba Reading\nComprehension dataset that is based on Yoruba high-school reading comprehension\nexamination. We provide baseline results by performing cross-lingual transfer\nusing the existing English RACE dataset based on a pre-trained encoder-only model.\nAdditionally, we provide results by prompting large language models (LLMs) like\nGPT-4.\n","authors":["Anuoluwapo Aremu","Jesujoba O. 
Alabi","David Ifeoluwa Adelani"],"pdf_url":"https://arxiv.org/pdf/2308.09768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09765v1","updated":"2023-08-18T18:18:55Z","published":"2023-08-18T18:18:55Z","title":"Taken by Surprise: Contrast effect for Similarity Scores","summary":" Accurately evaluating the similarity of object vector embeddings is of\ncritical importance for natural language processing, information retrieval and\nclassification tasks. Popular similarity scores (e.g., cosine similarity) are\nbased on pairs of embedding vectors and disregard the distribution of the\nensemble from which objects are drawn. Human perception of object similarity\nsignificantly depends on the context in which the objects appear. In this work\nwe propose the \\emph{surprise score}, an ensemble-normalized similarity metric\nthat encapsulates the contrast effect of human perception and significantly\nimproves the classification performance on zero- and few-shot document\nclassification tasks. This score quantifies the surprise of finding a given\nsimilarity between two elements relative to the pairwise ensemble similarities.\nWe evaluate this metric on zero-/few-shot classification and clustering tasks\nand typically find 10-15\\% better performance compared to raw cosine\nsimilarity. Our code is available at\nhttps://github.com/MeetElise/surprise-similarity.\n","authors":["Thomas C. Bachlechner","Mario Martone","Marjorie Schillo"],"pdf_url":"https://arxiv.org/pdf/2308.09765v1.pdf","comment":"9 pages, 2 figures and 4 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.09718v1","updated":"2023-08-18T17:59:57Z","published":"2023-08-18T17:59:57Z","title":"Towards Large-scale 3D Representation Learning with Multi-dataset Point\n Prompt Training","summary":" The rapid advancement of deep learning models is often attributed to their\nability to leverage massive training data. In contrast, such privilege has not\nyet fully benefited 3D deep learning, mainly due to the limited availability of\nlarge-scale 3D datasets. Merging multiple available data sources and letting\nthem collaboratively train a single model is a potential solution. However, due\nto the large domain gap between 3D point cloud datasets, such mixed supervision\ncould adversely affect the model's performance and lead to degenerated\nperformance (i.e., negative transfer) compared to single-dataset training. In\nview of this challenge, we introduce Point Prompt Training (PPT), a novel\nframework for multi-dataset synergistic learning in the context of 3D\nrepresentation learning that supports multiple pre-training paradigms. Based on\nthis framework, we propose Prompt-driven Normalization, which adapts the model\nto different datasets with domain-specific prompts and Language-guided\nCategorical Alignment that decently unifies the multiple-dataset label spaces\nby leveraging the relationship between label text. Extensive experiments verify\nthat PPT can overcome the negative transfer associated with synergistic\nlearning and produce generalizable representations. Notably, it achieves\nstate-of-the-art performance on each dataset using a single weight-shared model\nwith supervised multi-dataset training. 
Moreover, when served as a pre-training\nframework, it outperforms other pre-training approaches regarding\nrepresentation quality and attains remarkable state-of-the-art performance\nacross over ten diverse downstream tasks spanning both indoor and outdoor 3D\nscenarios.\n","authors":["Xiaoyang Wu","Zhuotao Tian","Xin Wen","Bohao Peng","Xihui Liu","Kaicheng Yu","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.09718v1.pdf","comment":"Code available at Pointcept (https://github.com/Pointcept/Pointcept)"},{"id":"http://arxiv.org/abs/2308.09717v1","updated":"2023-08-18T17:59:53Z","published":"2023-08-18T17:59:53Z","title":"Smoothness Similarity Regularization for Few-Shot GAN Adaptation","summary":" The task of few-shot GAN adaptation aims to adapt a pre-trained GAN model to\na small dataset with very few training images. While existing methods perform\nwell when the dataset for pre-training is structurally similar to the target\ndataset, the approaches suffer from training instabilities or memorization\nissues when the objects in the two domains have a very different structure. To\nmitigate this limitation, we propose a new smoothness similarity regularization\nthat transfers the inherently learned smoothness of the pre-trained GAN to the\nfew-shot target domain even if the two domains are very different. We evaluate\nour approach by adapting an unconditional and a class-conditional GAN to\ndiverse few-shot target domains. Our proposed method significantly outperforms\nprior few-shot GAN adaptation methods in the challenging case of structurally\ndissimilar source-target domains, while performing on par with the state of the\nart for similar source-target domains.\n","authors":["Vadim Sushko","Ruyu Wang","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2308.09717v1.pdf","comment":"International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.09716v1","updated":"2023-08-18T17:59:40Z","published":"2023-08-18T17:59:40Z","title":"Diff2Lip: Audio Conditioned Diffusion Models for Lip-Synchronization","summary":" The task of lip synchronization (lip-sync) seeks to match the lips of human\nfaces with different audio. It has various applications in the film industry as\nwell as for creating virtual avatars and for video conferencing. This is a\nchallenging problem as one needs to simultaneously introduce detailed,\nrealistic lip movements while preserving the identity, pose, emotions, and\nimage quality. Many of the previous methods trying to solve this problem suffer\nfrom image quality degradation due to a lack of complete contextual\ninformation. In this paper, we present Diff2Lip, an audio-conditioned\ndiffusion-based model which is able to do lip synchronization in-the-wild while\npreserving these qualities. We train our model on Voxceleb2, a video dataset\ncontaining in-the-wild talking face videos. Extensive studies show that our\nmethod outperforms popular methods like Wav2Lip and PC-AVS in Fr\\'echet\ninception distance (FID) metric and Mean Opinion Scores (MOS) of the users. We\nshow results on both reconstruction (same audio-video inputs) as well as cross\n(different audio-video inputs) settings on Voxceleb2 and LRW datasets. Video\nresults and code can be accessed from our project page (\nhttps://soumik-kanad.github.io/diff2lip ).\n","authors":["Soumik Mukhopadhyay","Saksham Suri","Ravi Teja Gadde","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2308.09716v1.pdf","comment":"Website: see https://soumik-kanad.github.io/diff2lip . 
Submission\n under review"},{"id":"http://arxiv.org/abs/2308.09713v1","updated":"2023-08-18T17:59:21Z","published":"2023-08-18T17:59:21Z","title":"Dynamic 3D Gaussians: Tracking by Persistent Dynamic View Synthesis","summary":" We present a method that simultaneously addresses the tasks of dynamic scene\nnovel-view synthesis and six degree-of-freedom (6-DOF) tracking of all dense\nscene elements. We follow an analysis-by-synthesis framework, inspired by\nrecent work that models scenes as a collection of 3D Gaussians which are\noptimized to reconstruct input images via differentiable rendering. To model\ndynamic scenes, we allow Gaussians to move and rotate over time while enforcing\nthat they have persistent color, opacity, and size. By regularizing Gaussians'\nmotion and rotation with local-rigidity constraints, we show that our Dynamic\n3D Gaussians correctly model the same area of physical space over time,\nincluding the rotation of that space. Dense 6-DOF tracking and dynamic\nreconstruction emerges naturally from persistent dynamic view synthesis,\nwithout requiring any correspondence or flow as input. We demonstrate a large\nnumber of downstream applications enabled by our representation, including\nfirst-person view synthesis, dynamic compositional scene synthesis, and 4D\nvideo editing.\n","authors":["Jonathon Luiten","Georgios Kopanas","Bastian Leibe","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2308.09713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09712v1","updated":"2023-08-18T17:59:04Z","published":"2023-08-18T17:59:04Z","title":"HumanLiff: Layer-wise 3D Human Generation with Diffusion Model","summary":" 3D human generation from 2D images has achieved remarkable progress through\nthe synergistic utilization of neural rendering and generative models. Existing\n3D human generative models mainly generate a clothed 3D human as an\nundetectable 3D model in a single pass, while rarely considering the layer-wise\nnature of a clothed human body, which often consists of the human body and\nvarious clothes such as underwear, outerwear, trousers, shoes, etc. In this\nwork, we propose HumanLiff, the first layer-wise 3D human generative model with\na unified diffusion process. Specifically, HumanLiff firstly generates\nminimal-clothed humans, represented by tri-plane features, in a canonical\nspace, and then progressively generates clothes in a layer-wise manner. In this\nway, the 3D human generation is thus formulated as a sequence of\ndiffusion-based 3D conditional generation. To reconstruct more fine-grained 3D\nhumans with tri-plane representation, we propose a tri-plane shift operation\nthat splits each tri-plane into three sub-planes and shifts these sub-planes to\nenable feature grid subdivision. To further enhance the controllability of 3D\ngeneration with 3D layered conditions, HumanLiff hierarchically fuses tri-plane\nfeatures and 3D layered conditions to facilitate the 3D diffusion model\nlearning. Extensive experiments on two layer-wise 3D human datasets, SynBody\n(synthetic) and TightCap (real-world), validate that HumanLiff significantly\noutperforms state-of-the-art methods in layer-wise 3D human generation. 
Our\ncode will be available at https://skhu101.github.io/HumanLiff.\n","authors":["Shoukang Hu","Fangzhou Hong","Tao Hu","Liang Pan","Haiyi Mei","Weiye Xiao","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09712v1.pdf","comment":"Project page: https://skhu101.github.io/HumanLiff/"},{"id":"http://arxiv.org/abs/2308.09711v1","updated":"2023-08-18T17:59:01Z","published":"2023-08-18T17:59:01Z","title":"Robust Monocular Depth Estimation under Challenging Conditions","summary":" While state-of-the-art monocular depth estimation approaches achieve\nimpressive results in ideal settings, they are highly unreliable under\nchallenging illumination and weather conditions, such as at nighttime or in the\npresence of rain. In this paper, we uncover these safety-critical issues and\ntackle them with md4all: a simple and effective solution that works reliably\nunder both adverse and ideal conditions, as well as for different types of\nlearning supervision. We achieve this by exploiting the efficacy of existing\nmethods under perfect settings. Therefore, we provide valid training signals\nindependently of what is in the input. First, we generate a set of complex\nsamples corresponding to the normal training ones. Then, we train the model by\nguiding its self- or full-supervision by feeding the generated samples and\ncomputing the standard losses on the corresponding original images. Doing so\nenables a single model to recover information across diverse conditions without\nmodifications at inference time. Extensive experiments on two challenging\npublic datasets, namely nuScenes and Oxford RobotCar, demonstrate the\neffectiveness of our techniques, outperforming prior works by a large margin in\nboth standard and challenging conditions. Source code and data are available\nat: https://md4all.github.io.\n","authors":["Stefano Gasperini","Nils Morbitzer","HyunJun Jung","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.09711v1.pdf","comment":"ICCV 2023. Source code and data: https://md4all.github.io"},{"id":"http://arxiv.org/abs/2308.09710v1","updated":"2023-08-18T17:58:44Z","published":"2023-08-18T17:58:44Z","title":"SimDA: Simple Diffusion Adapter for Efficient Video Generation","summary":" The recent wave of AI-generated content has witnessed the great development\nand success of Text-to-Image (T2I) technologies. By contrast, Text-to-Video\n(T2V) still falls short of expectations though attracting increasing interests.\nExisting works either train from scratch or adapt large T2I model to videos,\nboth of which are computation and resource expensive. In this work, we propose\na Simple Diffusion Adapter (SimDA) that fine-tunes only 24M out of 1.1B\nparameters of a strong T2I model, adapting it to video generation in a\nparameter-efficient way. In particular, we turn the T2I model for T2V by\ndesigning light-weight spatial and temporal adapters for transfer learning.\nBesides, we change the original spatial attention to the proposed Latent-Shift\nAttention (LSA) for temporal consistency. With similar model architecture, we\nfurther train a video super-resolution model to generate high-definition\n(1024x1024) videos. In addition to T2V generation in the wild, SimDA could also\nbe utilized in one-shot video editing with only 2 minutes tuning. 
Doing so, our\nmethod could minimize the training effort with extremely few tunable parameters\nfor model adaptation.\n","authors":["Zhen Xing","Qi Dai","Han Hu","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.09710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09708v1","updated":"2023-08-18T17:58:10Z","published":"2023-08-18T17:58:10Z","title":"Training with Product Digital Twins for AutoRetail Checkout","summary":" Automating the checkout process is important in smart retail, where users\neffortlessly pass products by hand through a camera, triggering automatic\nproduct detection, tracking, and counting. In this emerging area, due to the\nlack of annotated training data, we introduce a dataset comprised of product 3D\nmodels, which allows for fast, flexible, and large-scale training data\ngeneration through graphic engine rendering. Within this context, we discern an\nintriguing facet, because of the user \"hands-on\" approach, bias in user\nbehavior leads to distinct patterns in the real checkout process. The existence\nof such patterns would compromise training effectiveness if training data fail\nto reflect the same. To address this user bias problem, we propose a training\ndata optimization framework, i.e., training with digital twins (DtTrain).\nSpecifically, we leverage the product 3D models and optimize their rendering\nviewpoint and illumination to generate \"digital twins\" that visually resemble\nrepresentative user images. These digital twins, inherit product labels and,\nwhen augmented, form the Digital Twin training set (DT set). Because the\ndigital twins individually mimic user bias, the resulting DT training set\nbetter reflects the characteristics of the target scenario and allows us to\ntrain more effective product detection and tracking models. In our experiment,\nwe show that DT set outperforms training sets created by existing dataset\nsynthesis methods in terms of counting accuracy. Moreover, by combining DT set\nwith pseudo-labeled real checkout data, further improvement is observed. The\ncode is available at https://github.com/yorkeyao/Automated-Retail-Checkout.\n","authors":["Yue Yao","Xinyu Tian","Zheng Tang","Sujit Biswas","Huan Lei","Tom Gedeon","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.09708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05407v4","updated":"2023-08-18T17:57:13Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen, unknown categories into instances without any prior knowledge about\nthem while performing panoptic segmentation of known classes. 
We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation. Project page:\nhttps://holisticseg.github.io.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v4.pdf","comment":"ICCV 2023. Project page: https://holisticseg.github.io"},{"id":"http://arxiv.org/abs/2308.09705v1","updated":"2023-08-18T17:55:47Z","published":"2023-08-18T17:55:47Z","title":"Guide3D: Create 3D Avatars from Text and Image Guidance","summary":" Recently, text-to-image generation has exhibited remarkable advancements,\nwith the ability to produce visually impressive results. In contrast,\ntext-to-3D generation has not yet reached a comparable level of quality.\nExisting methods primarily rely on text-guided score distillation sampling\n(SDS), and they encounter difficulties in transferring 2D attributes of the\ngenerated images to 3D content. In this work, we aim to develop an effective 3D\ngenerative model capable of synthesizing high-resolution textured meshes by\nleveraging both textual and image information. To this end, we introduce\nGuide3D, a zero-shot text-and-image-guided generative model for 3D avatar\ngeneration based on diffusion models. Our model involves (1) generating\nsparse-view images of a text-consistent character using diffusion models, and\n(2) jointly optimizing multi-resolution differentiable marching tetrahedral\ngrids with pixel-aligned image features. We further propose a similarity-aware\nfeature fusion strategy for efficiently integrating features from different\nviews. Moreover, we introduce two novel training objectives as an alternative\nto calculating SDS, significantly enhancing the optimization process. We\nthoroughly evaluate the performance and components of our framework, which\noutperforms the current state-of-the-art in producing topologically and\nstructurally correct geometry and high-resolution textures. Guide3D enables the\ndirect transfer of 2D-generated images to the 3D space. Our code will be made\npublicly available.\n","authors":["Yukang Cao","Yan-Pei Cao","Kai Han","Ying Shan","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2308.09705v1.pdf","comment":"25 pages, 22 figures"},{"id":"http://arxiv.org/abs/2308.09694v1","updated":"2023-08-18T17:43:12Z","published":"2023-08-18T17:43:12Z","title":"Invariant Training 2D-3D Joint Hard Samples for Few-Shot Point Cloud\n Recognition","summary":" We tackle the data scarcity challenge in few-shot point cloud recognition of\n3D objects by using a joint prediction from a conventional 3D model and a\nwell-trained 2D model. Surprisingly, such an ensemble, though seems trivial,\nhas hardly been shown effective in recent 2D-3D models. We find out the crux is\nthe less effective training for the ''joint hard samples'', which have high\nconfidence prediction on different wrong labels, implying that the 2D and 3D\nmodels do not collaborate well. 
To this end, our proposed invariant training\nstrategy, called InvJoint, does not only emphasize the training more on the\nhard samples, but also seeks the invariance between the conflicting 2D and 3D\nambiguous predictions. InvJoint can learn more collaborative 2D and 3D\nrepresentations for better ensemble. Extensive experiments on 3D shape\nclassification with widely adopted ModelNet10/40, ScanObjectNN and Toys4K, and\nshape retrieval with ShapeNet-Core validate the superiority of our InvJoint.\n","authors":["Xuanyu Yi","Jiajun Deng","Qianru Sun","Xian-Sheng Hua","Joo-Hwee Lim","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09693v1","updated":"2023-08-18T17:41:39Z","published":"2023-08-18T17:41:39Z","title":"A Lightweight Transformer for Faster and Robust EBSD Data Collection","summary":" Three dimensional electron back-scattered diffraction (EBSD) microscopy is a\ncritical tool in many applications in materials science, yet its data quality\ncan fluctuate greatly during the arduous collection process, particularly via\nserial-sectioning. Fortunately, 3D EBSD data is inherently sequential, opening\nup the opportunity to use transformers, state-of-the-art deep learning\narchitectures that have made breakthroughs in a plethora of domains, for data\nprocessing and recovery. To be more robust to errors and accelerate this 3D\nEBSD data collection, we introduce a two step method that recovers missing\nslices in an 3D EBSD volume, using an efficient transformer model and a\nprojection algorithm to process the transformer's outputs. Overcoming the\ncomputational and practical hurdles of deep learning with scarce high\ndimensional data, we train this model using only synthetic 3D EBSD data with\nself-supervision and obtain superior recovery accuracy on real 3D EBSD data,\ncompared to existing methods.\n","authors":["Harry Dong","Sean Donegan","Megna Shah","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2308.09693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09691v2","updated":"2023-08-18T17:17:27Z","published":"2023-04-19T14:32:56Z","title":"DarSwin: Distortion Aware Radial Swin Transformer","summary":" Wide-angle lenses are commonly used in perception tasks requiring a large\nfield of view. Unfortunately, these lenses produce significant distortions\nmaking conventional models that ignore the distortion effects unable to adapt\nto wide-angle images. In this paper, we present a novel transformer-based model\nthat automatically adapts to the distortion produced by wide-angle lenses. We\nleverage the physical characteristics of such lenses, which are analytically\ndefined by the radial distortion profile (assumed to be known), to develop a\ndistortion aware radial swin transformer (DarSwin). In contrast to conventional\ntransformer-based architectures, DarSwin comprises a radial patch partitioning,\na distortion-based sampling technique for creating token embeddings, and an\nangular position encoding for radial patch merging. We validate our method on\nclassification tasks using synthetically distorted ImageNet data and show\nthrough extensive experiments that DarSwin can perform zero-shot adaptation to\nunseen distortions of different wide-angle lenses. Compared to other baselines,\nDarSwin achieves the best results (in terms of Top-1 accuracy) with significant\ngains when trained on bounded levels of distortions (very-low, low, medium, and\nhigh) and tested on all including out-of-distribution distortions. 
The code and\nmodels are publicly available at https://lvsn.github.io/darswin/\n","authors":["Akshaya Athwale","Arman Afrasiyabi","Justin Lague","Ichrak Shili","Ola Ahmad","Jean-Francois Lalonde"],"pdf_url":"https://arxiv.org/pdf/2304.09691v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.09685v1","updated":"2023-08-18T17:13:45Z","published":"2023-08-18T17:13:45Z","title":"Audiovisual Moments in Time: A Large-Scale Annotated Dataset of\n Audiovisual Actions","summary":" We present Audiovisual Moments in Time (AVMIT), a large-scale dataset of\naudiovisual action events. In an extensive annotation task 11 participants\nlabelled a subset of 3-second audiovisual videos from the Moments in Time\ndataset (MIT). For each trial, participants assessed whether the labelled\naudiovisual action event was present and whether it was the most prominent\nfeature of the video. The dataset includes the annotation of 57,177 audiovisual\nvideos, each independently evaluated by 3 of 11 trained participants. From this\ninitial collection, we created a curated test set of 16 distinct action\nclasses, with 60 videos each (960 videos). We also offer 2 sets of pre-computed\naudiovisual feature embeddings, using VGGish/YamNet for audio data and\nVGG16/EfficientNetB0 for visual data, thereby lowering the barrier to entry for\naudiovisual DNN research. We explored the advantages of AVMIT annotations and\nfeature embeddings to improve performance on audiovisual event recognition. A\nseries of 6 Recurrent Neural Networks (RNNs) were trained on either\nAVMIT-filtered audiovisual events or modality-agnostic events from MIT, and\nthen tested on our audiovisual test set. In all RNNs, top 1 accuracy was\nincreased by 2.71-5.94\\% by training exclusively on audiovisual events, even\noutweighing a three-fold increase in training data. We anticipate that the\nnewly annotated AVMIT dataset will serve as a valuable resource for research\nand comparative experiments involving computational models and human\nparticipants, specifically when addressing research questions where audiovisual\ncorrespondence is of critical importance.\n","authors":["Michael Joannou","Pia Rotshtein","Uta Noppeney"],"pdf_url":"https://arxiv.org/pdf/2308.09685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13848v2","updated":"2023-08-18T17:12:13Z","published":"2023-02-27T14:49:53Z","title":"ELITE: Encoding Visual Concepts into Textual Embeddings for Customized\n Text-to-Image Generation","summary":" In addition to the unprecedented ability in imaginary creation, large\ntext-to-image models are expected to take customized concepts in image\ngeneration. Existing works generally learn such concepts in an\noptimization-based manner, yet bringing excessive computation or memory burden.\nIn this paper, we instead propose a learning-based encoder, which consists of a\nglobal and a local mapping networks for fast and accurate customized\ntext-to-image generation. In specific, the global mapping network projects the\nhierarchical features of a given image into multiple new words in the textual\nword embedding space, i.e., one primary word for well-editable concept and\nother auxiliary words to exclude irrelevant disturbances (e.g., background). In\nthe meantime, a local mapping network injects the encoded patch features into\ncross attention layers to provide omitted details, without sacrificing the\neditability of primary concepts. 
We compare our method with existing\noptimization-based approaches on a variety of user-defined concepts, and\ndemonstrate that our method enables high-fidelity inversion and more robust\neditability with a significantly faster encoding process. Our code is publicly\navailable at https://github.com/csyxwei/ELITE.\n","authors":["Yuxiang Wei","Yabo Zhang","Zhilong Ji","Jinfeng Bai","Lei Zhang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2302.13848v2.pdf","comment":"Accepted by ICCV 2023, oral presentation. Code:\n https://github.com/csyxwei/ELITE"},{"id":"http://arxiv.org/abs/2305.15700v2","updated":"2023-08-18T16:59:24Z","published":"2023-05-25T04:16:07Z","title":"Fairness Continual Learning Approach to Semantic Scene Understanding in\n Open-World Environments","summary":" Continual semantic segmentation aims to learn new classes while maintaining\nthe information from the previous classes. Although prior studies have shown\nimpressive progress in recent years, the fairness concern in the continual\nsemantic segmentation needs to be better addressed. Meanwhile, fairness is one\nof the most vital factors in deploying the deep learning model, especially in\nhuman-related or safety applications. In this paper, we present a novel\nFairness Continual Learning approach to the semantic segmentation problem. In\nparticular, under the fairness objective, a new fairness continual learning\nframework is proposed based on class distributions. Then, a novel Prototypical\nContrastive Clustering loss is proposed to address the significant challenges\nin continual learning, i.e., catastrophic forgetting and background shift. Our\nproposed loss has also been proven as a novel, generalized learning paradigm of\nknowledge distillation commonly used in continual learning. Moreover, the\nproposed Conditional Structural Consistency loss further regularized the\nstructural constraint of the predicted segmentation. Our proposed approach has\nachieved State-of-the-Art performance on three standard scene understanding\nbenchmarks, i.e., ADE20K, Cityscapes, and Pascal VOC, and promoted the fairness\nof the segmentation model.\n","authors":["Thanh-Dat Truong","Hoang-Quan Nguyen","Bhiksha Raj","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.15700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09678v1","updated":"2023-08-18T16:57:25Z","published":"2023-08-18T16:57:25Z","title":"PoSynDA: Multi-Hypothesis Pose Synthesis Domain Adaptation for Robust 3D\n Human Pose Estimation","summary":" The current 3D human pose estimators face challenges in adapting to new\ndatasets due to the scarcity of 2D-3D pose pairs in target domain training\nsets. We present the \\textit{Multi-Hypothesis \\textbf{P}ose \\textbf{Syn}thesis\n\\textbf{D}omain \\textbf{A}daptation} (\\textbf{PoSynDA}) framework to overcome\nthis issue without extensive target domain annotation. Utilizing a\ndiffusion-centric structure, PoSynDA simulates the 3D pose distribution in the\ntarget domain, filling the data diversity gap. By incorporating a\nmulti-hypothesis network, it creates diverse pose hypotheses and aligns them\nwith the target domain. Target-specific source augmentation obtains the target\ndomain distribution data from the source domain by decoupling the scale and\nposition parameters. The teacher-student paradigm and low-rank adaptation\nfurther refine the process. 
PoSynDA demonstrates competitive performance on\nbenchmarks, such as Human3.6M, MPI-INF-3DHP, and 3DPW, even comparable with the\ntarget-trained MixSTE model~\\cite{zhang2022mixste}. This work paves the way for\nthe practical application of 3D human pose estimation. The code is available at\nhttps://github.com/hbing-l/PoSynDA.\n","authors":["Hanbing Liu","Jun-Yan He","Zhi-Qi Cheng","Wangmeng Xiang","Qize Yang","Wenhao Chai","Gaoang Wang","Xu Bao","Bin Luo","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2308.09678v1.pdf","comment":"Accepted to ACM Multimedia 2023; 10 pages, 4 figures, 8 tables; the\n code is at https://github.com/hbing-l/PoSynDA"},{"id":"http://arxiv.org/abs/2305.16437v2","updated":"2023-08-18T16:54:02Z","published":"2023-05-25T19:30:21Z","title":"KeyPosS: Plug-and-Play Facial Landmark Detection through GPS-Inspired\n True-Range Multilateration","summary":" In the realm of facial analysis, accurate landmark detection is crucial for\nvarious applications, ranging from face recognition and expression analysis to\nanimation. Conventional heatmap or coordinate regression-based techniques,\nhowever, often face challenges in terms of computational burden and\nquantization errors. To address these issues, we present the KeyPoint\nPositioning System (KeyPosS) - a groundbreaking facial landmark detection\nframework that stands out from existing methods. The framework utilizes a fully\nconvolutional network to predict a distance map, which computes the distance\nbetween a Point of Interest (POI) and multiple anchor points. These anchor\npoints are ingeniously harnessed to triangulate the POI's position through the\nTrue-range Multilateration algorithm. Notably, the plug-and-play nature of\nKeyPosS enables seamless integration into any decoding stage, ensuring a\nversatile and adaptable solution. We conducted a thorough evaluation of\nKeyPosS's performance by benchmarking it against state-of-the-art models on\nfour different datasets. The results show that KeyPosS substantially\noutperforms leading methods in low-resolution settings while requiring a\nminimal time overhead. The code is available at\nhttps://github.com/zhiqic/KeyPosS.\n","authors":["Xu Bao","Zhi-Qi Cheng","Jun-Yan He","Chenyang Li","Wangmeng Xiang","Jingdong Sun","Hanbing Liu","Wei Liu","Bin Luo","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2305.16437v2.pdf","comment":"Accepted to ACM Multimedia 2023; 10 pages, 7 figures, 6 tables; the\n code is at https://github.com/zhiqic/KeyPosS"},{"id":"http://arxiv.org/abs/2308.09658v1","updated":"2023-08-18T16:21:40Z","published":"2023-08-18T16:21:40Z","title":"Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop\n Visual Reasoning","summary":" There emerges a promising trend of using large language models (LLMs) to\ngenerate code-like plans for complex inference tasks such as visual reasoning.\nThis paradigm, known as LLM-based planning, provides flexibility in problem\nsolving and endows better interpretability. However, current research is mostly\nlimited to basic scenarios of simple questions that can be straightforward\nanswered in a few inference steps. Planning for the more challenging multi-hop\nvisual reasoning tasks remains under-explored. Specifically, under multi-hop\nreasoning situations, the trade-off between accuracy and the complexity of\nplan-searching becomes prominent. 
The prevailing algorithms either address the\nefficiency issue by employing the fast one-stop generation or adopt a complex\niterative generation method to improve accuracy. Both fail to balance the need\nfor efficiency and performance. Drawing inspiration from the dual system of\ncognition in the human brain, the fast and the slow think processes, we propose\na hierarchical plan-searching algorithm that integrates the one-stop reasoning\n(fast) and the Tree-of-thought (slow). Our approach succeeds in performance\nwhile significantly saving inference steps. Moreover, we repurpose the PTR and\nthe CLEVER datasets, developing a systematic framework for evaluating the\nperformance and efficiency of LLMs-based plan-search algorithms under reasoning\ntasks at different levels of difficulty. Extensive experiments demonstrate the\nsuperiority of our proposed algorithm in terms of performance and efficiency.\nThe dataset and code will be release soon.\n","authors":["Pengbo Hu","Ji Qi","Xingyu Li","Hong Li","Xinqi Wang","Bing Quan","Ruiyu Wang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09658v1.pdf","comment":"16 pages,1 figures, under review"},{"id":"http://arxiv.org/abs/2307.12644v2","updated":"2023-08-18T16:03:06Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" rPPG (Remote photoplethysmography) is a technology that measures and analyzes\nBVP (Blood Volume Pulse) by using the light absorption characteristics of\nhemoglobin captured through a camera. Analyzing the measured BVP can derive\nvarious physiological signals such as heart rate, stress level, and blood\npressure, which can be applied to various applications such as telemedicine,\nremote patient monitoring, and early prediction of cardiovascular disease. rPPG\nis rapidly evolving and attracting great attention from both academia and\nindustry by providing great usability and convenience as it can measure\nbiosignals using a camera-equipped device without medical or wearable devices.\nDespite extensive efforts and advances in this field, serious challenges\nremain, including issues related to skin color, camera characteristics, ambient\nlighting, and other sources of noise and artifacts, which degrade accuracy\nperformance. We argue that fair and evaluable benchmarking is urgently required\nto overcome these challenges and make meaningful progress from both academic\nand commercial perspectives. In most existing work, models are trained, tested,\nand validated only on limited datasets. Even worse, some studies lack available\ncode or reproducibility, making it difficult to fairly evaluate and compare\nperformance. Therefore, the purpose of this study is to provide a benchmarking\nframework to evaluate various rPPG techniques across a wide range of datasets\nfor fair evaluation and comparison, including both conventional non-deep neural\nnetwork (non-DNN) and deep neural network (DNN) methods. 
GitHub URL:\nhttps://github.com/remotebiosensing/rppg\n","authors":["Dae-Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v2.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.09640v1","updated":"2023-08-18T15:59:55Z","published":"2023-08-18T15:59:55Z","title":"Revisiting Skin Tone Fairness in Dermatological Lesion Classification","summary":" Addressing fairness in lesion classification from dermatological images is\ncrucial due to variations in how skin diseases manifest across skin tones.\nHowever, the absence of skin tone labels in public datasets hinders building a\nfair classifier. To date, such skin tone labels have been estimated prior to\nfairness analysis in independent studies using the Individual Typology Angle\n(ITA). Briefly, ITA calculates an angle based on pixels extracted from skin\nimages taking into account the lightness and yellow-blue tints. These angles\nare then categorised into skin tones that are subsequently used to analyse\nfairness in skin cancer classification. In this work, we review and compare\nfour ITA-based approaches of skin tone classification on the ISIC18 dataset, a\ncommon benchmark for assessing skin cancer classification fairness in the\nliterature. Our analyses reveal a high disagreement among previously published\nstudies demonstrating the risks of ITA-based skin tone estimation methods.\nMoreover, we investigate the causes of such large discrepancy among these\napproaches and find that the lack of diversity in the ISIC18 dataset limits its\nuse as a testbed for fairness analysis. Finally, we recommend further research\non robust ITA estimation and diverse dataset acquisition with skin tone\nannotation to facilitate conclusive fairness assessments of artificial\nintelligence tools in dermatology. Our code is available at\nhttps://github.com/tkalbl/RevisitingSkinToneFairness.\n","authors":["Thorsten Kalb","Kaisar Kushibar","Celia Cintas","Karim Lekadir","Oliver Diaz","Richard Osuala"],"pdf_url":"https://arxiv.org/pdf/2308.09640v1.pdf","comment":"Accepted at 2023 MICCAI FAIMI Workshop"},{"id":"http://arxiv.org/abs/2308.09632v1","updated":"2023-08-18T15:44:45Z","published":"2023-08-18T15:44:45Z","title":"VALERIE22 -- A photorealistic, richly metadata annotated dataset of\n urban environments","summary":" The VALERIE tool pipeline is a synthetic data generator developed with the\ngoal to contribute to the understanding of domain-specific factors that\ninfluence perception performance of DNNs (deep neural networks). This work was\ncarried out under the German research project KI Absicherung in order to\ndevelop a methodology for the validation of DNNs in the context of pedestrian\ndetection in urban environments for automated driving. The VALERIE22 dataset\nwas generated with the VALERIE procedural tools pipeline providing a\nphotorealistic sensor simulation rendered from automatically synthesized\nscenes. The dataset provides a uniquely rich set of metadata, allowing\nextraction of specific scene and semantic features (like pixel-accurate\nocclusion rates, positions in the scene and distance + angle to the camera).\nThis enables a multitude of possible tests on the data and we hope to stimulate\nresearch on understanding performance of DNNs. 
Based on performance metric a\ncomparison with several other publicly available datasets is provided,\ndemonstrating that VALERIE22 is one of best performing synthetic datasets\ncurrently available in the open domain.\n","authors":["Oliver Grau","Korbinian Hagn"],"pdf_url":"https://arxiv.org/pdf/2308.09632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05566v5","updated":"2023-08-18T15:43:37Z","published":"2022-12-11T18:15:40Z","title":"YoloCurvSeg: You Only Label One Noisy Skeleton for Vessel-style\n Curvilinear Structure Segmentation","summary":" Weakly-supervised learning (WSL) has been proposed to alleviate the conflict\nbetween data annotation cost and model performance through employing\nsparsely-grained (i.e., point-, box-, scribble-wise) supervision and has shown\npromising performance, particularly in the image segmentation field. However,\nit is still a very challenging task due to the limited supervision, especially\nwhen only a small number of labeled samples are available. Additionally, almost\nall existing WSL segmentation methods are designed for star-convex structures\nwhich are very different from curvilinear structures such as vessels and\nnerves. In this paper, we propose a novel sparsely annotated segmentation\nframework for curvilinear structures, named YoloCurvSeg. A very essential\ncomponent of YoloCurvSeg is image synthesis. Specifically, a background\ngenerator delivers image backgrounds that closely match the real distributions\nthrough inpainting dilated skeletons. The extracted backgrounds are then\ncombined with randomly emulated curves generated by a Space Colonization\nAlgorithm-based foreground generator and through a multilayer patch-wise\ncontrastive learning synthesizer. In this way, a synthetic dataset with both\nimages and curve segmentation labels is obtained, at the cost of only one or a\nfew noisy skeleton annotations. Finally, a segmenter is trained with the\ngenerated dataset and possibly an unlabeled dataset. The proposed YoloCurvSeg\nis evaluated on four publicly available datasets (OCTA500, CORN, DRIVE and\nCHASEDB1) and the results show that YoloCurvSeg outperforms state-of-the-art\nWSL segmentation methods by large margins. With only one noisy skeleton\nannotation (respectively 0.14\\%, 0.03\\%, 1.40\\%, and 0.65\\% of the full\nannotation), YoloCurvSeg achieves more than 97\\% of the fully-supervised\nperformance on each dataset. Code and datasets will be released at\nhttps://github.com/llmir/YoloCurvSeg.\n","authors":["Li Lin","Linkai Peng","Huaqing He","Pujin Cheng","Jiewei Wu","Kenneth K. Y. Wong","Xiaoying Tang"],"pdf_url":"https://arxiv.org/pdf/2212.05566v5.pdf","comment":"20 pages, 15 figures, MEDIA accepted"},{"id":"http://arxiv.org/abs/2308.05681v2","updated":"2023-08-18T15:34:40Z","published":"2023-08-10T16:34:20Z","title":"Hard No-Box Adversarial Attack on Skeleton-Based Human Action\n Recognition with Skeleton-Motion-Informed Gradient","summary":" Recently, methods for skeleton-based human activity recognition have been\nshown to be vulnerable to adversarial attacks. However, these attack methods\nrequire either the full knowledge of the victim (i.e. white-box attacks),\naccess to training data (i.e. transfer-based attacks) or frequent model queries\n(i.e. black-box attacks). All their requirements are highly restrictive,\nraising the question of how detrimental the vulnerability is. In this paper, we\nshow that the vulnerability indeed exists. 
To this end, we consider a new\nattack task: the attacker has no access to the victim model or the training\ndata or labels, where we coin the term hard no-box attack. Specifically, we\nfirst learn a motion manifold where we define an adversarial loss to compute a\nnew gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our\ngradient contains information of the motion dynamics, which is different from\nexisting gradient-based attack methods that compute the loss gradient assuming\neach dimension in the data is independent. The SMI gradient can augment many\ngradient-based attack methods, leading to a new family of no-box attack\nmethods. Extensive evaluation and comparison show that our method imposes a\nreal threat to existing classifiers. They also show that the SMI gradient\nimproves the transferability and imperceptibility of adversarial samples in\nboth no-box and transfer-based black-box settings.\n","authors":["Zhengzhi Lu","He Wang","Ziyi Chang","Guoan Yang","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2308.05681v2.pdf","comment":"Camera-ready version for ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09624v1","updated":"2023-08-18T15:32:01Z","published":"2023-08-18T15:32:01Z","title":"GeoDTR+: Toward generic cross-view geolocalization via geometric\n disentanglement","summary":" Cross-View Geo-Localization (CVGL) estimates the location of a ground image\nby matching it to a geo-tagged aerial image in a database. Recent works achieve\noutstanding progress on CVGL benchmarks. However, existing methods still suffer\nfrom poor performance in cross-area evaluation, in which the training and\ntesting data are captured from completely distinct areas. We attribute this\ndeficiency to the lack of ability to extract the geometric layout of visual\nfeatures and models' overfitting to low-level details. Our preliminary work\nintroduced a Geometric Layout Extractor (GLE) to capture the geometric layout\nfrom input features. However, the previous GLE does not fully exploit\ninformation in the input feature. In this work, we propose GeoDTR+ with an\nenhanced GLE module that better models the correlations among visual features.\nTo fully explore the LS techniques from our preliminary work, we further\npropose Contrastive Hard Samples Generation (CHSG) to facilitate model\ntraining. Extensive experiments show that GeoDTR+ achieves state-of-the-art\n(SOTA) results in cross-area evaluation on CVUSA, CVACT, and VIGOR by a large\nmargin ($16.44\\%$, $22.71\\%$, and $17.02\\%$ without polar transformation) while\nkeeping the same-area performance comparable to existing SOTA. Moreover, we\nprovide detailed analyses of GeoDTR+.\n","authors":["Xiaohan Zhang","Xingyu Li","Waqas Sultani","Chen Chen","Safwan Wshah"],"pdf_url":"https://arxiv.org/pdf/2308.09624v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.04074"},{"id":"http://arxiv.org/abs/2308.09622v1","updated":"2023-08-18T15:27:22Z","published":"2023-08-18T15:27:22Z","title":"Is context all you need? Scaling Neural Sign Language Translation to\n Large Domains of Discourse","summary":" Sign Language Translation (SLT) is a challenging task that aims to generate\nspoken language sentences from sign language videos, both of which have\ndifferent grammar and word/gloss order. From a Neural Machine Translation (NMT)\nperspective, the straightforward way of training translation models is to use\nsign language phrase-spoken language sentence pairs. 
However, human\ninterpreters heavily rely on the context to understand the conveyed\ninformation, especially for sign language interpretation, where the vocabulary\nsize may be significantly smaller than their spoken language equivalent.\n Taking direct inspiration from how humans translate, we propose a novel\nmulti-modal transformer architecture that tackles the translation task in a\ncontext-aware manner, as a human would. We use the context from previous\nsequences and confident predictions to disambiguate weaker visual cues. To\nachieve this we use complementary transformer encoders, namely: (1) A Video\nEncoder, that captures the low-level video features at the frame-level, (2) A\nSpotting Encoder, that models the recognized sign glosses in the video, and (3)\nA Context Encoder, which captures the context of the preceding sign sequences.\nWe combine the information coming from these encoders in a final transformer\ndecoder to generate spoken language translations.\n We evaluate our approach on the recently published large-scale BOBSL dataset,\nwhich contains ~1.2M sequences, and on the SRF dataset, which was part of the\nWMT-SLT 2022 challenge. We report significant improvements on state-of-the-art\ntranslation performance using contextual information, nearly doubling the\nreported BLEU-4 scores of baseline approaches.\n","authors":["Ozge Mercanoglu Sincan","Necati Cihan Camgoz","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2308.09622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09618v1","updated":"2023-08-18T15:21:15Z","published":"2023-08-18T15:21:15Z","title":"LaRS: A Diverse Panoptic Maritime Obstacle Detection Dataset and\n Benchmark","summary":" The progress in maritime obstacle detection is hindered by the lack of a\ndiverse dataset that adequately captures the complexity of general maritime\nenvironments. We present the first maritime panoptic obstacle detection\nbenchmark LaRS, featuring scenes from Lakes, Rivers and Seas. Our major\ncontribution is the new dataset, which boasts the largest diversity in\nrecording locations, scene types, obstacle classes, and acquisition conditions\namong the related datasets. LaRS is composed of over 4000 per-pixel labeled key\nframes with nine preceding frames to allow utilization of the temporal texture,\namounting to over 40k frames. Each key frame is annotated with 8 thing, 3 stuff\nclasses and 19 global scene attributes. We report the results of 27 semantic\nand panoptic segmentation methods, along with several performance insights and\nfuture research directions. To enable objective evaluation, we have implemented\nan online evaluation server. The LaRS dataset, evaluation toolkit and benchmark\nare publicly available at: https://lojzezust.github.io/lars-dataset\n","authors":["Lojze Žust","Janez Perš","Matej Kristan"],"pdf_url":"https://arxiv.org/pdf/2308.09618v1.pdf","comment":"ICCV 2023, 9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.09616v1","updated":"2023-08-18T15:19:17Z","published":"2023-08-18T15:19:17Z","title":"Far3D: Expanding the Horizon for Surround-view 3D Object Detection","summary":" Recently 3D object detection from surround-view images has made notable\nadvancements with its low deployment cost. However, most works have primarily\nfocused on close perception range while leaving long-range detection less\nexplored. Expanding existing methods directly to cover long distances poses\nchallenges such as heavy computation costs and unstable convergence. 
To address\nthese limitations, this paper proposes a novel sparse query-based framework,\ndubbed Far3D. By utilizing high-quality 2D object priors, we generate 3D\nadaptive queries that complement the 3D global queries. To efficiently capture\ndiscriminative features across different views and scales for long-range\nobjects, we introduce a perspective-aware aggregation module. Additionally, we\npropose a range-modulated 3D denoising approach to address query error\npropagation and mitigate convergence issues in long-range tasks. Significantly,\nFar3D demonstrates SoTA performance on the challenging Argoverse 2 dataset,\ncovering a wide range of 150 meters, surpassing several LiDAR-based approaches.\nMeanwhile, Far3D exhibits superior performance compared to previous methods on\nthe nuScenes dataset. The code will be available soon.\n","authors":["Xiaohui Jiang","Shuailin Li","Yingfei Liu","Shihao Wang","Fan Jia","Tiancai Wang","Lijin Han","Xiangyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09611v1","updated":"2023-08-18T15:13:03Z","published":"2023-08-18T15:13:03Z","title":"Language-guided Human Motion Synthesis with Atomic Actions","summary":" Language-guided human motion synthesis has been a challenging task due to the\ninherent complexity and diversity of human behaviors. Previous methods face\nlimitations in generalization to novel actions, often resulting in unrealistic\nor incoherent motion sequences. In this paper, we propose ATOM (ATomic mOtion\nModeling) to mitigate this problem, by decomposing actions into atomic actions,\nand employing a curriculum learning strategy to learn atomic action\ncomposition. First, we disentangle complex human motions into a set of atomic\nactions during learning, and then assemble novel actions using the learned\natomic actions, which offers better adaptability to new actions. Moreover, we\nintroduce a curriculum learning training strategy that leverages masked motion\nmodeling with a gradual increase in the mask ratio, and thus facilitates atomic\naction assembly. This approach mitigates the overfitting problem commonly\nencountered in previous methods while enforcing the model to learn better\nmotion representations. We demonstrate the effectiveness of ATOM through\nextensive experiments, including text-to-motion and action-to-motion synthesis\ntasks. We further illustrate its superiority in synthesizing plausible and\ncoherent text-guided human motion sequences.\n","authors":["Yuanhao Zhai","Mingzhen Huang","Tianyu Luan","Lu Dong","Ifeoma Nwogu","Siwei Lyu","David Doermann","Junsong Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.09611v1.pdf","comment":"Accepted to ACM MM 2023, code: https://github.com/yhZhai/ATOM"},{"id":"http://arxiv.org/abs/2307.08388v2","updated":"2023-08-18T15:12:06Z","published":"2023-07-17T10:55:58Z","title":"Dynamic Snake Convolution based on Topological Geometric Constraints for\n Tubular Structure Segmentation","summary":" Accurate segmentation of topological tubular structures, such as blood\nvessels and roads, is crucial in various fields, ensuring accuracy and\nefficiency in downstream tasks. However, many factors complicate the task,\nincluding thin local structures and variable global morphologies. In this work,\nwe note the specificity of tubular structures and use this knowledge to guide\nour DSCNet to simultaneously enhance perception in three stages: feature\nextraction, feature fusion, and loss constraint. 
First, we propose a dynamic\nsnake convolution to accurately capture the features of tubular structures by\nadaptively focusing on slender and tortuous local structures. Subsequently, we\npropose a multi-view feature fusion strategy to complement the attention to\nfeatures from multiple perspectives during feature fusion, ensuring the\nretention of important information from different global morphologies. Finally,\na continuity constraint loss function, based on persistent homology, is\nproposed to constrain the topological continuity of the segmentation better.\nExperiments on 2D and 3D datasets show that our DSCNet provides better accuracy\nand continuity on the tubular structure segmentation task compared with several\nmethods. Our codes will be publicly available.\n","authors":["Yaolei Qi","Yuting He","Xiaoming Qi","Yuan Zhang","Guanyu Yang"],"pdf_url":"https://arxiv.org/pdf/2307.08388v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09610v1","updated":"2023-08-18T15:11:16Z","published":"2023-08-18T15:11:16Z","title":"On the Effectiveness of LayerNorm Tuning for Continual Learning in\n Vision Transformers","summary":" State-of-the-art rehearsal-free continual learning methods exploit the\npeculiarities of Vision Transformers to learn task-specific prompts,\ndrastically reducing catastrophic forgetting. However, there is a tradeoff\nbetween the number of learned parameters and the performance, making such\nmodels computationally expensive. In this work, we aim to reduce this cost\nwhile maintaining competitive performance. We achieve this by revisiting and\nextending a simple transfer learning idea: learning task-specific normalization\nlayers. Specifically, we tune the scale and bias parameters of LayerNorm for\neach continual learning task, selecting them at inference time based on the\nsimilarity between task-specific keys and the output of the pre-trained model.\nTo make the classifier robust to incorrect selection of parameters during\ninference, we introduce a two-stage training procedure, where we first optimize\nthe task-specific parameters and then train the classifier with the same\nselection procedure of the inference time. Experiments on ImageNet-R and\nCIFAR-100 show that our method achieves results that are either superior or on\npar with {the state of the art} while being computationally cheaper.\n","authors":["Thomas De Min","Massimiliano Mancini","Karteek Alahari","Xavier Alameda-Pineda","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.09610v1.pdf","comment":"In The First Workshop on Visual Continual Learning (ICCVW 2023); Oral"},{"id":"http://arxiv.org/abs/2211.10408v3","updated":"2023-08-18T15:06:20Z","published":"2022-11-18T18:18:53Z","title":"CroCo v2: Improved Cross-view Completion Pre-training for Stereo\n Matching and Optical Flow","summary":" Despite impressive performance for high-level downstream tasks,\nself-supervised pre-training methods have not yet fully delivered on dense\ngeometric vision tasks such as stereo matching or optical flow. The application\nof self-supervised concepts, such as instance discrimination or masked image\nmodeling, to geometric tasks is an active area of research. In this work, we\nbuild on the recent cross-view completion framework, a variation of masked\nimage modeling that leverages a second view from the same scene which makes it\nwell suited for binocular downstream tasks. 
The applicability of this concept\nhas so far been limited in at least two ways: (a) by the difficulty of\ncollecting real-world image pairs -- in practice only synthetic data have been\nused -- and (b) by the lack of generalization of vanilla transformers to dense\ndownstream tasks for which relative position is more meaningful than absolute\nposition. We explore three avenues of improvement. First, we introduce a method\nto collect suitable real-world image pairs at large scale. Second, we\nexperiment with relative positional embeddings and show that they enable vision\ntransformers to perform substantially better. Third, we scale up vision\ntransformer based cross-completion architectures, which is made possible by the\nuse of large amounts of data. With these improvements, we show for the first\ntime that state-of-the-art results on stereo matching and optical flow can be\nreached without using any classical task-specific techniques like correlation\nvolume, iterative estimation, image warping or multi-scale reasoning, thus\npaving the way towards universal vision models.\n","authors":["Philippe Weinzaepfel","Thomas Lucas","Vincent Leroy","Yohann Cabon","Vaibhav Arora","Romain Brégier","Gabriela Csurka","Leonid Antsfeld","Boris Chidlovskii","Jérôme Revaud"],"pdf_url":"https://arxiv.org/pdf/2211.10408v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.06853v2","updated":"2023-08-18T15:02:05Z","published":"2023-07-13T16:09:53Z","title":"LVLane: Deep Learning for Lane Detection and Classification in\n Challenging Conditions","summary":" Lane detection plays a pivotal role in the field of autonomous vehicles and\nadvanced driving assistant systems (ADAS). Despite advances from image\nprocessing to deep learning based models, algorithm performance is highly\ndependent on training data matching the local challenges such as extreme\nlighting conditions, partially visible lane markings, and sparse lane markings\nlike Botts' dots. To address this, we present an end-to-end lane detection and\nclassification system based on deep learning methodologies. In our study, we\nintroduce a unique dataset meticulously curated to encompass scenarios that\npose significant challenges for state-of-the-art (SOTA) lane localization\nmodels. Moreover, we propose a CNN-based classification branch, seamlessly\nintegrated with the detector, facilitating the identification of distinct lane\ntypes. This architecture enables informed lane-changing decisions and empowers\nmore resilient ADAS capabilities. We also investigate the effect of using mixed\nprecision training and testing on different models and batch sizes.\nExperimental evaluations conducted on the widely-used TuSimple dataset, Caltech\nLane dataset, and our LVLane dataset demonstrate the effectiveness of our model\nin accurately detecting and classifying lanes amidst challenging scenarios. Our\nmethod achieves state-of-the-art classification results on the TuSimple\ndataset. 
The code of the work can be found on www.github.com/zillur-av/LVLane.\n","authors":["Zillur Rahman","Brendan Tran Morris"],"pdf_url":"https://arxiv.org/pdf/2307.06853v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2307.06542v3","updated":"2023-08-18T14:59:57Z","published":"2023-07-13T03:11:09Z","title":"Quantum Image Denoising: A Framework via Boltzmann Machines, QUBO, and\n Quantum Annealing","summary":" We investigate a framework for binary image denoising via restricted\nBoltzmann machines (RBMs) that introduces a denoising objective in quadratic\nunconstrained binary optimization (QUBO) form and is well-suited for quantum\nannealing. The denoising objective is attained by balancing the distribution\nlearned by a trained RBM with a penalty term for derivations from the noisy\nimage. We derive the statistically optimal choice of the penalty parameter\nassuming the target distribution has been well-approximated, and further\nsuggest an empirically supported modification to make the method robust to that\nidealistic assumption. We also show under additional assumptions that the\ndenoised images attained by our method are, in expectation, strictly closer to\nthe noise-free images than the noisy images are. While we frame the model as an\nimage denoising model, it can be applied to any binary data. As the QUBO\nformulation is well-suited for implementation on quantum annealers, we test the\nmodel on a D-Wave Advantage machine, and also test on data too large for\ncurrent quantum annealers by approximating QUBO solutions through classical\nheuristics.\n","authors":["Phillip Kerger","Ryoji Miyazaki"],"pdf_url":"https://arxiv.org/pdf/2307.06542v3.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.09599v1","updated":"2023-08-18T14:54:13Z","published":"2023-08-18T14:54:13Z","title":"Language-Guided Diffusion Model for Visual Grounding","summary":" Visual grounding (VG) tasks involve explicit cross-modal alignment, as\nsemantically corresponding image regions are to be located for the language\nphrases provided. Existing approaches complete such visual-text reasoning in a\nsingle-step manner. Their performance causes high demands on large-scale\nanchors and over-designed multi-modal fusion modules based on human priors,\nleading to complicated frameworks that may be difficult to train and overfit to\nspecific scenarios. Even worse, such once-for-all reasoning mechanisms are\nincapable of refining boxes continuously to enhance query-region matching. In\ncontrast, in this paper, we formulate an iterative reasoning process by\ndenoising diffusion modeling. Specifically, we propose a language-guided\ndiffusion framework for visual grounding, LG-DVG, which trains the model to\nprogressively reason queried object boxes by denoising a set of noisy boxes\nwith the language guide. To achieve this, LG-DVG gradually perturbs\nquery-aligned ground truth boxes to noisy ones and reverses this process step\nby step, conditional on query semantics. 
Extensive experiments for our proposed\nframework on five widely used datasets validate the superior performance of\nsolving visual grounding, a cross-modal alignment task, in a generative way.\nThe source codes are available at\n\\url{https://github.com/iQua/vgbase/tree/DiffusionVG}.\n","authors":["Sijia Chen","Baochun Li"],"pdf_url":"https://arxiv.org/pdf/2308.09599v1.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2303.08010v2","updated":"2023-08-18T14:51:30Z","published":"2023-03-14T15:57:54Z","title":"Window-Based Early-Exit Cascades for Uncertainty Estimation: When Deep\n Ensembles are More Efficient than Single Models","summary":" Deep Ensembles are a simple, reliable, and effective method of improving both\nthe predictive performance and uncertainty estimates of deep learning\napproaches. However, they are widely criticised as being computationally\nexpensive, due to the need to deploy multiple independent models. Recent work\nhas challenged this view, showing that for predictive accuracy, ensembles can\nbe more computationally efficient (at inference) than scaling single models\nwithin an architecture family. This is achieved by cascading ensemble members\nvia an early-exit approach. In this work, we investigate extending these\nefficiency gains to tasks related to uncertainty estimation. As many such\ntasks, e.g. selective classification, are binary classification, our key novel\ninsight is to only pass samples within a window close to the binary decision\nboundary to later cascade stages. Experiments on ImageNet-scale data across a\nnumber of network architectures and uncertainty tasks show that the proposed\nwindow-based early-exit approach is able to achieve a superior\nuncertainty-computation trade-off compared to scaling single models. For\nexample, a cascaded EfficientNet-B2 ensemble is able to achieve similar\ncoverage at 5% risk as a single EfficientNet-B4 with <30% the number of MACs.\nWe also find that cascades/ensembles give more reliable improvements on OOD\ndata vs scaling models up. Code for this work is available at:\nhttps://github.com/Guoxoug/window-early-exit.\n","authors":["Guoxuan Xia","Christos-Savvas Bouganis"],"pdf_url":"https://arxiv.org/pdf/2303.08010v2.pdf","comment":"Accepted to ICCV 2023 (camera-ready version, 9 pages)"},{"id":"http://arxiv.org/abs/2308.03867v2","updated":"2023-08-18T14:46:04Z","published":"2023-08-07T18:39:14Z","title":"From Sky to the Ground: A Large-scale Benchmark and Simple Baseline\n Towards Real Rain Removal","summary":" Learning-based image deraining methods have made great progress. However, the\nlack of large-scale high-quality paired training samples is the main bottleneck\nto hamper the real image deraining (RID). To address this dilemma and advance\nRID, we construct a Large-scale High-quality Paired real rain benchmark\n(LHP-Rain), including 3000 video sequences with 1 million high-resolution\n(1920*1080) frame pairs. The advantages of the proposed dataset over the\nexisting ones are three-fold: rain with higher-diversity and larger-scale,\nimage with higher-resolution and higher-quality ground-truth. Specifically, the\nreal rains in LHP-Rain not only contain the classical rain\nstreak/veiling/occlusion in the sky, but also the \\textbf{splashing on the\nground} overlooked by deraining community. Moreover, we propose a novel robust\nlow-rank tensor recovery model to generate the GT with better separating the\nstatic background from the dynamic rain. 
In addition, we design a simple\ntransformer-based single image deraining baseline, which simultaneously utilize\nthe self-attention and cross-layer attention within the image and rain layer\nwith discriminative feature representation. Extensive experiments verify the\nsuperiority of the proposed dataset and deraining method over state-of-the-art.\n","authors":["Yun Guo","Xueyao Xiao","Yi Chang","Shumin Deng","Luxin Yan"],"pdf_url":"https://arxiv.org/pdf/2308.03867v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02535v2","updated":"2023-08-18T14:42:58Z","published":"2023-08-01T10:02:26Z","title":"Learning to Generate Training Datasets for Robust Semantic Segmentation","summary":" Semantic segmentation techniques have shown significant progress in recent\nyears, but their robustness to real-world perturbations and data samples not\nseen during training remains a challenge, particularly in safety-critical\napplications. In this paper, we propose a novel approach to improve the\nrobustness of semantic segmentation techniques by leveraging the synergy\nbetween label-to-image generators and image-to-label segmentation models.\nSpecifically, we design and train Robusta, a novel robust conditional\ngenerative adversarial network to generate realistic and plausible perturbed or\noutlier images that can be used to train reliable segmentation models. We\nconduct in-depth studies of the proposed generative model, assess the\nperformance and robustness of the downstream segmentation network, and\ndemonstrate that our approach can significantly enhance the robustness of\nsemantic segmentation techniques in the face of real-world perturbations,\ndistribution shifts, and out-of-distribution samples. Our results suggest that\nthis approach could be valuable in safety-critical applications, where the\nreliability of semantic segmentation techniques is of utmost importance and\ncomes with a limited computational budget in inference. We will release our\ncode shortly.\n","authors":["Marwane Hariat","Olivier Laurent","Rémi Kazmierczak","Shihao Zhang","Andrei Bursuc","Angela Yao","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2308.02535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09593v1","updated":"2023-08-18T14:41:51Z","published":"2023-08-18T14:41:51Z","title":"Investigation of Architectures and Receptive Fields for Appearance-based\n Gaze Estimation","summary":" With the rapid development of deep learning technology in the past decade,\nappearance-based gaze estimation has attracted great attention from both\ncomputer vision and human-computer interaction research communities.\nFascinating methods were proposed with variant mechanisms including soft\nattention, hard attention, two-eye asymmetry, feature disentanglement, rotation\nconsistency, and contrastive learning. Most of these methods take the\nsingle-face or multi-region as input, yet the basic architecture of gaze\nestimation has not been fully explored. In this paper, we reveal the fact that\ntuning a few simple parameters of a ResNet architecture can outperform most of\nthe existing state-of-the-art methods for the gaze estimation task on three\npopular datasets. With our extensive experiments, we conclude that the stride\nnumber, input image resolution, and multi-region architecture are critical for\nthe gaze estimation performance while their effectiveness dependent on the\nquality of the input face image. 
We obtain the state-of-the-art performances on\nthree datasets with 3.64 on ETH-XGaze, 4.50 on MPIIFaceGaze, and 9.13 on\nGaze360 degrees gaze estimation error by taking ResNet-50 as the backbone.\n","authors":["Yunhan Wang","Xiangwei Shi","Shalini De Mello","Hyung Jin Chang","Xucong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09592v1","updated":"2023-08-18T14:39:16Z","published":"2023-08-18T14:39:16Z","title":"StableVideo: Text-driven Consistency-aware Diffusion Video Editing","summary":" Diffusion-based methods can generate realistic images and videos, but they\nstruggle to edit existing objects in a video while preserving their appearance\nover time. This prevents diffusion models from being applied to natural video\nediting in practical scenarios. In this paper, we tackle this problem by\nintroducing temporal dependency to existing text-driven diffusion models, which\nallows them to generate consistent appearance for the edited objects.\nSpecifically, we develop a novel inter-frame propagation mechanism for\ndiffusion video editing, which leverages the concept of layered representations\nto propagate the appearance information from one frame to the next. We then\nbuild up a text-driven video editing framework based on this mechanism, namely\nStableVideo, which can achieve consistency-aware video editing. Extensive\nexperiments demonstrate the strong editing capability of our approach. Compared\nwith state-of-the-art video editing methods, our approach shows superior\nqualitative and quantitative results. Our code is available at\n\\href{https://github.com/rese1f/StableVideo}{this https URL}.\n","authors":["Wenhao Chai","Xun Guo","Gaoang Wang","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.09592v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09591v1","updated":"2023-08-18T14:38:31Z","published":"2023-08-18T14:38:31Z","title":"O^2-Recon: Completing 3D Reconstruction of Occluded Objects in the Scene\n with a Pre-trained 2D Diffusion Model","summary":" Occlusion is a common issue in 3D reconstruction from RGB-D videos, often\nblocking the complete reconstruction of objects and presenting an ongoing\nproblem. In this paper, we propose a novel framework, empowered by a 2D\ndiffusion-based in-painting model, to reconstruct complete surfaces for the\nhidden parts of objects. Specifically, we utilize a pre-trained diffusion model\nto fill in the hidden areas of 2D images. Then we use these in-painted images\nto optimize a neural implicit surface representation for each instance for 3D\nreconstruction. Since creating the in-painting masks needed for this process is\ntricky, we adopt a human-in-the-loop strategy that involves very little human\nengagement to generate high-quality masks. Moreover, some parts of objects can\nbe totally hidden because the videos are usually shot from limited\nperspectives. To ensure recovering these invisible areas, we develop a cascaded\nnetwork architecture for predicting signed distance field, making use of\ndifferent frequency bands of positional encoding and maintaining overall\nsmoothness. Besides the commonly used rendering loss, Eikonal loss, and\nsilhouette loss, we adopt a CLIP-based semantic consistency loss to guide the\nsurface from unseen camera angles. 
Experiments on ScanNet scenes show that our\nproposed framework achieves state-of-the-art accuracy and completeness in\nobject-level reconstruction from scene-level RGB-D videos.\n","authors":["Yubin Hu","Sheng Ye","Wang Zhao","Matthieu Lin","Yuze He","Yu-Hui Wen","Ying He","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.00288v4","updated":"2023-08-18T14:34:03Z","published":"2022-11-01T05:48:18Z","title":"Self-supervised Character-to-Character Distillation for Text Recognition","summary":" When handling complicated text images (e.g., irregular structures, low\nresolution, heavy occlusion, and uneven illumination), existing supervised text\nrecognition methods are data-hungry. Although these methods employ large-scale\nsynthetic text images to reduce the dependence on annotated real images, the\ndomain gap still limits the recognition performance. Therefore, exploring the\nrobust text feature representations on unlabeled real images by self-supervised\nlearning is a good solution. However, existing self-supervised text recognition\nmethods conduct sequence-to-sequence representation learning by roughly\nsplitting the visual features along the horizontal axis, which limits the\nflexibility of the augmentations, as large geometric-based augmentations may\nlead to sequence-to-sequence feature inconsistency. Motivated by this, we\npropose a novel self-supervised Character-to-Character Distillation method,\nCCD, which enables versatile augmentations to facilitate general text\nrepresentation learning. Specifically, we delineate the character structures of\nunlabeled real images by designing a self-supervised character segmentation\nmodule. Following this, CCD easily enriches the diversity of local characters\nwhile keeping their pairwise alignment under flexible augmentations, using the\ntransformation matrix between two augmented views from images. Experiments\ndemonstrate that CCD achieves state-of-the-art results, with average\nperformance gains of 1.38% in text recognition, 1.7% in text segmentation, 0.24\ndB (PSNR) and 0.0321 (SSIM) in text super-resolution. Code is available at\nhttps://github.com/TongkunGuan/CCD.\n","authors":["Tongkun Guan","Wei Shen","Xue Yang","Qi Feng","Zekun Jiang","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2211.00288v4.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.06668v2","updated":"2023-08-18T14:16:37Z","published":"2023-08-13T02:59:36Z","title":"Foundation Models in Smart Agriculture: Basics, Opportunities, and\n Challenges","summary":" The past decade has witnessed the rapid development of ML and DL\nmethodologies in agricultural systems, showcased by great successes in variety\nof agricultural applications. However, these conventional ML/DL models have\ncertain limitations: They heavily rely on large, costly-to-acquire labeled\ndatasets for training, require specialized expertise for development and\nmaintenance, and are mostly tailored for specific tasks, thus lacking\ngeneralizability. Recently, foundation models have demonstrated remarkable\nsuccesses in language and vision tasks across various domains. These models are\ntrained on a vast amount of data from multiple domains and modalities. Once\ntrained, they can accomplish versatile tasks with just minor fine-tuning and\nminimal task-specific labeled data. Despite their proven effectiveness and huge\npotential, there has been little exploration of applying FMs to agriculture\nfields. 
Therefore, this study aims to explore the potential of FMs in the field\nof smart agriculture. In particular, we present conceptual tools and technical\nbackground to facilitate the understanding of the problem space and uncover new\nresearch directions in this field. To this end, we first review recent FMs in\nthe general computer science domain and categorize them into four categories:\nlanguage FMs, vision FMs, multimodal FMs, and reinforcement learning FMs.\nSubsequently, we outline the process of developing agriculture FMs and discuss\ntheir potential applications in smart agriculture. We also discuss the unique\nchallenges associated with developing AFMs, including model training,\nvalidation, and deployment. Through this study, we contribute to the\nadvancement of AI in agriculture by introducing AFMs as a promising paradigm\nthat can significantly mitigate the reliance on extensive labeled datasets and\nenhance the efficiency, effectiveness, and generalization of agricultural AI\nsystems.\n","authors":["Jiajia Li","Mingle Xu","Lirong Xiang","Dong Chen","Weichao Zhuang","Xunyuan Yin","Zhaojian Li"],"pdf_url":"https://arxiv.org/pdf/2308.06668v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.09568v1","updated":"2023-08-18T14:01:37Z","published":"2023-08-18T14:01:37Z","title":"PUMGPT: A Large Vision-Language Model for Product Understanding","summary":" Recent developments of multi-modal large language models have demonstrated\nits strong ability in solving vision-language tasks. In this paper, we focus on\nthe product understanding task, which plays an essential role in enhancing\nonline shopping experience. Product understanding task includes a variety of\nsub-tasks, which require models to respond diverse queries based on multi-modal\nproduct information. Traditional methods design distinct model architectures\nfor each sub-task. On the contrary, we present PUMGPT, a large vision-language\nmodel aims at unifying all product understanding tasks under a singular model\nstructure. To bridge the gap between vision and text representations, we\npropose Layer-wise Adapters (LA), an approach that provides enhanced alignment\nwith fewer visual tokens and enables parameter-efficient fine-tuning. Moreover,\nthe inherent parameter-efficient fine-tuning ability allows PUMGPT to be\nreadily adapted to new product understanding tasks and emerging products. We\ndesign instruction templates to generate diverse product instruction datasets.\nSimultaneously, we utilize open-domain datasets during training to improve the\nperformance of PUMGPT and its generalization ability. Through extensive\nevaluations, PUMGPT demonstrates its superior performance across multiple\nproduct understanding tasks, including product captioning, category\nquestion-answering, attribute extraction, attribute question-answering, and\neven free-form question-answering about products.\n","authors":["Shuhui Wu","Zengming Tang","Zongyi Guo","Weiwei Zhang","Baoliang Cui","Haihong Tang","Weiming Lu"],"pdf_url":"https://arxiv.org/pdf/2308.09568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09564v1","updated":"2023-08-18T13:56:03Z","published":"2023-08-18T13:56:03Z","title":"Deep Equilibrium Object Detection","summary":" Query-based object detectors directly decode image features into object\ninstances with a set of learnable queries. 
These query vectors are\nprogressively refined to stable meaningful representations through a sequence\nof decoder layers, and then used to directly predict object locations and\ncategories with simple FFN heads. In this paper, we present a new query-based\nobject detector (DEQDet) by designing a deep equilibrium decoder. Our DEQ\ndecoder models the query vector refinement as the fixed point solving of an\n{implicit} layer and is equivalent to applying {infinite} steps of refinement.\nTo be more specific to object decoding, we use a two-step unrolled equilibrium\nequation to explicitly capture the query vector refinement. Accordingly, we are\nable to incorporate refinement awareness into the DEQ training with the inexact\ngradient back-propagation (RAG). In addition, to stabilize the training of our\nDEQDet and improve its generalization ability, we devise the deep supervision\nscheme on the optimization path of DEQ with refinement-aware\nperturbation~(RAP). Our experiments demonstrate DEQDet converges faster,\nconsumes less memory, and achieves better results than the baseline counterpart\n(AdaMixer). In particular, our DEQDet with ResNet50 backbone and 300 queries\nachieves the $49.5$ mAP and $33.0$ AP$_s$ on the MS COCO benchmark under\n$2\\times$ training scheme (24 epochs).\n","authors":["Shuai Wang","Yao Teng","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09544v1","updated":"2023-08-18T13:22:59Z","published":"2023-08-18T13:22:59Z","title":"Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free\n Continual Learning","summary":" In this work, we investigate exemplar-free class incremental learning (CIL)\nwith knowledge distillation (KD) as a regularization strategy, aiming to\nprevent forgetting. KD-based methods are successfully used in CIL, but they\noften struggle to regularize the model without access to exemplars of the\ntraining data from previous tasks. Our analysis reveals that this issue\noriginates from substantial representation shifts in the teacher network when\ndealing with out-of-distribution data. This causes large errors in the KD loss\ncomponent, leading to performance degradation in CIL. Inspired by recent\ntest-time adaptation methods, we introduce Teacher Adaptation (TA), a method\nthat concurrently updates the teacher and the main model during incremental\ntraining. Our method seamlessly integrates with KD-based CIL approaches and\nallows for consistent enhancement of their performance across multiple\nexemplar-free CIL benchmarks.\n","authors":["Filip Szatkowski","Mateusz Pyla","Marcin Przewięźlikowski","Sebastian Cygert","Bartłomiej Twardowski","Tomasz Trzciński"],"pdf_url":"https://arxiv.org/pdf/2308.09544v1.pdf","comment":"VCL workshop at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09542v1","updated":"2023-08-18T13:19:26Z","published":"2023-08-18T13:19:26Z","title":"Decoupled conditional contrastive learning with variable metadata for\n prostate lesion detection","summary":" Early diagnosis of prostate cancer is crucial for efficient treatment.\nMulti-parametric Magnetic Resonance Images (mp-MRI) are widely used for lesion\ndetection. The Prostate Imaging Reporting and Data System (PI-RADS) has\nstandardized interpretation of prostate MRI by defining a score for lesion\nmalignancy. PI-RADS data is readily available from radiology reports but is\nsubject to high inter-reports variability. 
We propose a new contrastive loss\nfunction that leverages weak metadata with multiple annotators per sample and\ntakes advantage of inter-reports variability by defining metadata confidence.\nBy combining metadata of varying confidence with unannotated data into a single\nconditional contrastive loss function, we report a 3% AUC increase on lesion\ndetection on the public PI-CAI challenge dataset.\n Code is available at: https://github.com/camilleruppli/decoupled_ccl\n","authors":["Camille Ruppli","Pietro Gori","Roberto Ardon","Isabelle Bloch"],"pdf_url":"https://arxiv.org/pdf/2308.09542v1.pdf","comment":"Accepted at MILLanD workshop (MICCAI)"},{"id":"http://arxiv.org/abs/2308.09540v1","updated":"2023-08-18T13:17:07Z","published":"2023-08-18T13:17:07Z","title":"Meta-ZSDETR: Zero-shot DETR with Meta-learning","summary":" Zero-shot object detection aims to localize and recognize objects of unseen\nclasses. Most of existing works face two problems: the low recall of RPN in\nunseen classes and the confusion of unseen classes with background. In this\npaper, we present the first method that combines DETR and meta-learning to\nperform zero-shot object detection, named Meta-ZSDETR, where model training is\nformalized as an individual episode based meta-learning task. Different from\nFaster R-CNN based methods that firstly generate class-agnostic proposals, and\nthen classify them with visual-semantic alignment module, Meta-ZSDETR directly\npredict class-specific boxes with class-specific queries and further filter\nthem with the predicted accuracy from classification head. The model is\noptimized with meta-contrastive learning, which contains a regression head to\ngenerate the coordinates of class-specific boxes, a classification head to\npredict the accuracy of generated boxes, and a contrastive head that utilizes\nthe proposed contrastive-reconstruction loss to further separate different\nclasses in visual space. We conduct extensive experiments on two benchmark\ndatasets MS COCO and PASCAL VOC. Experimental results show that our method\noutperforms the existing ZSD methods by a large margin.\n","authors":["Lu Zhang","Chenbo Zhang","Jiajia Zhao","Jihong Guan","Shuigeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09540v1.pdf","comment":"Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09538v1","updated":"2023-08-18T13:16:00Z","published":"2023-08-18T13:16:00Z","title":"Uncertainty-based quality assurance of carotid artery wall segmentation\n in black-blood MRI","summary":" The application of deep learning models to large-scale data sets requires\nmeans for automatic quality assurance. We have previously developed a fully\nautomatic algorithm for carotid artery wall segmentation in black-blood MRI\nthat we aim to apply to large-scale data sets. This method identifies nested\nartery walls in 3D patches centered on the carotid artery. In this study, we\ninvestigate to what extent the uncertainty in the model predictions for the\ncontour location can serve as a surrogate for error detection and,\nconsequently, automatic quality assurance. We express the quality of automatic\nsegmentations using the Dice similarity coefficient. The uncertainty in the\nmodel's prediction is estimated using either Monte Carlo dropout or test-time\ndata augmentation. 
We found that (1) including uncertainty measurements did not\ndegrade the quality of the segmentations, (2) uncertainty metrics provide a\ngood proxy of the quality of our contours if the center found during the first\nstep is enclosed in the lumen of the carotid artery and (3) they could be used\nto detect low-quality segmentations at the participant level. This automatic\nquality assurance tool might enable the application of our model in large-scale\ndata sets.\n","authors":["Elina Thibeau-Sutre","Dieuwertje Alblas","Sophie Buurman","Christoph Brune","Jelmer M. Wolterink"],"pdf_url":"https://arxiv.org/pdf/2308.09538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09534v1","updated":"2023-08-18T13:13:09Z","published":"2023-08-18T13:13:09Z","title":"Small Object Detection via Coarse-to-fine Proposal Generation and\n Imitation Learning","summary":" The past few years have witnessed the immense success of object detection,\nwhile current excellent detectors struggle on tackling size-limited instances.\nConcretely, the well-known challenge of low overlaps between the priors and\nobject regions leads to a constrained sample pool for optimization, and the\npaucity of discriminative information further aggravates the recognition. To\nalleviate the aforementioned issues, we propose CFINet, a two-stage framework\ntailored for small object detection based on the Coarse-to-fine pipeline and\nFeature Imitation learning. Firstly, we introduce Coarse-to-fine RPN (CRPN) to\nensure sufficient and high-quality proposals for small objects through the\ndynamic anchor selection strategy and cascade regression. Then, we equip the\nconventional detection head with a Feature Imitation (FI) branch to facilitate\nthe region representations of size-limited instances that perplex the model in\nan imitation manner. Moreover, an auxiliary imitation loss following supervised\ncontrastive learning paradigm is devised to optimize this branch. When\nintegrated with Faster RCNN, CFINet achieves state-of-the-art performance on\nthe large-scale small object detection benchmarks, SODA-D and SODA-A,\nunderscoring its superiority over baseline detector and other mainstream\ndetection approaches.\n","authors":["Xiang Yuan","Gong Cheng","Kebing Yan","Qinghua Zeng","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2308.09534v1.pdf","comment":"Camera-ready version for ICCV2023. Our code will be available at\n https://github.com/shaunyuan22/CFINet"},{"id":"http://arxiv.org/abs/2306.16080v2","updated":"2023-08-18T13:11:02Z","published":"2023-06-28T10:27:17Z","title":"A serial dual-channel library occupancy detection system based on Faster\n RCNN","summary":" The phenomenon of seat occupancy in university libraries is a prevalent\nissue. However, existing solutions, such as software-based seat reservations\nand sensors-based occupancy detection, have proven to be inadequate in\neffectively addressing this problem. In this study, we propose a novel\napproach: a serial dual-channel object detection model based on Faster RCNN.\nThis model is designed to discern all instances of occupied seats within the\nlibrary and continuously update real-time information regarding seat occupancy\nstatus. To train the neural network, a distinctive dataset is utilized, which\nblends virtual images generated using Unreal Engine 5 (UE5) with real-world\nimages. 
Notably, our test results underscore the remarkable performance uplift\nattained through the application of self-generated virtual datasets in training\nConvolutional Neural Networks (CNNs), particularly within specialized\nscenarios. Furthermore, this study introduces a pioneering detection model that\nseamlessly amalgamates the Faster R-CNN-based object detection framework with a\ntransfer learning-based object classification algorithm. This amalgamation not\nonly significantly curtails the computational resources and time investments\nneeded for neural network training but also considerably heightens the\nefficiency of single-frame detection rates. Additionally, a user-friendly web\ninterface and a mobile application have been meticulously developed,\nconstituting a computer vision-driven platform for detecting seat occupancy\nwithin library premises. Noteworthy is the substantial enhancement in seat\noccupancy recognition accuracy, coupled with a reduction in computational\nresources required for neural network training, collectively contributing to a\nconsiderable amplification in the overall efficiency of library seat\nmanagement.\n","authors":["Guoqiang Yang","Xiaowen Chang","Zitong Wang","Min Yang"],"pdf_url":"https://arxiv.org/pdf/2306.16080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09525v1","updated":"2023-08-18T13:05:10Z","published":"2023-08-18T13:05:10Z","title":"Improving 3D Pose Estimation for Sign Language","summary":" This work addresses 3D human pose reconstruction in single images. We present\na method that combines Forward Kinematics (FK) with neural networks to ensure a\nfast and valid prediction of 3D pose. Pose is represented as a hierarchical\ntree/graph with nodes corresponding to human joints that model their physical\nlimits. Given a 2D detection of keypoints in the image, we lift the skeleton to\n3D using neural networks to predict both the joint rotations and bone lengths.\nThese predictions are then combined with skeletal constraints using an FK layer\nimplemented as a network layer in PyTorch. The result is a fast and accurate\napproach to the estimation of 3D skeletal pose. Through quantitative and\nqualitative evaluation, we demonstrate the method is significantly more\naccurate than MediaPipe in terms of both per joint positional error and visual\nappearance. Furthermore, we demonstrate generalization over different datasets.\nThe implementation in PyTorch runs at between 100-200 milliseconds per image\n(including CNN detection) using CPU only.\n","authors":["Maksym Ivashechkin","Oscar Mendez","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2308.09525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09523v1","updated":"2023-08-18T12:57:22Z","published":"2023-08-18T12:57:22Z","title":"Denoising Diffusion for 3D Hand Pose Estimation from Images","summary":" Hand pose estimation from a single image has many applications. However,\napproaches to full 3D body pose estimation are typically trained on day-to-day\nactivities or actions. As such, detailed hand-to-hand interactions are poorly\nrepresented, especially during motion. We see this in the failure cases of\ntechniques such as OpenPose or MediaPipe. However, accurate hand pose\nestimation is crucial for many applications where the global body motion is\nless important than accurate hand pose estimation.\n This paper addresses the problem of 3D hand pose estimation from monocular\nimages or sequences. 
We present a novel end-to-end framework for 3D hand\nregression that employs diffusion models that have shown excellent ability to\ncapture the distribution of data for generative purposes. Moreover, we enforce\nkinematic constraints to ensure realistic poses are generated by incorporating\nan explicit forward kinematic layer as part of the network. The proposed model\nprovides state-of-the-art performance when lifting a 2D single-hand image to\n3D. However, when sequence data is available, we add a Transformer module over\na temporal window of consecutive frames to refine the results, overcoming\njittering and further increasing accuracy.\n The method is quantitatively and qualitatively evaluated showing\nstate-of-the-art robustness, generalization, and accuracy on several different\ndatasets.\n","authors":["Maksym Ivashechkin","Oscar Mendez","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2308.09523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14277v2","updated":"2023-08-18T12:54:20Z","published":"2023-07-26T16:14:21Z","title":"G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and\n Game Theory","summary":" The recent video grounding works attempt to introduce vanilla contrastive\nlearning into video grounding. However, we claim that this naive solution is\nsuboptimal. Contrastive learning requires two key properties: (1)\n\\emph{alignment} of features of similar samples, and (2) \\emph{uniformity} of\nthe induced distribution of the normalized features on the hypersphere. Due to\ntwo annoying issues in video grounding: (1) the co-existence of some visual\nentities in both ground truth and other moments, \\ie semantic overlapping; (2)\nonly a few moments in the video are annotated, \\ie sparse annotation dilemma,\nvanilla contrastive learning is unable to model the correlations between\ntemporally distant moments and learned inconsistent video representations. Both\ncharacteristics lead to vanilla contrastive learning being unsuitable for video\ngrounding. In this paper, we introduce Geodesic and Game Localization (G2L), a\nsemantically aligned and uniform video grounding framework via geodesic and\ngame theory. We quantify the correlations among moments leveraging the geodesic\ndistance that guides the model to learn the correct cross-modal\nrepresentations. Furthermore, from the novel perspective of game theory, we\npropose semantic Shapley interaction based on geodesic distance sampling to\nlearn fine-grained semantic alignment in similar moments. Experiments on three\nbenchmarks demonstrate the effectiveness of our method.\n","authors":["Hongxiang Li","Meng Cao","Xuxin Cheng","Yaowei Li","Zhihong Zhu","Yuexian Zou"],"pdf_url":"https://arxiv.org/pdf/2307.14277v2.pdf","comment":"ICCV2023 oral"},{"id":"http://arxiv.org/abs/2308.09519v1","updated":"2023-08-18T12:54:12Z","published":"2023-08-18T12:54:12Z","title":"Leveraging Intrinsic Properties for Non-Rigid Garment Alignment","summary":" We address the problem of aligning real-world 3D data of garments, which\nbenefits many applications such as texture learning, physical parameter\nestimation, generative modeling of garments, etc. Existing extrinsic methods\ntypically perform non-rigid iterative closest point and struggle to align\ndetails due to incorrect closest matches and rigidity constraints. While\nintrinsic methods based on functional maps can produce high-quality\ncorrespondences, they work under isometric assumptions and become unreliable\nfor garment deformations which are highly non-isometric. 
To achieve\nwrinkle-level as well as texture-level alignment, we present a novel\ncoarse-to-fine two-stage method that leverages intrinsic manifold properties\nwith two neural deformation fields, in the 3D space and the intrinsic space,\nrespectively. The coarse stage performs a 3D fitting, where we leverage\nintrinsic manifold properties to define a manifold deformation field. The\ncoarse fitting then induces a functional map that produces an alignment of\nintrinsic embeddings. We further refine the intrinsic alignment with a second\nneural deformation field for higher accuracy. We evaluate our method with our\ncaptured garment dataset, GarmCap. The method achieves accurate wrinkle-level\nand texture-level alignment and works for difficult garment types such as long\ncoats. Our project page is\nhttps://jsnln.github.io/iccv2023_intrinsic/index.html.\n","authors":["Siyou Lin","Boyao Zhou","Zerong Zheng","Hongwen Zhang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09519v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2212.00786v4","updated":"2023-08-18T12:51:38Z","published":"2022-12-01T18:59:21Z","title":"3D Segmentation of Humans in Point Clouds with Synthetic Data","summary":" Segmenting humans in 3D indoor scenes has become increasingly important with\nthe rise of human-centered robotics and AR/VR applications. To this end, we\npropose the task of joint 3D human semantic segmentation, instance segmentation\nand multi-human body-part segmentation. Few works have attempted to directly\nsegment humans in cluttered 3D scenes, which is largely due to the lack of\nannotated training data of humans interacting with 3D scenes. We address this\nchallenge and propose a framework for generating training data of synthetic\nhumans interacting with real 3D scenes. Furthermore, we propose a novel\ntransformer-based model, Human3D, which is the first end-to-end model for\nsegmenting multiple human instances and their body-parts in a unified manner.\nThe key advantage of our synthetic data generation framework is its ability to\ngenerate diverse and realistic human-scene interactions, with highly accurate\nground truth. Our experiments show that pre-training on synthetic data improves\nperformance on a wide variety of 3D human segmentation tasks. Finally, we\ndemonstrate that Human3D outperforms even task-specific state-of-the-art 3D\nsegmentation methods.\n","authors":["Ayça Takmaz","Jonas Schult","Irem Kaftan","Mertcan Akçay","Bastian Leibe","Robert Sumner","Francis Engelmann","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2212.00786v4.pdf","comment":"project page: https://human-3d.github.io/"},{"id":"http://arxiv.org/abs/2308.09515v1","updated":"2023-08-18T12:47:18Z","published":"2023-08-18T12:47:18Z","title":"Learnt Contrastive Concept Embeddings for Sign Recognition","summary":" In natural language processing (NLP) of spoken languages, word embeddings\nhave been shown to be a useful method to encode the meaning of words. Sign\nlanguages are visual languages, which require sign embeddings to capture the\nvisual and linguistic semantics of sign. Unlike many common approaches to Sign\nRecognition, we focus on explicitly creating sign embeddings that bridge the\ngap between sign language and spoken language. We propose a learning framework\nto derive LCC (Learnt Contrastive Concept) embeddings for sign language, a\nweakly supervised contrastive approach to learning sign embeddings. We train a\nvocabulary of embeddings that are based on the linguistic labels for sign\nvideo. 
Additionally, we develop a conceptual similarity loss which is able to\nutilise word embeddings from NLP methods to create sign embeddings that have\nbetter sign language to spoken language correspondence. These learnt\nrepresentations allow the model to automatically localise the sign in time. Our\napproach achieves state-of-the-art keypoint-based sign recognition performance\non the WLASL and BOBSL datasets.\n","authors":["Ryan Wong","Necati Cihan Camgoz","Richard Bowden"],"pdf_url":"https://arxiv.org/pdf/2308.09515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09511v1","updated":"2023-08-18T12:41:10Z","published":"2023-08-18T12:41:10Z","title":"ResQ: Residual Quantization for Video Perception","summary":" This paper accelerates video perception, such as semantic segmentation and\nhuman pose estimation, by levering cross-frame redundancies. Unlike the\nexisting approaches, which avoid redundant computations by warping the past\nfeatures using optical-flow or by performing sparse convolutions on frame\ndifferences, we approach the problem from a new perspective: low-bit\nquantization. We observe that residuals, as the difference in network\nactivations between two neighboring frames, exhibit properties that make them\nhighly quantizable. Based on this observation, we propose a novel quantization\nscheme for video networks coined as Residual Quantization. ResQ extends the\nstandard, frame-by-frame, quantization scheme by incorporating temporal\ndependencies that lead to better performance in terms of accuracy vs.\nbit-width. Furthermore, we extend our model to dynamically adjust the bit-width\nproportional to the amount of changes in the video. We demonstrate the\nsuperiority of our model, against the standard quantization and existing\nefficient video perception models, using various architectures on semantic\nsegmentation and human pose estimation benchmarks.\n","authors":["Davide Abati","Haitam Ben Yahia","Markus Nagel","Amirhossein Habibian"],"pdf_url":"https://arxiv.org/pdf/2308.09511v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2211.10181v2","updated":"2023-08-18T12:35:59Z","published":"2022-11-18T11:59:37Z","title":"LVOS: A Benchmark for Long-term Video Object Segmentation","summary":" Existing video object segmentation (VOS) benchmarks focus on short-term\nvideos which just last about 3-5 seconds and where objects are visible most of\nthe time. These videos are poorly representative of practical applications, and\nthe absence of long-term datasets restricts further investigation of VOS on the\napplication in realistic scenarios. So, in this paper, we present a new\nbenchmark dataset named \\textbf{LVOS}, which consists of 220 videos with a\ntotal duration of 421 minutes. To the best of our knowledge, LVOS is the first\ndensely annotated long-term VOS dataset. The videos in our LVOS last 1.59\nminutes on average, which is 20 times longer than videos in existing VOS\ndatasets. Each video includes various attributes, especially challenges\nderiving from the wild, such as long-term reappearing and cross-temporal\nsimilar objeccts.Based on LVOS, we assess existing video object segmentation\nalgorithms and propose a Diverse Dynamic Memory network (DDMemory) that\nconsists of three complementary memory banks to exploit temporal information\nadequately. The experimental results demonstrate the strength and weaknesses of\nprior methods, pointing promising directions for further study. 
Data and code\nare available at https://lingyihongfd.github.io/lvos.github.io/.\n","authors":["Lingyi Hong","Wenchao Chen","Zhongying Liu","Wei Zhang","Pinxue Guo","Zhaoyu Chen","Wenqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.10181v2.pdf","comment":"Accepted by ICCV 2023. Project page:\n https://lingyihongfd.github.io/lvos.github.io/"},{"id":"http://arxiv.org/abs/2303.06040v3","updated":"2023-08-18T12:31:45Z","published":"2023-03-10T16:30:09Z","title":"Importance of Aligning Training Strategy with Evaluation for Diffusion\n Models in 3D Multiclass Segmentation","summary":" Recently, denoising diffusion probabilistic models (DDPM) have been applied\nto image segmentation by generating segmentation masks conditioned on images,\nwhile the applications were mainly limited to 2D networks without exploiting\npotential benefits from the 3D formulation. In this work, we studied the\nDDPM-based segmentation model for 3D multiclass segmentation on two large\nmulticlass data sets (prostate MR and abdominal CT). We observed that the\ndifference between training and test methods led to inferior performance for\nexisting DDPM methods. To mitigate the inconsistency, we proposed a recycling\nmethod which generated corrupted masks based on the model's prediction at a\nprevious time step instead of using ground truth. The proposed method achieved\nstatistically significantly improved performance compared to existing DDPMs,\nindependent of a number of other techniques for reducing train-test\ndiscrepancy, including performing mask prediction, using Dice loss, and\nreducing the number of diffusion time steps during training. The performance of\ndiffusion models was also competitive and visually similar to\nnon-diffusion-based U-net, within the same compute budget. The JAX-based\ndiffusion framework has been released at\nhttps://github.com/mathpluscode/ImgX-DiffSeg.\n","authors":["Yunguan Fu","Yiwen Li","Shaheer U. Saeed","Matthew J. Clarkson","Yipeng Hu"],"pdf_url":"https://arxiv.org/pdf/2303.06040v3.pdf","comment":"Accepted at Deep Generative Models workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2306.08370v2","updated":"2023-08-18T12:27:27Z","published":"2023-06-14T09:01:50Z","title":"Object Detection in Hyperspectral Image via Unified Spectral-Spatial\n Feature Aggregation","summary":" Deep learning-based hyperspectral image (HSI) classification and object\ndetection techniques have gained significant attention due to their vital role\nin image content analysis, interpretation, and wider HSI applications. However,\ncurrent hyperspectral object detection approaches predominantly emphasize\neither spectral or spatial information, overlooking the valuable complementary\nrelationship between these two aspects. In this study, we present a novel\n\\textbf{S}pectral-\\textbf{S}patial \\textbf{A}ggregation (S2ADet) object\ndetector that effectively harnesses the rich spectral and spatial complementary\ninformation inherent in hyperspectral images. S2ADet comprises a hyperspectral\ninformation decoupling (HID) module, a two-stream feature extraction network,\nand a one-stage detection head. The HID module processes hyperspectral images\nby aggregating spectral and spatial information via band selection and\nprincipal components analysis, consequently reducing redundancy. 
Based on the\nacquired spatial and spectral aggregation information, we propose a feature\naggregation two-stream network for interacting spectral-spatial features.\nFurthermore, to address the limitations of existing databases, we annotate an\nextensive dataset, designated as HOD3K, containing 3,242 hyperspectral images\ncaptured across diverse real-world scenes and encompassing three object\nclasses. These images possess a resolution of 512x256 pixels and cover 16 bands\nranging from 470 nm to 620 nm. Comprehensive experiments on two datasets\ndemonstrate that S2ADet surpasses existing state-of-the-art methods, achieving\nrobust and reliable results. The demo code and dataset of this work are\npublicly available at \\url{https://github.com/hexiao-cs/S2ADet}.\n","authors":["Xiao He","Chang Tang","Xinwang Liu","Wei Zhang","Kun Sun","Jiangfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2306.08370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15225v2","updated":"2023-08-18T12:17:26Z","published":"2023-03-27T14:05:34Z","title":"GP-PCS: One-shot Feature-Preserving Point Cloud Simplification with\n Gaussian Processes on Riemannian Manifolds","summary":" The processing, storage and transmission of large-scale point clouds is an\nongoing challenge in the computer vision community which hinders progress in\nthe application of 3D models to real-world settings, such as autonomous\ndriving, virtual reality and remote sensing. We propose a novel, one-shot point\ncloud simplification method which preserves both the salient structural\nfeatures and the overall shape of a point cloud without any prior surface\nreconstruction step. Our method employs Gaussian processes suitable for\nfunctions defined on Riemannian manifolds, allowing us to model the surface\nvariation function across any given point cloud. A simplified version of the\noriginal cloud is obtained by sequentially selecting points using a greedy\nsparsification scheme. The selection criterion used for this scheme ensures\nthat the simplified cloud best represents the surface variation of the original\npoint cloud. We evaluate our method on several benchmark and self-acquired\npoint clouds, compare it to a range of existing methods, demonstrate its\napplication in downstream tasks of registration and surface reconstruction, and\nshow that our method is competitive both in terms of empirical performance and\ncomputational efficiency.\n","authors":["Stuti Pathak","Thomas M. McDonald","Rudi Penne"],"pdf_url":"https://arxiv.org/pdf/2303.15225v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2307.09066v2","updated":"2023-08-18T11:53:27Z","published":"2023-07-18T08:37:37Z","title":"PatchCT: Aligning Patch Set and Label Set with Conditional Transport for\n Multi-Label Image Classification","summary":" Multi-label image classification is a prediction task that aims to identify\nmore than one label from a given image. This paper considers the semantic\nconsistency of the latent space between the visual patch and linguistic label\ndomains and introduces the conditional transport (CT) theory to bridge the\nacknowledged gap. While recent cross-modal attention-based studies have\nattempted to align such two representations and achieved impressive\nperformance, they required carefully-designed alignment modules and extra\ncomplex operations in the attention computation. 
We find that by formulating\nthe multi-label classification as a CT problem, we can exploit the interactions\nbetween the image and label efficiently by minimizing the bidirectional CT\ncost. Specifically, after feeding the images and textual labels into the\nmodality-specific encoders, we view each image as a mixture of patch embeddings\nand a mixture of label embeddings, which capture the local region features and\nthe class prototypes, respectively. CT is then employed to learn and align\nthose two semantic sets by defining the forward and backward navigators.\nImportantly, the defined navigators in CT distance model the similarities\nbetween patches and labels, which provides an interpretable tool to visualize\nthe learned prototypes. Extensive experiments on three public image benchmarks\nshow that the proposed model consistently outperforms the previous methods.\n","authors":["Miaoge Li","Dongsheng Wang","Xinyang Liu","Zequn Zeng","Ruiying Lu","Bo Chen","Mingyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.09066v2.pdf","comment":"accepted by ICCV23"},{"id":"http://arxiv.org/abs/2303.09051v2","updated":"2023-08-18T11:34:10Z","published":"2023-03-16T02:47:59Z","title":"Robust Evaluation of Diffusion-Based Adversarial Purification","summary":" We question the current evaluation practice on diffusion-based purification\nmethods. Diffusion-based purification methods aim to remove adversarial effects\nfrom an input data point at test time. The approach gains increasing attention\nas an alternative to adversarial training due to the disentangling between\ntraining and testing. Well-known white-box attacks are often employed to\nmeasure the robustness of the purification. However, it is unknown whether\nthese attacks are the most effective for the diffusion-based purification since\nthe attacks are often tailored for adversarial training. We analyze the current\npractices and provide a new guideline for measuring the robustness of\npurification methods against adversarial attacks. Based on our analysis, we\nfurther propose a new purification strategy improving robustness compared to\nthe current diffusion-based purification methods.\n","authors":["Minjong Lee","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.09051v2.pdf","comment":"Accepted by ICCV 2023, Oral presentation"},{"id":"http://arxiv.org/abs/2308.09475v1","updated":"2023-08-18T11:24:06Z","published":"2023-08-18T11:24:06Z","title":"Video-Instrument Synergistic Network for Referring Video Instrument\n Segmentation in Robotic Surgery","summary":" Robot-assisted surgery has made significant progress, with instrument\nsegmentation being a critical factor in surgical intervention quality. It\nserves as the building block to facilitate surgical robot navigation and\nsurgical education for the next generation of operating intelligence. Although\nexisting methods have achieved accurate instrument segmentation results, they\nsimultaneously generate segmentation masks for all instruments, without the\ncapability to specify a target object and allow an interactive experience. This\nwork explores a new task of Referring Surgical Video Instrument Segmentation\n(RSVIS), which aims to automatically identify and segment the corresponding\nsurgical instruments based on the given language expression. To achieve this,\nwe devise a novel Video-Instrument Synergistic Network (VIS-Net) to learn both\nvideo-level and instrument-level knowledge to boost performance, while previous\nwork only used video-level information. 
Meanwhile, we design a Graph-based\nRelation-aware Module (GRM) to model the correlation between multi-modal\ninformation (i.e., textual description and video frame) to facilitate the\nextraction of instrument-level information. We are also the first to produce\ntwo RSVIS datasets to promote related research. Our method is verified on these\ndatasets, and experimental results exhibit that the VIS-Net can significantly\noutperform existing state-of-the-art referring segmentation methods. Our code\nand our datasets will be released upon the publication of this work.\n","authors":["Hongqiu Wang","Lei Zhu","Guang Yang","Yike Guo","Shichen Zhang","Bo Xu","Yueming Jin"],"pdf_url":"https://arxiv.org/pdf/2308.09475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09472v1","updated":"2023-08-18T11:15:31Z","published":"2023-08-18T11:15:31Z","title":"Vision Relation Transformer for Unbiased Scene Graph Generation","summary":" Recent years have seen a growing interest in Scene Graph Generation (SGG), a\ncomprehensive visual scene understanding task that aims to predict entity\nrelationships using a relation encoder-decoder pipeline stacked on top of an\nobject encoder-decoder backbone. Unfortunately, current SGG methods suffer from\nan information loss regarding the entities local-level cues during the relation\nencoding process. To mitigate this, we introduce the Vision rElation\nTransfOrmer (VETO), consisting of a novel local-level entity relation encoder.\nWe further observe that many existing SGG methods claim to be unbiased, but are\nstill biased towards either head or tail classes. To overcome this bias, we\nintroduce a Mutually Exclusive ExperT (MEET) learning strategy that captures\nimportant relation features without bias towards head or tail classes.\nExperimental results on the VG and GQA datasets demonstrate that VETO + MEET\nboosts the predictive performance by up to 47 percentage over the state of the\nart while being 10 times smaller.\n","authors":["Gopika Sudhakaran","Devendra Singh Dhami","Kristian Kersting","Stefan Roth"],"pdf_url":"https://arxiv.org/pdf/2308.09472v1.pdf","comment":"Accepted for publication in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09467v1","updated":"2023-08-18T11:07:39Z","published":"2023-08-18T11:07:39Z","title":"Quantitative Susceptibility Mapping through Model-based Deep Image Prior\n (MoDIP)","summary":" The data-driven approach of supervised learning methods has limited\napplicability in solving dipole inversion in Quantitative Susceptibility\nMapping (QSM) with varying scan parameters across different objects. To address\nthis generalization issue in supervised QSM methods, we propose a novel\ntraining-free model-based unsupervised method called MoDIP (Model-based Deep\nImage Prior). MoDIP comprises a small, untrained network and a Data Fidelity\nOptimization (DFO) module. The network converges to an interim state, acting as\nan implicit prior for image regularization, while the optimization process\nenforces the physical model of QSM dipole inversion. Experimental results\ndemonstrate MoDIP's excellent generalizability in solving QSM dipole inversion\nacross different scan parameters. It exhibits robustness against pathological\nbrain QSM, achieving over 32% accuracy improvement than supervised deep\nlearning and traditional iterative methods. 
It is also 33% more computationally\nefficient and runs 4 times faster than conventional DIP-based approaches,\nenabling 3D high-resolution image reconstruction in under 4.5 minutes.\n","authors":["Zhuang Xiong","Yang Gao","Yin Liu","Amir Fazlollahi","Peter Nestor","Feng Liu","Hongfu Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08376v2","updated":"2023-08-18T11:03:04Z","published":"2023-08-16T13:59:43Z","title":"Automated Semiconductor Defect Inspection in Scanning Electron\n Microscope Images: a Systematic Review","summary":" A growing need exists for efficient and accurate methods for detecting\ndefects in semiconductor materials and devices. These defects can have a\ndetrimental impact on the efficiency of the manufacturing process, because they\ncause critical failures and wafer-yield limitations. As nodes and patterns get\nsmaller, even high-resolution imaging techniques such as Scanning Electron\nMicroscopy (SEM) produce noisy images due to operating close to sensitivity\nlevels and due to varying physical properties of different underlayers or\nresist materials. This inherent noise is one of the main challenges for defect\ninspection. One promising approach is the use of machine learning algorithms,\nwhich can be trained to accurately classify and locate defects in semiconductor\nsamples. Recently, convolutional neural networks have proved to be particularly\nuseful in this regard. This systematic review provides a comprehensive overview\nof the state of automated semiconductor defect inspection on SEM images,\nincluding the most recent innovations and developments. 38 publications were\nselected on this topic, indexed in IEEE Xplore and SPIE databases. For each of\nthese, the application, methodology, dataset, results, limitations and future\nwork were summarized. A comprehensive overview and analysis of their methods is\nprovided. Finally, promising avenues for future work in the field of SEM-based\ndefect inspection are suggested.\n","authors":["Thibault Lechien","Enrique Dehaerne","Bappaditya Dey","Victor Blanco","Sandip Halder","Stefan De Gendt","Wannes Meert"],"pdf_url":"https://arxiv.org/pdf/2308.08376v2.pdf","comment":"16 pages, 12 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.09464v1","updated":"2023-08-18T11:02:27Z","published":"2023-08-18T11:02:27Z","title":"Data augmentation and explainability for bias discovery and mitigation\n in deep learning","summary":" This dissertation explores the impact of bias in deep neural networks and\npresents methods for reducing its influence on model performance. The first\npart begins by categorizing and describing potential sources of bias and errors\nin data and models, with a particular focus on bias in machine learning\npipelines. The next chapter outlines a taxonomy and methods of Explainable AI\nas a way to justify predictions and control and improve the model. Then, as an\nexample of a laborious manual data inspection and bias discovery process, a\nskin lesion dataset is manually examined. A Global Explanation for the Bias\nIdentification method is proposed as an alternative semi-automatic approach to\nmanual data exploration for discovering potential biases in data. Relevant\nnumerical methods and metrics are discussed for assessing the effects of the\nidentified biases on the model. Whereas identifying errors and bias is\ncritical, improving the model and reducing the number of flaws in the future is\nan absolute priority. 
Hence, the second part of the thesis focuses on\nmitigating the influence of bias on ML models. Three approaches are proposed\nand discussed: Style Transfer Data Augmentation, Targeted Data Augmentations,\nand Attribution Feedback. Style Transfer Data Augmentation aims to address\nshape and texture bias by merging a style of a malignant lesion with a\nconflicting shape of a benign one. Targeted Data Augmentations randomly insert\npossible biases into all images in the dataset during the training, as a way to\nmake the process random and, thus, destroy spurious correlations. Lastly,\nAttribution Feedback is used to fine-tune the model to improve its accuracy by\neliminating obvious mistakes and teaching it to ignore insignificant input\nparts via an attribution loss. The goal of these approaches is to reduce the\ninfluence of bias on machine learning models, rather than eliminate it\nentirely.\n","authors":["Agnieszka Mikołajczyk-Bareła"],"pdf_url":"https://arxiv.org/pdf/2308.09464v1.pdf","comment":"A PhD Thesis"},{"id":"http://arxiv.org/abs/2308.09460v1","updated":"2023-08-18T10:55:49Z","published":"2023-08-18T10:55:49Z","title":"Accelerated Bayesian imaging by relaxed proximal-point Langevin sampling","summary":" This paper presents a new accelerated proximal Markov chain Monte Carlo\nmethodology to perform Bayesian inference in imaging inverse problems with an\nunderlying convex geometry. The proposed strategy takes the form of a\nstochastic relaxed proximal-point iteration that admits two complementary\ninterpretations. For models that are smooth or regularised by Moreau-Yosida\nsmoothing, the algorithm is equivalent to an implicit midpoint discretisation\nof an overdamped Langevin diffusion targeting the posterior distribution of\ninterest. This discretisation is asymptotically unbiased for Gaussian targets\nand shown to converge in an accelerated manner for any target that is\n$\\kappa$-strongly log-concave (i.e., requiring in the order of $\\sqrt{\\kappa}$\niterations to converge, similarly to accelerated optimisation schemes),\ncomparing favorably to [M. Pereyra, L. Vargas Mieles, K.C. Zygalakis, SIAM J.\nImaging Sciences, 13, 2 (2020), pp. 905-935] which is only provably accelerated\nfor Gaussian targets and has bias. For models that are not smooth, the\nalgorithm is equivalent to a Leimkuhler-Matthews discretisation of a Langevin\ndiffusion targeting a Moreau-Yosida approximation of the posterior distribution\nof interest, and hence achieves a significantly lower bias than conventional\nunadjusted Langevin strategies based on the Euler-Maruyama discretisation. For\ntargets that are $\\kappa$-strongly log-concave, the provided non-asymptotic\nconvergence analysis also identifies the optimal time step which maximizes the\nconvergence speed. The proposed methodology is demonstrated through a range of\nexperiments related to image deconvolution with Gaussian and Poisson noise,\nwith assumption-driven and data-driven convex priors.\n","authors":["Teresa Klatzer","Paul Dobson","Yoann Altmann","Marcelo Pereyra","Jesús María Sanz-Serna","Konstantinos C. Zygalakis"],"pdf_url":"https://arxiv.org/pdf/2308.09460v1.pdf","comment":"34 pages, 13 figures"},{"id":"http://arxiv.org/abs/2212.05680v2","updated":"2023-08-18T10:46:35Z","published":"2022-12-12T03:35:05Z","title":"REAP: A Large-Scale Realistic Adversarial Patch Benchmark","summary":" Machine learning models are known to be susceptible to adversarial\nperturbation. 
One famous attack is the adversarial patch, a sticker with a\nparticularly crafted pattern that makes the model incorrectly predict the\nobject it is placed on. This attack presents a critical threat to\ncyber-physical systems that rely on cameras such as autonomous cars. Despite\nthe significance of the problem, conducting research in this setting has been\ndifficult; evaluating attacks and defenses in the real world is exceptionally\ncostly while synthetic data are unrealistic. In this work, we propose the REAP\n(REalistic Adversarial Patch) benchmark, a digital benchmark that allows the\nuser to evaluate patch attacks on real images, and under real-world conditions.\nBuilt on top of the Mapillary Vistas dataset, our benchmark contains over\n14,000 traffic signs. Each sign is augmented with a pair of geometric and\nlighting transformations, which can be used to apply a digitally generated\npatch realistically onto the sign. Using our benchmark, we perform the first\nlarge-scale assessments of adversarial patch attacks under realistic\nconditions. Our experiments suggest that adversarial patch attacks may present\na smaller threat than previously believed and that the success rate of an\nattack on simpler digital simulations is not predictive of its actual\neffectiveness in practice. We release our benchmark publicly at\nhttps://github.com/wagner-group/reap-benchmark.\n","authors":["Nabeel Hingun","Chawin Sitawarin","Jerry Li","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2212.05680v2.pdf","comment":"ICCV 2023. Code and benchmark can be found at\n https://github.com/wagner-group/reap-benchmark"},{"id":"http://arxiv.org/abs/2308.09455v1","updated":"2023-08-18T10:40:25Z","published":"2023-08-18T10:40:25Z","title":"Artificial-Spiking Hierarchical Networks for Vision-Language\n Representation Learning","summary":" With the success of self-supervised learning, multimodal foundation models\nhave rapidly adapted a wide range of downstream tasks driven by vision and\nlanguage (VL) pretraining. State-of-the-art methods achieve impressive\nperformance by pre-training on large-scale datasets. However, bridging the\nsemantic gap between the two modalities remains a nonnegligible challenge for\nVL tasks. In this work, we propose an efficient computation framework for\nmultimodal alignment by introducing a novel visual semantic module to further\nimprove the performance of the VL tasks. Specifically, we propose a flexible\nmodel, namely Artificial-Spiking Hierarchical Networks (ASH-Nets), which\ncombines the complementary advantages of Artificial neural networks (ANNs) and\nSpiking neural networks (SNNs) to enrich visual semantic representations. In\nparticular, a visual concrete encoder and a semantic abstract encoder are\nconstructed to learn continuous and discrete latent variables to enhance the\nflexibility of semantic encoding. Considering the spatio-temporal properties of\nSNNs modeling, we introduce a contrastive learning method to optimize the\ninputs of similar samples. This can improve the computational efficiency of the\nhierarchical network, while the augmentation of hard samples is beneficial to\nthe learning of visual representations. 
Furthermore, the Spiking to Text\nUni-Alignment Learning (STUA) pre-training method is proposed, which only\nrelies on text features to enhance the encoding ability of abstract semantics.\nWe validate the performance on multiple well-established downstream VL tasks.\nExperiments show that the proposed ASH-Nets achieve competitive results.\n","authors":["Yeming Chen","Siyu Zhang","Yaoru Sun","Weijian Liang","Haoran Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09437v1","updated":"2023-08-18T10:07:46Z","published":"2023-08-18T10:07:46Z","title":"From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the\n Right Reasons in Latent Space","summary":" Deep Neural Networks are prone to learning spurious correlations embedded in\nthe training data, leading to potentially biased predictions. This poses risks\nwhen deploying these models for high-stake decision-making, such as in medical\napplications. Current methods for post-hoc model correction either require\ninput-level annotations, which are only possible for spatially localized\nbiases, or augment the latent feature space, thereby hoping to enforce the\nright reasons. We present a novel method ensuring the right reasons on the\nconcept level by reducing the model's sensitivity towards biases through the\ngradient. When modeling biases via Concept Activation Vectors, we highlight the\nimportance of choosing robust directions, as traditional regression-based\napproaches such as Support Vector Machines tend to result in diverging\ndirections. We effectively mitigate biases in controlled and real-world\nsettings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet\nand EfficientNet architectures.\n","authors":["Maximilian Dreyer","Frederik Pahde","Christopher J. Anders","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2308.09437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09436v1","updated":"2023-08-18T10:07:38Z","published":"2023-08-18T10:07:38Z","title":"Transformer-based Detection of Microorganismson High-Resolution Petri\n Dish Images","summary":" Many medical or pharmaceutical processes have strict guidelines regarding\ncontinuous hygiene monitoring. This often involves the labor-intensive task of\nmanually counting microorganisms in Petri dishes by trained personnel.\nAutomation attempts often struggle due to major challenges: significant scaling\ndifferences, low separation, low contrast, etc. To address these challenges, we\nintroduce AttnPAFPN, a high-resolution detection pipeline that leverages a\nnovel transformer variation, the efficient-global self-attention mechanism. Our\nstreamlined approach can be easily integrated in almost any multi-scale object\ndetection pipeline. In a comprehensive evaluation on the publicly available\nAGAR dataset, we demonstrate the superior accuracy of our network over the\ncurrent state-of-the-art. 
In order to demonstrate the task-independent\nperformance of our approach, we perform further experiments on COCO and\nLIVECell datasets.\n","authors":["Nikolas Ebert","Didier Stricker","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2308.09436v1.pdf","comment":"This paper has been accepted at IEEE International Conference on\n Computer Vision Workshops (ICCV workshop), 2023"},{"id":"http://arxiv.org/abs/2308.09433v1","updated":"2023-08-18T10:07:17Z","published":"2023-08-18T10:07:17Z","title":"Can ultrasound confidence maps predict sonographers' labeling\n variability?","summary":" Measuring cross-sectional areas in ultrasound images is a standard tool to\nevaluate disease progress or treatment response. Often addressed today with\nsupervised deep-learning segmentation approaches, existing solutions highly\ndepend upon the quality of experts' annotations. However, the annotation\nquality in ultrasound is anisotropic and position-variant due to the inherent\nphysical imaging principles, including attenuation, shadows, and missing\nboundaries, commonly exacerbated with depth. This work proposes a novel\napproach that guides ultrasound segmentation networks to account for\nsonographers' uncertainties and generate predictions with variability similar\nto the experts. We claim that realistic variability can reduce overconfident\npredictions and improve physicians' acceptance of deep-learning cross-sectional\nsegmentation solutions. Our method provides CM's certainty for each pixel for\nminimal computational overhead as it can be precalculated directly from the\nimage. We show that there is a correlation between low values in the confidence\nmaps and expert's label uncertainty. Therefore, we propose to give the\nconfidence maps as additional information to the networks. We study the effect\nof the proposed use of ultrasound CMs in combination with four state-of-the-art\nneural networks and in two configurations: as a second input channel and as\npart of the loss. We evaluate our method on 3D ultrasound datasets of the\nthyroid and lower limb muscles. Our results show ultrasound CMs increase the\nDice score, improve the Hausdorff and Average Surface Distances, and decrease\nthe number of isolated pixel predictions. Furthermore, our findings suggest\nthat ultrasound CMs improve the penalization of uncertain areas in the ground\ntruth data, thereby improving problematic interpolations. Our code and example\ndata will be made public at\nhttps://github.com/IFL-CAMP/Confidence-segmentation.\n","authors":["Vanessa Gonzalez Duque","Leonhard Zirus","Yordanka Velikova","Nassir Navab","Diana Mateus"],"pdf_url":"https://arxiv.org/pdf/2308.09433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09426v1","updated":"2023-08-18T09:51:11Z","published":"2023-08-18T09:51:11Z","title":"Self-Supervised Single-Image Deconvolution with Siamese Neural Networks","summary":" Inverse problems in image reconstruction are fundamentally complicated by\nunknown noise properties. Classical iterative deconvolution approaches amplify\nnoise and require careful parameter selection for an optimal trade-off between\nsharpness and grain. Deep learning methods allow for flexible parametrization\nof the noise and learning its properties directly from the data. Recently,\nself-supervised blind-spot neural networks were successfully adopted for image\ndeconvolution by including a known point-spread function in the end-to-end\ntraining. 
However, their practical application has been limited to 2D images in\nthe biomedical domain because it implies large kernels that are poorly\noptimized. We tackle this problem with Fast Fourier Transform convolutions that\nprovide training speed-up in 3D microscopy deconvolution tasks. Further, we\npropose to adopt a Siamese invariance loss for deconvolution and empirically\nidentify its optimal position in the neural network between blind-spot and full\nimage branches. The experimental results show that our improved framework\noutperforms the previous state-of-the-art deconvolution methods with a known\npoint spread function.\n","authors":["Mikhail Papkov","Kaupo Palo","Leopold Parts"],"pdf_url":"https://arxiv.org/pdf/2308.09426v1.pdf","comment":"Accepted for DALI @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.09421v1","updated":"2023-08-18T09:39:52Z","published":"2023-08-18T09:39:52Z","title":"MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection","summary":" In the field of monocular 3D detection, it is common practice to utilize\nscene geometric clues to enhance the detector's performance. However, many\nexisting works adopt these clues explicitly such as estimating a depth map and\nback-projecting it into 3D space. This explicit methodology induces sparsity in\n3D representations due to the increased dimensionality from 2D to 3D, and leads\nto substantial information loss, especially for distant and occluded objects.\nTo alleviate this issue, we propose MonoNeRD, a novel detection framework that\ncan infer dense 3D geometry and occupancy. Specifically, we model scenes with\nSigned Distance Functions (SDF), facilitating the production of dense 3D\nrepresentations. We treat these representations as Neural Radiance Fields\n(NeRF) and then employ volume rendering to recover RGB images and depth maps.\nTo the best of our knowledge, this work is the first to introduce volume\nrendering for M3D, and demonstrates the potential of implicit reconstruction\nfor image-based 3D perception. Extensive experiments conducted on the KITTI-3D\nbenchmark and Waymo Open Dataset demonstrate the effectiveness of MonoNeRD.\nCodes are available at https://github.com/cskkxjk/MonoNeRD.\n","authors":["Junkai Xu","Liang Peng","Haoran Cheng","Hao Li","Wei Qian","Ke Li","Wenxiao Wang","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2308.09421v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09411v1","updated":"2023-08-18T09:23:55Z","published":"2023-08-18T09:23:55Z","title":"Metadata Improves Segmentation Through Multitasking Elicitation","summary":" Metainformation is a common companion to biomedical images. However, this\npotentially powerful additional source of signal from image acquisition has had\nlimited use in deep learning methods, for semantic segmentation in particular.\nHere, we incorporate metadata by employing a channel modulation mechanism in\nconvolutional networks and study its effect on semantic segmentation tasks. We\ndemonstrate that metadata as additional input to a convolutional network can\nimprove segmentation results while being inexpensive in implementation as a\nnimble add-on to popular models. We hypothesize that this benefit of metadata\ncan be attributed to facilitating multitask switching. 
This aspect of\nmetadata-driven systems is explored and discussed in detail.\n","authors":["Iaroslav Plutenko","Mikhail Papkov","Kaupo Palo","Leopold Parts","Dmytro Fishman"],"pdf_url":"https://arxiv.org/pdf/2308.09411v1.pdf","comment":"Accepted for DART @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2305.05189v3","updated":"2023-08-18T09:13:46Z","published":"2023-05-09T05:48:38Z","title":"SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with\n Large Language Models","summary":" Diffusion models, which have emerged to become popular text-to-image\ngeneration models, can produce high-quality and content-rich images guided by\ntextual prompts. However, there are limitations to semantic understanding and\ncommonsense reasoning in existing models when the input prompts are concise\nnarrative, resulting in low-quality image generation. To improve the capacities\nfor narrative prompts, we propose a simple-yet-effective parameter-efficient\nfine-tuning approach called the Semantic Understanding and Reasoning adapter\n(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first\ncollect and annotate a new dataset SURD which consists of more than 57,000\nsemantically corrected multi-modal samples. Each sample contains a simple\nnarrative prompt, a complex keyword-based prompt, and a high-quality image.\nThen, we align the semantic representation of narrative prompts to the complex\nprompts and transfer knowledge of large language models (LLMs) to our\nSUR-adapter via knowledge distillation so that it can acquire the powerful\nsemantic understanding and reasoning capabilities to build a high-quality\ntextual semantic representation for text-to-image generation. We conduct\nexperiments by integrating multiple LLMs and popular pre-trained diffusion\nmodels to show the effectiveness of our approach in enabling diffusion models\nto understand and reason concise natural language without image quality\ndegradation. Our approach can make text-to-image diffusion models easier to use\nwith better user experience, which demonstrates our approach has the potential\nfor further advancing the development of user-friendly text-to-image generation\nmodels by bridging the semantic gap between simple narrative prompts and\ncomplex keyword-based prompts. The code is released at\nhttps://github.com/Qrange-group/SUR-adapter.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Wushao Wen","Jinghui Qin","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.05189v3.pdf","comment":"accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.09391v1","updated":"2023-08-18T08:46:02Z","published":"2023-08-18T08:46:02Z","title":"Generalizable Decision Boundaries: Dualistic Meta-Learning for Open Set\n Domain Generalization","summary":" Domain generalization (DG) is proposed to deal with the issue of domain\nshift, which occurs when statistical differences exist between source and\ntarget domains. However, most current methods do not account for a common\nrealistic scenario where the source and target domains have different classes.\nTo overcome this deficiency, open set domain generalization (OSDG) then emerges\nas a more practical setting to recognize unseen classes in unseen domains. An\nintuitive approach is to use multiple one-vs-all classifiers to define decision\nboundaries for each class and reject the outliers as unknown. 
However, the\nsignificant class imbalance between positive and negative samples often causes\nthe boundaries biased towards positive ones, resulting in misclassification for\nknown samples in the unseen target domain. In this paper, we propose a novel\nmeta-learning-based framework called dualistic MEta-learning with joint\nDomaIn-Class matching (MEDIC), which considers gradient matching towards\ninter-domain and inter-class splits simultaneously to find a generalizable\nboundary balanced for all tasks. Experimental results demonstrate that MEDIC\nnot only outperforms previous methods in open set scenarios, but also maintains\ncompetitive close set generalization ability at the same time. Our code is\navailable at https://github.com/zzwdx/MEDIC.\n","authors":["Xiran Wang","Jian Zhang","Lei Qi","Yinghuan Shi"],"pdf_url":"https://arxiv.org/pdf/2308.09391v1.pdf","comment":"10 pages, 5 figures, accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.09388v1","updated":"2023-08-18T08:40:38Z","published":"2023-08-18T08:40:38Z","title":"Diffusion Models for Image Restoration and Enhancement -- A\n Comprehensive Survey","summary":" Image restoration (IR) has been an indispensable and challenging task in the\nlow-level vision field, which strives to improve the subjective quality of\nimages distorted by various forms of degradation. Recently, the diffusion model\nhas achieved significant advancements in the visual generation of AIGC, thereby\nraising an intuitive question, \"whether diffusion model can boost image\nrestoration\". To answer this, some pioneering studies attempt to integrate\ndiffusion models into the image restoration task, resulting in superior\nperformances than previous GAN-based methods. Despite that, a comprehensive and\nenlightening survey on diffusion model-based image restoration remains scarce.\nIn this paper, we are the first to present a comprehensive review of recent\ndiffusion model-based methods on image restoration, encompassing the learning\nparadigm, conditional strategy, framework design, modeling strategy, and\nevaluation. Concretely, we first introduce the background of the diffusion\nmodel briefly and then present two prevalent workflows that exploit diffusion\nmodels in image restoration. Subsequently, we classify and emphasize the\ninnovative designs using diffusion models for both IR and blind/real-world IR,\nintending to inspire future development. To evaluate existing methods\nthoroughly, we summarize the commonly-used dataset, implementation details, and\nevaluation metrics. Additionally, we present the objective comparison for\nopen-sourced methods across three tasks, including image super-resolution,\ndeblurring, and inpainting. 
Ultimately, informed by the limitations in existing\nworks, we propose five potential and challenging directions for the future\nresearch of diffusion model-based IR, including sampling efficiency, model\ncompression, distortion simulation and estimation, distortion invariant\nlearning, and framework design.\n","authors":["Xin Li","Yulin Ren","Xin Jin","Cuiling Lan","Xingrui Wang","Wenjun Zeng","Xinchao Wang","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2308.09388v1.pdf","comment":"34 pages"},{"id":"http://arxiv.org/abs/2308.09386v1","updated":"2023-08-18T08:37:49Z","published":"2023-08-18T08:37:49Z","title":"DReg-NeRF: Deep Registration for Neural Radiance Fields","summary":" Although Neural Radiance Fields (NeRF) is popular in the computer vision\ncommunity recently, registering multiple NeRFs has yet to gain much attention.\nUnlike the existing work, NeRF2NeRF, which is based on traditional optimization\nmethods and needs human annotated keypoints, we propose DReg-NeRF to solve the\nNeRF registration problem on object-centric scenes without human intervention.\nAfter training NeRF models, our DReg-NeRF first extracts features from the\noccupancy grid in NeRF. Subsequently, our DReg-NeRF utilizes a transformer\narchitecture with self-attention and cross-attention layers to learn the\nrelations between pairwise NeRF blocks. In contrast to state-of-the-art (SOTA)\npoint cloud registration methods, the decoupled correspondences are supervised\nby surface fields without any ground truth overlapping labels. We construct a\nnovel view synthesis dataset with 1,700+ 3D objects obtained from Objaverse to\ntrain our network. When evaluated on the test set, our proposed method beats\nthe SOTA point cloud registration methods by a large margin, with a mean\n$\\text{RPE}=9.67^{\\circ}$ and a mean $\\text{RTE}=0.038$.\n Our code is available at https://github.com/AIBluefisher/DReg-NeRF.\n","authors":["Yu Chen","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2308.09386v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.01090v2","updated":"2023-08-18T08:31:30Z","published":"2023-07-03T15:09:10Z","title":"Streamlined Lensed Quasar Identification in Multiband Images via\n Ensemble Networks","summary":" Quasars experiencing strong lensing offer unique viewpoints on subjects\nrelated to the cosmic expansion rate, the dark matter profile within the\nforeground deflectors, and the quasar host galaxies. Unfortunately, identifying\nthem in astronomical images is challenging since they are overwhelmed by the\nabundance of non-lenses. To address this, we have developed a novel approach by\nensembling cutting-edge convolutional networks (CNNs) -- for instance, ResNet,\nInception, NASNet, MobileNet, EfficientNet, and RegNet -- along with vision\ntransformers (ViTs) trained on realistic galaxy-quasar lens simulations based\non the Hyper Suprime-Cam (HSC) multiband images. While the individual model\nexhibits remarkable performance when evaluated against the test dataset,\nachieving an area under the receiver operating characteristic curve of $>$97.3%\nand a median false positive rate of 3.6%, it struggles to generalize in real\ndata, indicated by numerous spurious sources picked by each classifier. A\nsignificant improvement is achieved by averaging these CNNs and ViTs, resulting\nin the impurities being downsized by factors up to 50. 
Subsequently, combining\nthe HSC images with the UKIRT, VISTA, and unWISE data, we retrieve\napproximately 60 million sources as parent samples and reduce this to 892,609\nafter employing a photometry preselection to discover $z>1.5$ lensed quasars\nwith Einstein radii of $\\theta_\\mathrm{E}<5$ arcsec. Afterward, the ensemble\nclassifier indicates 3080 sources with a high probability of being lenses, for\nwhich we visually inspect, yielding 210 prevailing candidates awaiting\nspectroscopic confirmation. These outcomes suggest that automated deep learning\npipelines hold great potential in effectively detecting strong lenses in vast\ndatasets with minimal manual visual inspection involved.\n","authors":["Irham Taufik Andika","Sherry H. Suyu","Raoul Cañameras","Alejandra Melo","Stefan Schuldt","Yiping Shu","Anna-Christina Eilers","Anton Timur Jaelani","Minghao Yue"],"pdf_url":"https://arxiv.org/pdf/2307.01090v2.pdf","comment":"Accepted for publication in the Astronomy & Astrophysics journal. 28\n pages, 11 figures, and 3 tables. We welcome comments from the reader"},{"id":"http://arxiv.org/abs/2306.05888v2","updated":"2023-08-18T08:31:15Z","published":"2023-06-09T13:31:50Z","title":"TrajectoryFormer: 3D Object Tracking Transformer with Predictive\n Trajectory Hypotheses","summary":" 3D multi-object tracking (MOT) is vital for many applications including\nautonomous driving vehicles and service robots. With the commonly used\ntracking-by-detection paradigm, 3D MOT has made important progress in recent\nyears. However, these methods only use the detection boxes of the current frame\nto obtain trajectory-box association results, which makes it impossible for the\ntracker to recover objects missed by the detector. In this paper, we present\nTrajectoryFormer, a novel point-cloud-based 3D MOT framework. To recover the\nmissed object by detector, we generates multiple trajectory hypotheses with\nhybrid candidate boxes, including temporally predicted boxes and current-frame\ndetection boxes, for trajectory-box association. The predicted boxes can\npropagate object's history trajectory information to the current frame and thus\nthe network can tolerate short-term miss detection of the tracked objects. We\ncombine long-term object motion feature and short-term object appearance\nfeature to create per-hypothesis feature embedding, which reduces the\ncomputational overhead for spatial-temporal encoding. Additionally, we\nintroduce a Global-Local Interaction Module to conduct information interaction\namong all hypotheses and models their spatial relations, leading to accurate\nestimation of hypotheses. Our TrajectoryFormer achieves state-of-the-art\nperformance on the Waymo 3D MOT benchmarks. Code is available at\nhttps://github.com/poodarchu/EFG .\n","authors":["Xuesong Chen","Shaoshuai Shi","Chao Zhang","Benjin Zhu","Qiang Wang","Ka Chun Cheung","Simon See","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2306.05888v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09383v1","updated":"2023-08-18T08:28:17Z","published":"2023-08-18T08:28:17Z","title":"Label-Free Event-based Object Recognition via Joint Learning with Image\n Reconstruction from Events","summary":" Recognizing objects from sparse and noisy events becomes extremely difficult\nwhen paired images and category labels do not exist. In this paper, we study\nlabel-free event-based object recognition where category labels and paired\nimages are not available. 
To this end, we propose a joint formulation of object\nrecognition and image reconstruction in a complementary manner. Our method\nfirst reconstructs images from events and performs object recognition through\nContrastive Language-Image Pre-training (CLIP), enabling better recognition\nthrough a rich context of images. Since the category information is essential\nin reconstructing images, we propose category-guided attraction loss and\ncategory-agnostic repulsion loss to bridge the textual features of predicted\ncategories and the visual features of reconstructed images using CLIP.\nMoreover, we introduce a reliable data sampling strategy and local-global\nreconstruction consistency to boost joint learning of two tasks. To enhance the\naccuracy of prediction and quality of reconstruction, we also propose a\nprototype-based approach using unpaired images. Extensive experiments\ndemonstrate the superiority of our method and its extensibility for zero-shot\nobject recognition. Our project code is available at\n\\url{https://github.com/Chohoonhee/Ev-LaFOR}.\n","authors":["Hoonhee Cho","Hyeonseong Kim","Yujeong Chae","Kuk-Jin Yoon"],"pdf_url":"https://arxiv.org/pdf/2308.09383v1.pdf","comment":"Accepted to ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.09380v1","updated":"2023-08-18T08:23:47Z","published":"2023-08-18T08:23:47Z","title":"Deciphering knee osteoarthritis diagnostic features with explainable\n artificial intelligence: A systematic review","summary":" Existing artificial intelligence (AI) models for diagnosing knee\nosteoarthritis (OA) have faced criticism for their lack of transparency and\ninterpretability, despite achieving medical-expert-like performance. This\nopacity makes them challenging to trust in clinical practice. Recently,\nexplainable artificial intelligence (XAI) has emerged as a specialized\ntechnique that can provide confidence in the model's prediction by revealing\nhow the prediction is derived, thus promoting the use of AI systems in\nhealthcare. This paper presents the first survey of XAI techniques used for\nknee OA diagnosis. The XAI techniques are discussed from two perspectives: data\ninterpretability and model interpretability. The aim of this paper is to\nprovide valuable insights into XAI's potential towards a more reliable knee OA\ndiagnosis approach and encourage its adoption in clinical practice.\n","authors":["Yun Xin Teoh","Alice Othmani","Siew Li Goh","Juliana Usman","Khin Wee Lai"],"pdf_url":"https://arxiv.org/pdf/2308.09380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.20087v3","updated":"2023-08-18T08:20:06Z","published":"2023-05-31T17:59:03Z","title":"Too Large; Data Reduction for Vision-Language Pre-Training","summary":" This paper examines the problems of severe image-text misalignment and high\nredundancy in the widely-used large-scale Vision-Language Pre-Training (VLP)\ndatasets. To address these issues, we propose an efficient and straightforward\nVision-Language learning algorithm called TL;DR, which aims to compress the\nexisting large VLP data into a small, high-quality set. Our approach consists\nof two major steps. First, a codebook-based encoder-decoder captioner is\ndeveloped to select representative samples. Second, a new caption is generated\nto complement the original captions for selected samples, mitigating the\ntext-image misalignment problem while maintaining uniqueness. 
As the result,\nTL;DR enables us to reduce the large dataset into a small set of high-quality\ndata, which can serve as an alternative pre-training dataset. This algorithm\nsignificantly speeds up the time-consuming pretraining process. Specifically,\nTL;DR can compress the mainstream VLP datasets at a high ratio, e.g., reduce\nwell-cleaned CC3M dataset from 2.82M to 0.67M ($\\sim$24\\%) and noisy YFCC15M\nfrom 15M to 2.5M ($\\sim$16.7\\%). Extensive experiments with three popular VLP\nmodels over seven downstream tasks show that VLP model trained on the\ncompressed dataset provided by TL;DR can perform similar or even better results\ncompared with training on the full-scale dataset. The code will be made\navailable at \\url{https://github.com/showlab/datacentric.vlp}.\n","authors":["Alex Jinpeng Wang","Kevin Qinghong Lin","David Junhao Zhang","Stan Weixian Lei","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2305.20087v3.pdf","comment":"ICCV2023. Code: https://github.com/showlab/datacentric.vlp"},{"id":"http://arxiv.org/abs/2308.05721v3","updated":"2023-08-18T08:15:25Z","published":"2023-08-10T17:37:49Z","title":"Deformable Mixer Transformer with Gating for Multi-Task Learning of\n Dense Prediction","summary":" CNNs and Transformers have their own advantages and both have been widely\nused for dense prediction in multi-task learning (MTL). Most of the current\nstudies on MTL solely rely on CNN or Transformer. In this work, we present a\nnovel MTL model by combining both merits of deformable CNN and query-based\nTransformer with shared gating for multi-task learning of dense prediction.\nThis combination may offer a simple and efficient solution owing to its\npowerful and flexible task-specific learning and advantages of lower cost, less\ncomplexity and smaller parameters than the traditional MTL methods. We\nintroduce deformable mixer Transformer with gating (DeMTG), a simple and\neffective encoder-decoder architecture up-to-date that incorporates the\nconvolution and attention mechanism in a unified network for MTL. It is\nexquisitely designed to use advantages of each block, and provide deformable\nand comprehensive features for all tasks from local and global perspective.\nFirst, the deformable mixer encoder contains two types of operators: the\nchannel-aware mixing operator leveraged to allow communication among different\nchannels, and the spatial-aware deformable operator with deformable convolution\napplied to efficiently sample more informative spatial locations. Second, the\ntask-aware gating transformer decoder is used to perform the task-specific\npredictions, in which task interaction block integrated with self-attention is\napplied to capture task interaction features, and the task query block\nintegrated with gating attention is leveraged to select corresponding\ntask-specific features. Further, the experiment results demonstrate that the\nproposed DeMTG uses fewer GFLOPs and significantly outperforms current\nTransformer-based and CNN-based competitive models on a variety of metrics on\nthree dense prediction datasets. 
Our code and models are available at\nhttps://github.com/yangyangxu0/DeMTG.\n","authors":["Yangyang Xu","Yibo Yang","Bernard Ghanemm","Lefei Zhang","Du Bo","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.05721v3.pdf","comment":"submitted to IJCV; an extension to our previous AAAI 2023 paper\n arXiv:2301.03461"},{"id":"http://arxiv.org/abs/2301.08433v2","updated":"2023-08-18T08:11:15Z","published":"2023-01-20T06:11:17Z","title":"Unsupervised Light Field Depth Estimation via Multi-view Feature\n Matching with Occlusion Prediction","summary":" Depth estimation from light field (LF) images is a fundamental step for\nnumerous applications. Recently, learning-based methods have achieved higher\naccuracy and efficiency than the traditional methods. However, it is costly to\nobtain sufficient depth labels for supervised training. In this paper, we\npropose an unsupervised framework to estimate depth from LF images. First, we\ndesign a disparity estimation network (DispNet) with a coarse-to-fine structure\nto predict disparity maps from different view combinations. It explicitly\nperforms multi-view feature matching to learn the correspondences effectively.\nAs occlusions may cause the violation of photo-consistency, we introduce an\nocclusion prediction network (OccNet) to predict the occlusion maps, which are\nused as the element-wise weights of photometric loss to solve the occlusion\nissue and assist the disparity learning. With the disparity maps estimated by\nmultiple input combinations, we then propose a disparity fusion strategy based\non the estimated errors with effective occlusion handling to obtain the final\ndisparity map with higher accuracy. Experimental results demonstrate that our\nmethod achieves superior performance on both the dense and sparse LF images,\nand also shows better robustness and generalization on the real-world LF images\ncompared to the other methods.\n","authors":["Shansi Zhang","Nan Meng","Edmund Y. Lam"],"pdf_url":"https://arxiv.org/pdf/2301.08433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09375v1","updated":"2023-08-18T08:10:41Z","published":"2023-08-18T08:10:41Z","title":"Image Processing and Machine Learning for Hyperspectral Unmixing: An\n Overview and the HySUPP Python Package","summary":" Spectral pixels are often a mixture of the pure spectra of the materials,\ncalled endmembers, due to the low spatial resolution of hyperspectral sensors,\ndouble scattering, and intimate mixtures of materials in the scenes. Unmixing\nestimates the fractional abundances of the endmembers within the pixel.\nDepending on the prior knowledge of endmembers, linear unmixing can be divided\ninto three main groups: supervised, semi-supervised, and unsupervised (blind)\nlinear unmixing. Advances in Image processing and machine learning\nsubstantially affected unmixing. This paper provides an overview of advanced\nand conventional unmixing approaches. Additionally, we draw a critical\ncomparison between advanced and conventional techniques from the three\ncategories. We compare the performance of the unmixing techniques on three\nsimulated and two real datasets. The experimental results reveal the advantages\nof different unmixing categories for different unmixing scenarios. 
Moreover, we\nprovide an open-source Python-based package available at\nhttps://github.com/BehnoodRasti/HySUPP to reproduce the results.\n","authors":["Behnood Rasti","Alexandre Zouaoui","Julien Mairal","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2308.09375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09372v1","updated":"2023-08-18T08:06:49Z","published":"2023-08-18T08:06:49Z","title":"Which Transformer to Favor: A Comparative Analysis of Efficiency in\n Vision Transformers","summary":" The growing popularity of Vision Transformers as the go-to models for image\nclassification has led to an explosion of architectural modifications claiming\nto be more efficient than the original ViT. However, a wide diversity of\nexperimental conditions prevents a fair comparison between all of them, based\nsolely on their reported results. To address this gap in comparability, we\nconduct a comprehensive analysis of more than 30 models to evaluate the\nefficiency of vision transformers and related architectures, considering\nvarious performance metrics. Our benchmark provides a comparable baseline\nacross the landscape of efficiency-oriented transformers, unveiling a plethora\nof surprising insights. For example, we discover that ViT is still Pareto\noptimal across multiple efficiency metrics, despite the existence of several\nalternative approaches claiming to be more efficient. Results also indicate\nthat hybrid attention-CNN models fare particularly well when it comes to low\ninference memory and number of parameters, and also that it is better to scale\nthe model size, than the image size. Furthermore, we uncover a strong positive\ncorrelation between the number of FLOPS and the training memory, which enables\nthe estimation of required VRAM from theoretical measurements alone.\n Thanks to our holistic evaluation, this study offers valuable insights for\npractitioners and researchers, facilitating informed decisions when selecting\nmodels for specific applications. We publicly release our code and data at\nhttps://github.com/tobna/WhatTransformerToFavor\n","authors":["Tobias Christian Nauen","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.09372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09369v1","updated":"2023-08-18T08:06:18Z","published":"2023-08-18T08:06:18Z","title":"Single Frame Semantic Segmentation Using Multi-Modal Spherical Images","summary":" In recent years, the research community has shown a lot of interest to\npanoramic images that offer a 360-degree directional perspective. Multiple data\nmodalities can be fed, and complimentary characteristics can be utilized for\nmore robust and rich scene interpretation based on semantic segmentation, to\nfully realize the potential. Existing research, however, mostly concentrated on\npinhole RGB-X semantic segmentation. In this study, we propose a\ntransformer-based cross-modal fusion architecture to bridge the gap between\nmulti-modal fusion and omnidirectional scene perception. We employ\ndistortion-aware modules to address extreme object deformations and panorama\ndistortions that result from equirectangular representation. Additionally, we\nconduct cross-modal interactions for feature rectification and information\nexchange before merging the features in order to communicate long-range\ncontexts for bi-modal and tri-modal feature streams. 
In thorough tests using\ncombinations of four different modality types in three indoor panoramic-view\ndatasets, our technique achieved state-of-the-art mIoU performance: 60.60% on\nStanford2D3DS (RGB-HHA), 71.97% Structured3D (RGB-D-N), and 35.92% Matterport3D\n(RGB-D). We plan to release all codes and trained models soon.\n","authors":["Suresh Guttikonda","Jason Rambach"],"pdf_url":"https://arxiv.org/pdf/2308.09369v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2308.09368v1","updated":"2023-08-18T08:02:52Z","published":"2023-08-18T08:02:52Z","title":"A tailored Handwritten-Text-Recognition System for Medieval Latin","summary":" The Bavarian Academy of Sciences and Humanities aims to digitize its Medieval\nLatin Dictionary. This dictionary entails record cards referring to lemmas in\nmedieval Latin, a low-resource language. A crucial step of the digitization\nprocess is the Handwritten Text Recognition (HTR) of the handwritten lemmas\nfound on these record cards. In our work, we introduce an end-to-end pipeline,\ntailored to the medieval Latin dictionary, for locating, extracting, and\ntranscribing the lemmas. We employ two state-of-the-art (SOTA) image\nsegmentation models to prepare the initial data set for the HTR task.\nFurthermore, we experiment with different transformer-based models and conduct\na set of experiments to explore the capabilities of different combinations of\nvision encoders with a GPT-2 decoder. Additionally, we also apply extensive\ndata augmentation resulting in a highly competitive model. The best-performing\nsetup achieved a Character Error Rate (CER) of 0.015, which is even superior to\nthe commercial Google Cloud Vision model, and shows more stable performance.\n","authors":["Philipp Koch","Gilary Vera Nuñez","Esteban Garces Arias","Christian Heumann","Matthias Schöffel","Alexander Häberlin","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2308.09368v1.pdf","comment":"This paper has been accepted at the First Workshop on Ancient\n Language Processing, co-located with RANLP 2023. This is the author's version\n of the work. The definite version of record will be published in the\n proceedings"},{"id":"http://arxiv.org/abs/2307.16715v2","updated":"2023-08-18T07:56:32Z","published":"2023-07-31T14:34:49Z","title":"UniVTG: Towards Unified Video-Language Temporal Grounding","summary":" Video Temporal Grounding (VTG), which aims to ground target clips from videos\n(such as consecutive intervals or disjoint shots) according to custom language\nqueries (e.g., sentences or words), is key for video browsing on social media.\nMost methods in this direction develop taskspecific models that are trained\nwith type-specific labels, such as moment retrieval (time interval) and\nhighlight detection (worthiness curve), which limits their abilities to\ngeneralize to various VTG tasks and labels. In this paper, we propose to Unify\nthe diverse VTG labels and tasks, dubbed UniVTG, along three directions:\nFirstly, we revisit a wide range of VTG labels and tasks and define a unified\nformulation. Based on this, we develop data annotation schemes to create\nscalable pseudo supervision. Secondly, we develop an effective and flexible\ngrounding model capable of addressing each task and making full use of each\nlabel. Lastly, thanks to the unified framework, we are able to unlock temporal\ngrounding pretraining from large-scale diverse labels and develop stronger\ngrounding abilities e.g., zero-shot grounding. 
Extensive experiments on three\ntasks (moment retrieval, highlight detection and video summarization) across\nseven datasets (QVHighlights, Charades-STA, TACoS, Ego4D, YouTube Highlights,\nTVSum, and QFVS) demonstrate the effectiveness and flexibility of our proposed\nframework. The codes are available at https://github.com/showlab/UniVTG.\n","authors":["Kevin Qinghong Lin","Pengchuan Zhang","Joya Chen","Shraman Pramanick","Difei Gao","Alex Jinpeng Wang","Rui Yan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2307.16715v2.pdf","comment":"Accepted by ICCV 2023. 16 pages, 10 figures, 13 tables. Code:\n https://github.com/showlab/UniVTG"},{"id":"http://arxiv.org/abs/2308.09364v1","updated":"2023-08-18T07:47:22Z","published":"2023-08-18T07:47:22Z","title":"Overlap Bias Matching is Necessary for Point Cloud Registration","summary":" Point cloud registration is a fundamental problem in many domains.\nPractically, the overlap between point clouds to be registered may be\nrelatively small. Most unsupervised methods lack effective initial evaluation\nof overlap, leading to suboptimal registration accuracy. To address this issue,\nwe propose an unsupervised network Overlap Bias Matching Network (OBMNet) for\npartial point cloud registration. Specifically, we propose a plug-and-play\nOverlap Bias Matching Module (OBMM) comprising two integral components, overlap\nsampling module and bias prediction module. These two components are utilized\nto capture the distribution of overlapping regions and predict bias\ncoefficients of point cloud common structures, respectively. Then, we integrate\nOBMM with the neighbor map matching module to robustly identify correspondences\nby precisely merging matching scores of points within the neighborhood, which\naddresses the ambiguities in single-point features. OBMNet can maintain\nefficacy even in pair-wise registration scenarios with low overlap ratios.\nExperimental results on extensive datasets demonstrate that our approach's\nperformance achieves a significant improvement compared to the state-of-the-art\nregistration approach.\n","authors":["Pengcheng Shi","Jie Zhang","Haozhe Cheng","Junyang Wang","Yiyang Zhou","Chenlin Zhao","Jihua Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.09364v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2202.11292 by other authors"},{"id":"http://arxiv.org/abs/2308.09363v1","updated":"2023-08-18T07:45:10Z","published":"2023-08-18T07:45:10Z","title":"Open-vocabulary Video Question Answering: A New Benchmark for Evaluating\n the Generalizability of Video Question Answering Models","summary":" Video Question Answering (VideoQA) is a challenging task that entails complex\nmulti-modal reasoning. In contrast to multiple-choice VideoQA which aims to\npredict the answer given several options, the goal of open-ended VideoQA is to\nanswer questions without restricting candidate answers. However, the majority\nof previous VideoQA models formulate open-ended VideoQA as a classification\ntask to classify the video-question pairs into a fixed answer set, i.e.,\nclosed-vocabulary, which contains only frequent answers (e.g., top-1000\nanswers). This leads the model to be biased toward only frequent answers and\nfail to generalize on out-of-vocabulary answers. We hence propose a new\nbenchmark, Open-vocabulary Video Question Answering (OVQA), to measure the\ngeneralizability of VideoQA models by considering rare and unseen answers. 
In\naddition, in order to improve the model's generalization power, we introduce a\nnovel GNN-based soft verbalizer that enhances the prediction on rare and unseen\nanswers by aggregating the information from their similar words. For\nevaluation, we introduce new baselines by modifying the existing\n(closed-vocabulary) open-ended VideoQA models and improve their performances by\nfurther taking into account rare and unseen answers. Our ablation studies and\nqualitative analyses demonstrate that our GNN-based soft verbalizer further\nimproves the model performance, especially on rare and unseen answers. We hope\nthat our benchmark OVQA can serve as a guide for evaluating the\ngeneralizability of VideoQA models and inspire future research. Code is\navailable at https://github.com/mlvlab/OVQA.\n","authors":["Dohwan Ko","Ji Soo Lee","Miso Choi","Jaewon Chu","Jihwan Park","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09363v1.pdf","comment":"Accepted paper at ICCV 2023"},{"id":"http://arxiv.org/abs/2306.05178v2","updated":"2023-08-18T07:38:57Z","published":"2023-06-08T13:18:23Z","title":"SyncDiffusion: Coherent Montage via Synchronized Joint Diffusions","summary":" The remarkable capabilities of pretrained image diffusion models have been\nutilized not only for generating fixed-size images but also for creating\npanoramas. However, naive stitching of multiple images often results in visible\nseams. Recent techniques have attempted to address this issue by performing\njoint diffusions in multiple windows and averaging latent features in\noverlapping regions. However, these approaches, which focus on seamless montage\ngeneration, often yield incoherent outputs by blending different scenes within\na single image. To overcome this limitation, we propose SyncDiffusion, a\nplug-and-play module that synchronizes multiple diffusions through gradient\ndescent from a perceptual similarity loss. Specifically, we compute the\ngradient of the perceptual loss using the predicted denoised images at each\ndenoising step, providing meaningful guidance for achieving coherent montages.\nOur experimental results demonstrate that our method produces significantly\nmore coherent outputs compared to previous methods (66.35% vs. 33.65% in our\nuser study) while still maintaining fidelity (as assessed by GIQA) and\ncompatibility with the input prompt (as measured by CLIP score).\n","authors":["Yuseung Lee","Kunho Kim","Hyunjin Kim","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2306.05178v2.pdf","comment":"Project page: https://syncdiffusion.github.io"},{"id":"http://arxiv.org/abs/2308.09357v1","updated":"2023-08-18T07:38:30Z","published":"2023-08-18T07:38:30Z","title":"Multi-scale Target-Aware Framework for Constrained Image Splicing\n Detection and Localization","summary":" Constrained image splicing detection and localization (CISDL) is a\nfundamental task of multimedia forensics, which detects splicing operation\nbetween two suspected images and localizes the spliced region on both images.\nRecent works regard it as a deep matching problem and have made significant\nprogress. However, existing frameworks typically perform feature extraction and\ncorrelation matching as separate processes, which may hinder the model's\nability to learn discriminative features for matching and can be susceptible to\ninterference from ambiguous background pixels. In this work, we propose a\nmulti-scale target-aware framework to couple feature extraction and correlation\nmatching in a unified pipeline. 
In contrast to previous methods, we design a\ntarget-aware attention mechanism that jointly learns features and performs\ncorrelation matching between the probe and donor images. Our approach can\neffectively promote the collaborative learning of related patches, and perform\nmutual promotion of feature learning and correlation matching. Additionally, in\norder to handle scale transformations, we introduce a multi-scale projection\nmethod, which can be readily integrated into our target-aware framework that\nenables the attention process to be conducted between tokens containing\ninformation of varying scales. Our experiments demonstrate that our model,\nwhich uses a unified pipeline, outperforms state-of-the-art methods on several\nbenchmark datasets and is robust against scale transformations.\n","authors":["Yuxuan Tan","Yuanman Li","Limin Zeng","Jiaxiong Ye","Wei wang","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2308.09357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09906v3","updated":"2023-08-18T07:29:19Z","published":"2023-07-19T11:10:26Z","title":"Implicit Identity Representation Conditioned Memory Compensation Network\n for Talking Head video Generation","summary":" Talking head video generation aims to animate a human face in a still image\nwith dynamic poses and expressions using motion information derived from a\ntarget-driving video, while maintaining the person's identity in the source\nimage. However, dramatic and complex motions in the driving video cause\nambiguous generation, because the still source image cannot provide sufficient\nappearance information for occluded regions or delicate expression variations,\nwhich produces severe artifacts and significantly degrades the generation\nquality. To tackle this problem, we propose to learn a global facial\nrepresentation space, and design a novel implicit identity representation\nconditioned memory compensation network, coined as MCNet, for high-fidelity\ntalking head generation.~Specifically, we devise a network module to learn a\nunified spatial facial meta-memory bank from all training samples, which can\nprovide rich facial structure and appearance priors to compensate warped source\nfacial features for the generation. Furthermore, we propose an effective query\nmechanism based on implicit identity representations learned from the discrete\nkeypoints of the source image. It can greatly facilitate the retrieval of more\ncorrelated information from the memory bank for the compensation. Extensive\nexperiments demonstrate that MCNet can learn representative and complementary\nfacial memory, and can clearly outperform previous state-of-the-art talking\nhead generation methods on VoxCeleb1 and CelebV datasets. Please check our\n\\href{https://github.com/harlanhong/ICCV2023-MCNET}{Project}.\n","authors":["Fa-Ting Hong","Dan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.09906v3.pdf","comment":"Accepted by ICCV2023, update the reference and figures"},{"id":"http://arxiv.org/abs/2308.09351v1","updated":"2023-08-18T07:17:09Z","published":"2023-08-18T07:17:09Z","title":"RLIPv2: Fast Scaling of Relational Language-Image Pre-training","summary":" Relational Language-Image Pre-training (RLIP) aims to align vision\nrepresentations with relational texts, thereby advancing the capability of\nrelational reasoning in computer vision tasks. However, hindered by the slow\nconvergence of RLIPv1 architecture and the limited availability of existing\nscene graph data, scaling RLIPv1 is challenging. 
In this paper, we propose\nRLIPv2, a fast converging model that enables the scaling of relational\npre-training to large-scale pseudo-labelled scene graph data. To enable fast\nscaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism\nthat facilitates earlier and deeper gated cross-modal fusion with sparsified\nlanguage encoding layers. ALIF leads to comparable or better performance than\nRLIPv1 in a fraction of the time for pre-training and fine-tuning. To obtain\nscene graph data at scale, we extend object detection datasets with free-form\nrelation labels by introducing a captioner (e.g., BLIP) and a designed Relation\nTagger. The Relation Tagger assigns BLIP-generated relation texts to region\npairs, thus enabling larger-scale relational pre-training. Through extensive\nexperiments conducted on Human-Object Interaction Detection and Scene Graph\nGeneration, RLIPv2 shows state-of-the-art performance on three benchmarks under\nfully-finetuning, few-shot and zero-shot settings. Notably, the largest RLIPv2\nachieves 23.29mAP on HICO-DET without any fine-tuning, yields 32.22mAP with\njust 1% data and yields 45.09mAP with 100% data. Code and models are publicly\navailable at https://github.com/JacobYuan7/RLIPv2.\n","authors":["Hangjie Yuan","Shiwei Zhang","Xiang Wang","Samuel Albanie","Yining Pan","Tao Feng","Jianwen Jiang","Dong Ni","Yingya Zhang","Deli Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.09351v1.pdf","comment":"Accepted to ICCV 2023. Code and models:\n https://github.com/JacobYuan7/RLIPv2"},{"id":"http://arxiv.org/abs/2308.09346v1","updated":"2023-08-18T07:07:36Z","published":"2023-08-18T07:07:36Z","title":"Boosting Few-shot Action Recognition with Graph-guided Hybrid Matching","summary":" Class prototype construction and matching are core aspects of few-shot action\nrecognition. Previous methods mainly focus on designing spatiotemporal relation\nmodeling modules or complex temporal alignment algorithms. Despite the\npromising results, they ignored the value of class prototype construction and\nmatching, leading to unsatisfactory performance in recognizing similar\ncategories in every task. In this paper, we propose GgHM, a new framework with\nGraph-guided Hybrid Matching. Concretely, we learn task-oriented features by\nthe guidance of a graph neural network during class prototype construction,\noptimizing the intra- and inter-class feature correlation explicitly. Next, we\ndesign a hybrid matching strategy, combining frame-level and tuple-level\nmatching to classify videos with multivariate styles. We additionally propose a\nlearnable dense temporal modeling module to enhance the video feature temporal\nrepresentation to build a more solid foundation for the matching process. GgHM\nshows consistent improvements over other challenging baselines on several\nfew-shot datasets, demonstrating the effectiveness of our method. 
The code will\nbe publicly available at https://github.com/jiazheng-xing/GgHM.\n","authors":["Jiazheng Xing","Mengmeng Wang","Yudi Ruan","Bofan Chen","Yaowei Guo","Boyu Mu","Guang Dai","Jingdong Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09346v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.09345v1","updated":"2023-08-18T07:07:15Z","published":"2023-08-18T07:07:15Z","title":"Denoising diffusion-based MR to CT image translation enables whole spine\n vertebral segmentation in 2D and 3D without manual annotations","summary":" Background: Automated segmentation of spinal MR images plays a vital role\nboth scientifically and clinically. However, accurately delineating posterior\nspine structures presents challenges.\n Methods: This retrospective study, approved by the ethical committee,\ninvolved translating T1w and T2w MR image series into CT images in a total of\nn=263 pairs of CT/MR series. Landmark-based registration was performed to align\nimage pairs. We compared 2D paired (Pix2Pix, denoising diffusion implicit\nmodels (DDIM) image mode, DDIM noise mode) and unpaired (contrastive unpaired\ntranslation, SynDiff) image-to-image translation using \"peak signal to noise\nratio\" (PSNR) as quality measure. A publicly available segmentation network\nsegmented the synthesized CT datasets, and Dice scores were evaluated on\nin-house test sets and the \"MRSpineSeg Challenge\" volumes. The 2D findings were\nextended to 3D Pix2Pix and DDIM.\n Results: 2D paired methods and SynDiff exhibited similar translation\nperformance and Dice scores on paired data. DDIM image mode achieved the\nhighest image quality. SynDiff, Pix2Pix, and DDIM image mode demonstrated\nsimilar Dice scores (0.77). For craniocaudal axis rotations, at least two\nlandmarks per vertebra were required for registration. The 3D translation\noutperformed the 2D approach, resulting in improved Dice scores (0.80) and\nanatomically accurate segmentations in a higher resolution than the original MR\nimage.\n Conclusion: Two landmarks per vertebra registration enabled paired\nimage-to-image translation from MR to CT and outperformed all unpaired\napproaches. The 3D techniques provided anatomically correct segmentations,\navoiding underprediction of small structures like the spinous process.\n","authors":["Robert Graf","Joachim Schmitt","Sarah Schlaeger","Hendrik Kristian Möller","Vasiliki Sideri-Lampretsa","Anjany Sekuboyina","Sandro Manuel Krieg","Benedikt Wiestler","Bjoern Menze","Daniel Rueckert","Jan Stefan Kirschke"],"pdf_url":"https://arxiv.org/pdf/2308.09345v1.pdf","comment":"35 pages, 7 figures, Code and a model weights available\n https://doi.org/10.5281/zenodo.8221159 and\n https://doi.org/10.5281/zenodo.8198697"},{"id":"http://arxiv.org/abs/2308.09343v1","updated":"2023-08-18T07:05:30Z","published":"2023-08-18T07:05:30Z","title":"Surprise machines: revealing Harvard Art Museums' image collection","summary":" Surprise Machines is a project of experimental museology that sets out to\nvisualize the entire image collection of the Harvard Art Museums, intending to\nopen up unexpected vistas on more than 200,000 objects usually inaccessible to\nvisitors. Part of the exhibition Curatorial A(i)gents organized by metaLAB (at)\nHarvard, the project explores the limits of artificial intelligence to display\na large set of images and create surprise among visitors. 
To achieve such a\nfeeling of surprise, a choreographic interface was designed to connect the\naudience's movement with several unique views of the collection.\n","authors":["Dario Rodighiero","Lins Derry","Douglas Duhaime","Jordan Kruguer","Maximilian C. Mueller","Christopher Pietsch","Jeffrey T. Schnapp","Jeff Steward"],"pdf_url":"https://arxiv.org/pdf/2308.09343v1.pdf","comment":"14 pages and 7 figures"},{"id":"http://arxiv.org/abs/2207.13297v5","updated":"2023-08-18T06:38:32Z","published":"2022-07-27T05:05:04Z","title":"GPS-GLASS: Learning Nighttime Semantic Segmentation Using Daytime Video\n and GPS data","summary":" Semantic segmentation for autonomous driving should be robust against various\nin-the-wild environments. Nighttime semantic segmentation is especially\nchallenging due to a lack of annotated nighttime images and a large domain gap\nfrom daytime images with sufficient annotation. In this paper, we propose a\nnovel GPS-based training framework for nighttime semantic segmentation. Given\nGPS-aligned pairs of daytime and nighttime images, we perform cross-domain\ncorrespondence matching to obtain pixel-level pseudo supervision. Moreover, we\nconduct flow estimation between daytime video frames and apply GPS-based\nscaling to acquire another pixel-level pseudo supervision. Using these pseudo\nsupervisions with a confidence map, we train a nighttime semantic segmentation\nnetwork without any annotation from nighttime images. Experimental results\ndemonstrate the effectiveness of the proposed method on several nighttime\nsemantic segmentation datasets. Our source code is available at\nhttps://github.com/jimmy9704/GPS-GLASS.\n","authors":["Hongjae Lee","Changwoo Han","Jun-Sang Yoo","Seung-Won Jung"],"pdf_url":"https://arxiv.org/pdf/2207.13297v5.pdf","comment":"ICCVW 2023"},{"id":"http://arxiv.org/abs/2308.09332v1","updated":"2023-08-18T06:27:35Z","published":"2023-08-18T06:27:35Z","title":"LSCD: A Large-Scale Screen Content Dataset for Video Compression","summary":" Multimedia compression allows us to watch videos, see pictures and hear\nsounds within a limited bandwidth, which helps the flourish of the internet.\nDuring the past decades, multimedia compression has achieved great success\nusing hand-craft features and systems. With the development of artificial\nintelligence and video compression, there emerges a lot of research work\nrelated to using the neural network on the video compression task to get rid of\nthe complicated system. Not only producing the advanced algorithms, but\nresearchers also spread the compression to different content, such as User\nGenerated Content(UGC). With the rapid development of mobile devices, screen\ncontent videos become an important part of multimedia data. In contrast, we\nfind community lacks a large-scale dataset for screen content video\ncompression, which impedes the fast development of the corresponding\nlearning-based algorithms. In order to fulfill this blank and accelerate the\nresearch of this special type of videos, we propose the Large-scale Screen\nContent Dataset(LSCD), which contains 714 source sequences. Meanwhile, we\nprovide the analysis of the proposed dataset to show some features of screen\ncontent videos, which will help researchers have a better understanding of how\nto explore new algorithms. 
Besides collecting and post-processing the data to\norganize the dataset, we also provide a benchmark containing the performance of\nboth traditional codec and learning-based methods.\n","authors":["Yuhao Cheng","Siru Zhang","Yiqiang Yan","Rong Chen","Yun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09331v1","updated":"2023-08-18T06:26:22Z","published":"2023-08-18T06:26:22Z","title":"SAMedOCT: Adapting Segment Anything Model (SAM) for Retinal OCT","summary":" The Segment Anything Model (SAM) has gained significant attention in the\nfield of image segmentation due to its impressive capabilities and prompt-based\ninterface. While SAM has already been extensively evaluated in various domains,\nits adaptation to retinal OCT scans remains unexplored. To bridge this research\ngap, we conduct a comprehensive evaluation of SAM and its adaptations on a\nlarge-scale public dataset of OCTs from RETOUCH challenge. Our evaluation\ncovers diverse retinal diseases, fluid compartments, and device vendors,\ncomparing SAM against state-of-the-art retinal fluid segmentation methods.\nThrough our analysis, we showcase adapted SAM's efficacy as a powerful\nsegmentation model in retinal OCT scans, although still lagging behind\nestablished methods in some circumstances. The findings highlight SAM's\nadaptability and robustness, showcasing its utility as a valuable tool in\nretinal OCT image analysis and paving the way for further advancements in this\ndomain.\n","authors":["Botond Fazekas","José Morano","Dmitrii Lachinov","Guilherme Aresta","Hrvoje Bogunović"],"pdf_url":"https://arxiv.org/pdf/2308.09331v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09327v1","updated":"2023-08-18T06:04:39Z","published":"2023-08-18T06:04:39Z","title":"Unlimited Knowledge Distillation for Action Recognition in the Dark","summary":" Dark videos often lose essential information, which causes the knowledge\nlearned by networks is not enough to accurately recognize actions. Existing\nknowledge assembling methods require massive GPU memory to distill the\nknowledge from multiple teacher models into a student model. In action\nrecognition, this drawback becomes serious due to much computation required by\nvideo process. Constrained by limited computation source, these approaches are\ninfeasible. To address this issue, we propose an unlimited knowledge\ndistillation (UKD) in this paper. Compared with existing knowledge assembling\nmethods, our UKD can effectively assemble different knowledge without\nintroducing high GPU memory consumption. Thus, the number of teaching models\nfor distillation is unlimited. With our UKD, the network's learned knowledge\ncan be remarkably enriched. Our experiments show that the single stream network\ndistilled with our UKD even surpasses a two-stream network. Extensive\nexperiments are conducted on the ARID dataset.\n","authors":["Ruibing Jin","Guosheng Lin","Min Wu","Jie Lin","Zhengguo Li","Xiaoli Li","Zhenghua Chen"],"pdf_url":"https://arxiv.org/pdf/2308.09327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06712v2","updated":"2023-08-18T05:49:47Z","published":"2023-04-13T17:58:08Z","title":"What does CLIP know about a red circle? Visual prompt engineering for\n VLMs","summary":" Large-scale Vision-Language Models, such as CLIP, learn powerful image-text\nrepresentations that have found numerous applications, from zero-shot\nclassification to text-to-image generation. 
Despite that, their capabilities\nfor solving novel discriminative tasks via prompting fall behind those of large\nlanguage models, such as GPT-3. Here we explore the idea of visual prompt\nengineering for solving computer vision tasks beyond classification by editing\nin image space instead of text. In particular, we discover an emergent ability\nof CLIP, where, by simply drawing a red circle around an object, we can direct\nthe model's attention to that region, while also maintaining global\ninformation. We show the power of this simple approach by achieving\nstate-of-the-art in zero-shot referring expressions comprehension and strong\nperformance in keypoint localization tasks. Finally, we draw attention to some\npotential ethical concerns of large language-vision models.\n","authors":["Aleksandar Shtedritski","Christian Rupprecht","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2304.06712v2.pdf","comment":"ICCV 2023 Oral"},{"id":"http://arxiv.org/abs/2308.09322v1","updated":"2023-08-18T05:46:20Z","published":"2023-08-18T05:46:20Z","title":"Audio-Visual Glance Network for Efficient Video Recognition","summary":" Deep learning has made significant strides in video understanding tasks, but\nthe computation required to classify lengthy and massive videos using\nclip-level video classifiers remains impractical and prohibitively expensive.\nTo address this issue, we propose Audio-Visual Glance Network (AVGN), which\nleverages the commonly available audio and visual modalities to efficiently\nprocess the spatio-temporally important parts of a video. AVGN firstly divides\nthe video into snippets of image-audio clip pair and employs lightweight\nunimodal encoders to extract global visual features and audio features. To\nidentify the important temporal segments, we use an Audio-Visual Temporal\nSaliency Transformer (AV-TeST) that estimates the saliency scores of each\nframe. To further increase efficiency in the spatial dimension, AVGN processes\nonly the important patches instead of the whole images. We use an\nAudio-Enhanced Spatial Patch Attention (AESPA) module to produce a set of\nenhanced coarse visual features, which are fed to a policy network that\nproduces the coordinates of the important patches. This approach enables us to\nfocus only on the most important spatio-temporally parts of the video, leading\nto more efficient video recognition. Moreover, we incorporate various training\ntechniques and multi-modal feature fusion to enhance the robustness and\neffectiveness of our AVGN. By combining these strategies, our AVGN sets new\nstate-of-the-art performance in multiple video recognition benchmarks while\nachieving faster processing speed.\n","authors":["Muhammad Adi Nugroho","Sangmin Woo","Sumin Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09322v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09314v1","updated":"2023-08-18T05:28:25Z","published":"2023-08-18T05:28:25Z","title":"Retro-FPN: Retrospective Feature Pyramid Network for Point Cloud\n Semantic Segmentation","summary":" Learning per-point semantic features from the hierarchical feature pyramid is\nessential for point cloud semantic segmentation. However, most previous methods\nsuffered from ambiguous region features or failed to refine per-point features\neffectively, which leads to information loss and ambiguous semantic\nidentification. 
To resolve this, we propose Retro-FPN to model the per-point\nfeature prediction as an explicit and retrospective refining process, which\ngoes through all the pyramid layers to extract semantic features explicitly for\neach point. Its key novelty is a retro-transformer for summarizing semantic\ncontexts from the previous layer and accordingly refining the features in the\ncurrent stage. In this way, the categorization of each point is conditioned on\nits local semantic pattern. Specifically, the retro-transformer consists of a\nlocal cross-attention block and a semantic gate unit. The cross-attention\nserves to summarize the semantic pattern retrospectively from the previous\nlayer. And the gate unit carefully incorporates the summarized contexts and\nrefines the current semantic features. Retro-FPN is a pluggable neural network\nthat applies to hierarchical decoders. By integrating Retro-FPN with three\nrepresentative backbones, including both point-based and voxel-based methods,\nwe show that Retro-FPN can significantly improve performance over\nstate-of-the-art backbones. Comprehensive experiments on widely used benchmarks\ncan justify the effectiveness of our design. The source is available at\nhttps://github.com/AllenXiangX/Retro-FPN\n","authors":["Peng Xiang","Xin Wen","Yu-Shen Liu","Hui Zhang","Yi Fang","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2308.09314v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2208.12489v2","updated":"2023-08-18T05:26:31Z","published":"2022-08-26T08:00:02Z","title":"GHN-Q: Parameter Prediction for Unseen Quantized Convolutional\n Architectures via Graph Hypernetworks","summary":" Deep convolutional neural network (CNN) training via iterative optimization\nhas had incredible success in finding optimal parameters. However, modern CNN\narchitectures often contain millions of parameters. Thus, any given model for a\nsingle architecture resides in a massive parameter space. Models with similar\nloss could have drastically different characteristics such as adversarial\nrobustness, generalizability, and quantization robustness. For deep learning on\nthe edge, quantization robustness is often crucial. Finding a model that is\nquantization-robust can sometimes require significant efforts. Recent works\nusing Graph Hypernetworks (GHN) have shown remarkable performance predicting\nhigh-performant parameters of varying CNN architectures. Inspired by these\nsuccesses, we wonder if the graph representations of GHN-2 can be leveraged to\npredict quantization-robust parameters as well, which we call GHN-Q. We conduct\nthe first-ever study exploring the use of graph hypernetworks for predicting\nparameters of unseen quantized CNN architectures. We focus on a reduced CNN\nsearch space and find that GHN-Q can in fact predict quantization-robust\nparameters for various 8-bit quantized CNNs. Decent quantized accuracies are\nobserved even with 4-bit quantization despite GHN-Q not being trained on it.\nQuantized finetuning of GHN-Q at lower bitwidths may bring further improvements\nand is currently being explored.\n","authors":["Stone Yun","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2208.12489v2.pdf","comment":"Updated Figure 1 and added additional results in Table 1. 
Initial\n extended abstract version accepted at Edge Intelligence Workshop 2022 for\n poster presentation"},{"id":"http://arxiv.org/abs/2308.09311v1","updated":"2023-08-18T05:19:03Z","published":"2023-08-18T05:19:03Z","title":"Lip Reading for Low-resource Languages by Learning and Combining General\n Speech Knowledge and Language-specific Knowledge","summary":" This paper proposes a novel lip reading framework, especially for\nlow-resource languages, which has not been well addressed in the previous\nliterature. Since low-resource languages do not have enough video-text paired\ndata to train the model to have sufficient power to model lip movements and\nlanguage, it is regarded as challenging to develop lip reading models for\nlow-resource languages. In order to mitigate the challenge, we try to learn\ngeneral speech knowledge, the ability to model lip movements, from a\nhigh-resource language through the prediction of speech units. It is known that\ndifferent languages partially share common phonemes, thus general speech\nknowledge learned from one language can be extended to other languages. Then,\nwe try to learn language-specific knowledge, the ability to model language, by\nproposing Language-specific Memory-augmented Decoder (LMDecoder). LMDecoder\nsaves language-specific audio features into memory banks and can be trained on\naudio-text paired data which is more easily accessible than video-text paired\ndata. Therefore, with LMDecoder, we can transform the input speech units into\nlanguage-specific audio features and translate them into texts by utilizing the\nlearned rich language knowledge. Finally, by combining general speech knowledge\nand language-specific knowledge, we can efficiently develop lip reading models\neven for low-resource languages. Through extensive experiments using five\nlanguages, English, Spanish, French, Italian, and Portuguese, the effectiveness\nof the proposed method is evaluated.\n","authors":["Minsu Kim","Jeong Hun Yeo","Jeongsoo Choi","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2308.09311v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2304.00450v2","updated":"2023-08-18T05:08:10Z","published":"2023-04-02T05:05:58Z","title":"Sketch-based Video Object Localization","summary":" We introduce Sketch-based Video Object Localization (SVOL), a new task aimed\nat localizing spatio-temporal object boxes in video queried by the input\nsketch. We first outline the challenges in the SVOL task and build the\nSketch-Video Attention Network (SVANet) with the following design principles:\n(i) to consider temporal information of video and bridge the domain gap between\nsketch and video; (ii) to accurately identify and localize multiple objects\nsimultaneously; (iii) to handle various styles of sketches; (iv) to be\nclassification-free. In particular, SVANet is equipped with a Cross-modal\nTransformer that models the interaction between learnable object tokens, query\nsketch, and video through attention operations, and learns upon a per-frame set\nmatching strategy that enables frame-wise prediction while utilizing global\nvideo context. We evaluate SVANet on a newly curated SVOL dataset. By design,\nSVANet successfully learns the mapping between the query sketches and video\nobjects, achieving state-of-the-art results on the SVOL benchmark. We further\nconfirm the effectiveness of SVANet via extensive ablation studies and\nvisualizations. 
Lastly, we demonstrate its transfer capability on unseen\ndatasets and novel categories, suggesting its high scalability in real-world\napplications\n","authors":["Sangmin Woo","So-Yeong Jeon","Jinyoung Park","Minji Son","Sumin Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2304.00450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05303v2","updated":"2023-08-18T05:06:50Z","published":"2023-08-10T02:47:36Z","title":"Multi-Visual-Inertial System: Analysis, Calibration and Estimation","summary":" In this paper, we study state estimation of multi-visual-inertial systems\n(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary\nnumber of asynchronous inertial measurement units (IMUs) or gyroscopes and\nglobal and(or) rolling shutter cameras. We are especially interested in the\nfull calibration of the associated visual-inertial sensors, including the IMU\nor camera intrinsics and the IMU-IMU(or camera) spatiotemporal extrinsics as\nwell as the image readout time of rolling-shutter cameras (if used). To this\nend, we develop a new analytic combined IMU integration with intrinsics-termed\nACI3-to preintegrate IMU measurements, which is leveraged to fuse auxiliary\nIMUs and(or) gyroscopes alongside a base IMU. We model the multi-inertial\nmeasurements to include all the necessary inertial intrinsic and IMU-IMU\nspatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body\nconstraints to eliminate the necessity of auxiliary inertial poses and thus\nreducing computational complexity. By performing observability analysis of\nMVIS, we prove that the standard four unobservable directions remain - no\nmatter how many inertial sensors are used, and also identify, for the first\ntime, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary\ninertial intrinsics. In addition to the extensive simulations that validate our\nanalysis and algorithms, we have built our own MVIS sensor rig and collected\nover 25 real-world datasets to experimentally verify the proposed calibration\nagainst the state-of-the-art calibration method such as Kalibr. We show that\nthe proposed MVIS calibration is able to achieve competing accuracy with\nimproved convergence and repeatability, which is open sourced to better benefit\nthe community.\n","authors":["Yulin Yang","Patrick Geneva","Guoquan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.05303v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09307v1","updated":"2023-08-18T05:05:30Z","published":"2023-08-18T05:05:30Z","title":"Rethinking Image Forgery Detection via Contrastive Learning and\n Unsupervised Clustering","summary":" Image forgery detection aims to detect and locate forged regions in an image.\nMost existing forgery detection algorithms formulate classification problems to\nclassify pixels into forged or pristine. However, the definition of forged and\npristine pixels is only relative within one single image, e.g., a forged region\nin image A is actually a pristine one in its source image B (splicing forgery).\nSuch a relative definition has been severely overlooked by existing methods,\nwhich unnecessarily mix forged (pristine) regions across different images into\nthe same category. To resolve this dilemma, we propose the FOrensic ContrAstive\ncLustering (FOCAL) method, a novel, simple yet very effective paradigm based on\ncontrastive learning and unsupervised clustering for the image forgery\ndetection. 
Specifically, FOCAL 1) utilizes pixel-level contrastive learning to\nsupervise the high-level forensic feature extraction in an image-by-image\nmanner, explicitly reflecting the above relative definition; 2) employs an\non-the-fly unsupervised clustering algorithm (instead of a trained one) to\ncluster the learned features into forged/pristine categories, further\nsuppressing the cross-image influence from training data; and 3) allows to\nfurther boost the detection performance via simple feature-level concatenation\nwithout the need of retraining. Extensive experimental results over six public\ntesting datasets demonstrate that our proposed FOCAL significantly outperforms\nthe state-of-the-art competing algorithms by big margins: +24.3% on Coverage,\n+18.6% on Columbia, +17.5% on FF++, +14.2% on MISD, +13.5% on CASIA and +10.3%\non NIST in terms of IoU. The paradigm of FOCAL could bring fresh insights and\nserve as a novel benchmark for the image forgery detection task. The code is\navailable at https://github.com/HighwayWu/FOCAL.\n","authors":["Haiwei Wu","Yiming Chen","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09306v1","updated":"2023-08-18T05:03:48Z","published":"2023-08-18T05:03:48Z","title":"DiffDis: Empowering Generative Diffusion Model with Cross-Modal\n Discrimination Capability","summary":" Recently, large-scale diffusion models, e.g., Stable diffusion and DallE2,\nhave shown remarkable results on image synthesis. On the other hand,\nlarge-scale cross-modal pre-trained models (e.g., CLIP, ALIGN, and FILIP) are\ncompetent for various downstream tasks by learning to align vision and language\nembeddings. In this paper, we explore the possibility of jointly modeling\ngeneration and discrimination. Specifically, we propose DiffDis to unify the\ncross-modal generative and discriminative pretraining into one single framework\nunder the diffusion process. DiffDis first formulates the image-text\ndiscriminative problem as a generative diffusion process of the text embedding\nfrom the text encoder conditioned on the image. Then, we propose a novel\ndual-stream network architecture, which fuses the noisy text embedding with the\nknowledge of latent images from different scales for image-text discriminative\nlearning. Moreover, the generative and discriminative tasks can efficiently\nshare the image-branch network structure in the multi-modality model.\nBenefiting from diffusion-based unified training, DiffDis achieves both better\ngeneration ability and cross-modal semantic alignment in one architecture.\nExperimental results show that DiffDis outperforms single-task models on both\nthe image generation and the image-text discriminative tasks, e.g., 1.65%\nimprovement on average accuracy of zero-shot classification over 12 datasets\nand 2.42 improvement on FID of zero-shot image synthesis.\n","authors":["Runhui Huang","Jianhua Han","Guansong Lu","Xiaodan Liang","Yihan Zeng","Wei Zhang","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.09306v1.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.09305v1","updated":"2023-08-18T05:01:52Z","published":"2023-08-18T05:01:52Z","title":"Human Part-wise 3D Motion Context Learning for Sign Language Recognition","summary":" In this paper, we propose P3D, the human part-wise motion context learning\nframework for sign language recognition. 
Our main contributions lie in two\ndimensions: learning the part-wise motion context and employing the pose\nensemble to utilize 2D and 3D pose jointly. First, our empirical observation\nimplies that part-wise context encoding benefits the performance of sign\nlanguage recognition. While previous methods of sign language recognition\nlearned motion context from the sequence of the entire pose, we argue that such\nmethods cannot exploit part-specific motion context. In order to utilize\npart-wise motion context, we propose the alternating combination of a part-wise\nencoding Transformer (PET) and a whole-body encoding Transformer (WET). PET\nencodes the motion contexts from a part sequence, while WET merges them into a\nunified context. By learning part-wise motion context, our P3D achieves\nsuperior performance on WLASL compared to previous state-of-the-art methods.\nSecond, our framework is the first to ensemble 2D and 3D poses for sign\nlanguage recognition. Since the 3D pose holds rich motion context and depth\ninformation to distinguish the words, our P3D outperformed the previous\nstate-of-the-art methods employing a pose ensemble.\n","authors":["Taeryung Lee","Yeonguk Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2308.09305v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09303v1","updated":"2023-08-18T04:52:56Z","published":"2023-08-18T04:52:56Z","title":"Online Class Incremental Learning on Stochastic Blurry Task Boundary via\n Mask and Visual Prompt Tuning","summary":" Continual learning aims to learn a model from a continuous stream of data,\nbut it mainly assumes a fixed number of data and tasks with clear task\nboundaries. However, in real-world scenarios, the number of input data and\ntasks is constantly changing in a statistical way, not a static way. Although\nrecently introduced incremental learning scenarios having blurry task\nboundaries somewhat address the above issues, they still do not fully reflect\nthe statistical properties of real-world situations because of the fixed ratio\nof disjoint and blurry samples. In this paper, we propose a new Stochastic\nincremental Blurry task boundary scenario, called Si-Blurry, which reflects the\nstochastic properties of the real-world. We find that there are two major\nchallenges in the Si-Blurry scenario: (1) inter- and intra-task forgettings and\n(2) class imbalance problem. To alleviate them, we introduce Mask and Visual\nPrompt tuning (MVP). In MVP, to address the inter- and intra-task forgetting\nissues, we propose a novel instance-wise logit masking and contrastive visual\nprompt tuning loss. Both of them help our model discern the classes to be\nlearned in the current batch. It results in consolidating the previous\nknowledge. 
In addition, to alleviate the class imbalance problem, we introduce\na new gradient similarity-based focal loss and adaptive feature scaling to ease\noverfitting to the major classes and underfitting to the minor classes.\nExtensive experiments show that our proposed MVP significantly outperforms the\nexisting state-of-the-art methods in our challenging Si-Blurry scenario.\n","authors":["Jun-Yeong Moon","Keon-Hee Park","Jung Uk Kim","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2308.09303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09300v1","updated":"2023-08-18T04:49:38Z","published":"2023-08-18T04:49:38Z","title":"V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by\n Connecting Foundation Models","summary":" Building artificial intelligence (AI) systems on top of a set of foundation\nmodels (FMs) is becoming a new paradigm in AI research. Their representative\nand generative abilities learnt from vast amounts of data can be easily adapted\nand transferred to a wide range of downstream tasks without extra training from\nscratch. However, leveraging FMs in cross-modal generation remains\nunder-researched when audio modality is involved. On the other hand,\nautomatically generating semantically-relevant sound from visual input is an\nimportant problem in cross-modal generation studies. To solve this\nvision-to-audio (V2A) generation problem, existing methods tend to design and\nbuild complex systems from scratch using modestly sized datasets. In this\npaper, we propose a lightweight solution to this problem by leveraging\nfoundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate\nthe domain gap between the latent space of the visual CLIP and the auditory\nCLAP models. Then we propose a simple yet effective mapper mechanism\n(V2A-Mapper) to bridge the domain gap by translating the visual input between\nCLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained\naudio generative FM AudioLDM is adopted to produce high-fidelity and\nvisually-aligned sound. Compared to previous approaches, our method only\nrequires a quick training of the V2A-Mapper. We further analyze and conduct\nextensive experiments on the choice of the V2A-Mapper and show that a\ngenerative mapper is better at fidelity and variability (FD) while a regression\nmapper is slightly better at relevance (CS). Both objective and subjective\nevaluation on two V2A datasets demonstrate the superiority of our proposed\nmethod compared to current state-of-the-art approaches - trained with 86% fewer\nparameters but achieving 53% and 19% improvement in FD and CS, respectively.\n","authors":["Heng Wang","Jianbo Ma","Santiago Pascual","Richard Cartwright","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2308.09300v1.pdf","comment":"13 pages, 10 figures. Code, demo, and samples:\n https://v2a-mapper.github.io/"},{"id":"http://arxiv.org/abs/2308.09298v1","updated":"2023-08-18T04:48:23Z","published":"2023-08-18T04:48:23Z","title":"Inferior Alveolar Nerve Segmentation in CBCT images using\n Connectivity-Based Selective Re-training","summary":" Inferior Alveolar Nerve (IAN) canal detection in CBCT is an important step in\nmany dental and maxillofacial surgery applications to prevent irreversible\ndamage to the nerve during the procedure.The ToothFairy2023 Challenge aims to\nestablish a 3D maxillofacial dataset consisting of all sparse labels and\npartial dense labels, and improve the ability of automatic IAN segmentation. 
In\nthis work, in order to avoid the negative impact brought by sparse labeling, we\ntransform the mixed supervised problem into a semi-supervised problem. Inspired\nby self-training via pseudo labeling, we propose a selective re-training\nframework based on IAN connectivity. Our method is quantitatively evaluated on\nthe ToothFairy verification cases, achieving the dice similarity coefficient\n(DSC) of 0.7956, and 95\\% hausdorff distance (HD95) of 4.4905, and wining the\nchampion in the competition. Code is available at\nhttps://github.com/GaryNico517/SSL-IAN-Retraining.\n","authors":["Yusheng Liu","Rui Xin","Tao Yang","Lisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09298v1.pdf","comment":"technical paper for Miccai ToothFairy2023 Challenge"},{"id":"http://arxiv.org/abs/2308.09297v1","updated":"2023-08-18T04:47:39Z","published":"2023-08-18T04:47:39Z","title":"NAPA-VQ: Neighborhood Aware Prototype Augmentation with Vector\n Quantization for Continual Learning","summary":" Catastrophic forgetting; the loss of old knowledge upon acquiring new\nknowledge, is a pitfall faced by deep neural networks in real-world\napplications. Many prevailing solutions to this problem rely on storing\nexemplars (previously encountered data), which may not be feasible in\napplications with memory limitations or privacy constraints. Therefore, the\nrecent focus has been on Non-Exemplar based Class Incremental Learning (NECIL)\nwhere a model incrementally learns about new classes without using any past\nexemplars. However, due to the lack of old data, NECIL methods struggle to\ndiscriminate between old and new classes causing their feature representations\nto overlap. We propose NAPA-VQ: Neighborhood Aware Prototype Augmentation with\nVector Quantization, a framework that reduces this class overlap in NECIL. We\ndraw inspiration from Neural Gas to learn the topological relationships in the\nfeature space, identifying the neighboring classes that are most likely to get\nconfused with each other. This neighborhood information is utilized to enforce\nstrong separation between the neighboring classes as well as to generate old\nclass representative prototypes that can better aid in obtaining a\ndiscriminative decision boundary between old and new classes. Our comprehensive\nexperiments on CIFAR-100, TinyImageNet, and ImageNet-Subset demonstrate that\nNAPA-VQ outperforms the State-of-the-art NECIL methods by an average\nimprovement of 5%, 2%, and 4% in accuracy and 10%, 3%, and 9% in forgetting\nrespectively. Our code can be found in https://github.com/TamashaM/NAPA-VQ.git.\n","authors":["Tamasha Malepathirana","Damith Senanayake","Saman Halgamuge"],"pdf_url":"https://arxiv.org/pdf/2308.09297v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09294v1","updated":"2023-08-18T04:41:50Z","published":"2023-08-18T04:41:50Z","title":"Self-Calibrated Cross Attention Network for Few-Shot Segmentation","summary":" The key to the success of few-shot segmentation (FSS) lies in how to\neffectively utilize support samples. Most solutions compress support foreground\n(FG) features into prototypes, but lose some spatial details. Instead, others\nuse cross attention to fuse query features with uncompressed support FG. Query\nFG could be fused with support FG, however, query background (BG) cannot find\nmatched BG features in support FG, yet inevitably integrates dissimilar\nfeatures. 
Besides, as both query FG and BG are combined with support FG, they\nget entangled, thereby leading to ineffective segmentation. To cope with these\nissues, we design a self-calibrated cross attention (SCCA) block. For efficient\npatch-based attention, query and support features are firstly split into\npatches. Then, we design a patch alignment module to align each query patch\nwith its most similar support patch for better cross attention. Specifically,\nSCCA takes a query patch as Q, and groups the patches from the same query image\nand the aligned patches from the support image as K&V. In this way, the query\nBG features are fused with matched BG features (from query patches), and thus\nthe aforementioned issues will be mitigated. Moreover, when calculating SCCA,\nwe design a scaled-cosine mechanism to better utilize the support features for\nsimilarity calculation. Extensive experiments conducted on PASCAL-5^i and\nCOCO-20^i demonstrate the superiority of our model, e.g., the mIoU score under\n5-shot setting on COCO-20^i is 5.6%+ better than previous state-of-the-arts.\nThe code is available at https://github.com/Sam1224/SCCAN.\n","authors":["Qianxiong Xu","Wenting Zhao","Guosheng Lin","Cheng Long"],"pdf_url":"https://arxiv.org/pdf/2308.09294v1.pdf","comment":"This paper is accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2307.08238v2","updated":"2023-08-18T04:35:06Z","published":"2023-07-17T04:39:18Z","title":"Unified Open-Vocabulary Dense Visual Prediction","summary":" In recent years, open-vocabulary (OV) dense visual prediction (such as OV\nobject detection, semantic, instance and panoptic segmentations) has attracted\nincreasing research attention. However, most of existing approaches are\ntask-specific and individually tackle each task. In this paper, we propose a\nUnified Open-Vocabulary Network (UOVN) to jointly address four common dense\nprediction tasks. Compared with separate models, a unified network is more\ndesirable for diverse industrial applications. Moreover, OV dense prediction\ntraining data is relatively less. Separate networks can only leverage\ntask-relevant training data, while a unified approach can integrate diverse\ntraining data to boost individual tasks. We address two major challenges in\nunified OV prediction. Firstly, unlike unified methods for fixed-set\npredictions, OV networks are usually trained with multi-modal data. Therefore,\nwe propose a multi-modal, multi-scale and multi-task (MMM) decoding mechanism\nto better leverage multi-modal data. Secondly, because UOVN uses data from\ndifferent tasks for training, there are significant domain and task gaps. We\npresent a UOVN training mechanism to reduce such gaps. Experiments on four\ndatasets demonstrate the effectiveness of our UOVN.\n","authors":["Hengcan Shi","Munawar Hayat","Jianfei Cai"],"pdf_url":"https://arxiv.org/pdf/2307.08238v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09285v1","updated":"2023-08-18T04:05:18Z","published":"2023-08-18T04:05:18Z","title":"RFDforFin: Robust Deep Forgery Detection for GAN-generated Fingerprint\n Images","summary":" With the rapid development of the image generation technologies, the\nmalicious abuses of the GAN-generated fingerprint images poses a significant\nthreat to the public safety in certain circumstances. 
Although the existing\nuniversal deep forgery detection approach can be applied to detect the fake\nfingerprint images, they are easily attacked and have poor robustness.\nMeanwhile, there is no specifically designed deep forgery detection method for\nfingerprint images. In this paper, we propose the first deep forgery detection\napproach for fingerprint images, which combines unique ridge features of\nfingerprint and generation artifacts of the GAN-generated images, to the best\nof our knowledge. Specifically, we firstly construct a ridge stream, which\nexploits the grayscale variations along the ridges to extract unique\nfingerprint-specific features. Then, we construct a generation artifact stream,\nin which the FFT-based spectrums of the input fingerprint images are exploited,\nto extract more robust generation artifact features. At last, the unique ridge\nfeatures and generation artifact features are fused for binary classification\n(\\textit{i.e.}, real or fake). Comprehensive experiments demonstrate that our\nproposed approach is effective and robust with low complexities.\n","authors":["Hui Miao","Yuanfang Guo","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09285v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.08428v2","updated":"2023-08-18T04:04:30Z","published":"2023-08-16T15:19:52Z","title":"ALIP: Adaptive Language-Image Pre-training with Synthetic Caption","summary":" Contrastive Language-Image Pre-training (CLIP) has significantly boosted the\nperformance of various vision-language tasks by scaling up the dataset with\nimage-text pairs collected from the web. However, the presence of intrinsic\nnoise and unmatched image-text pairs in web data can potentially affect the\nperformance of representation learning. To address this issue, we first utilize\nthe OFA model to generate synthetic captions that focus on the image content.\nThe generated captions contain complementary information that is beneficial for\npre-training. Then, we propose an Adaptive Language-Image Pre-training (ALIP),\na bi-path model that integrates supervision from both raw text and synthetic\ncaption. As the core components of ALIP, the Language Consistency Gate (LCG)\nand Description Consistency Gate (DCG) dynamically adjust the weights of\nsamples and image-text/caption pairs during the training process. Meanwhile,\nthe adaptive contrastive loss can effectively reduce the impact of noise data\nand enhances the efficiency of pre-training data. We validate ALIP with\nexperiments on different scales of models and pre-training datasets.\nExperiments results show that ALIP achieves state-of-the-art performance on\nmultiple downstream tasks including zero-shot image-text retrieval and linear\nprobe. To facilitate future research, the code and pre-trained models are\nreleased at https://github.com/deepglint/ALIP.\n","authors":["Kaicheng Yang","Jiankang Deng","Xiang An","Jiawei Li","Ziyong Feng","Jia Guo","Jing Yang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08428v2.pdf","comment":"15pages, 10figures, ICCV2023"},{"id":"http://arxiv.org/abs/2308.00214v2","updated":"2023-08-18T04:00:55Z","published":"2023-08-01T01:12:29Z","title":"Robust Single-view Cone-beam X-ray Pose Estimation with Neural Tuned\n Tomography (NeTT) and Masked Neural Radiance Fields (mNeRF)","summary":" Many tasks performed in image-guided, mini-invasive, medical procedures can\nbe cast as pose estimation problems, where an X-ray projection is utilized to\nreach a target in 3D space. 
Expanding on recent advances in the differentiable\nrendering of optically reflective materials, we introduce new methods for pose\nestimation of radiolucent objects using X-ray projections, and we demonstrate\nthe critical role of optimal view synthesis in performing this task. We first\ndevelop an algorithm (DiffDRR) that efficiently computes Digitally\nReconstructed Radiographs (DRRs) and leverages automatic differentiation within\nTensorFlow. Pose estimation is performed by iterative gradient descent using a\nloss function that quantifies the similarity of the DRR synthesized from a\nrandomly initialized pose and the true fluoroscopic image at the target pose.\nWe propose two novel methods for high-fidelity view synthesis, Neural Tuned\nTomography (NeTT) and masked Neural Radiance Fields (mNeRF). Both methods rely\non classic Cone-Beam Computerized Tomography (CBCT); NeTT directly optimizes\nthe CBCT densities, while the non-zero values of mNeRF are constrained by a 3D\nmask of the anatomic region segmented from CBCT. We demonstrate that both NeTT\nand mNeRF distinctly improve pose estimation within our framework. By defining\na successful pose estimate to be a 3D angle error of less than 3 deg, we find\nthat NeTT and mNeRF can achieve similar results, both with overall success\nrates more than 93%. However, the computational cost of NeTT is significantly\nlower than mNeRF in both training and pose estimation. Furthermore, we show\nthat a NeTT trained for a single subject can generalize to synthesize\nhigh-fidelity DRRs and ensure robust pose estimations for all other subjects.\nTherefore, we suggest that NeTT is an attractive option for robust pose\nestimation using fluoroscopic projections.\n","authors":["Chaochao Zhou","Syed Hasib Akhter Faruqui","Abhinav Patel","Ramez N. Abdalla","Michael C. Hurley","Ali Shaibani","Matthew B. Potts","Babak S. Jahromi","Leon Cho","Sameer A. Ansari","Donald R. Cantrell"],"pdf_url":"https://arxiv.org/pdf/2308.00214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09281v1","updated":"2023-08-18T03:46:29Z","published":"2023-08-18T03:46:29Z","title":"Diverse Cotraining Makes Strong Semi-Supervised Segmentor","summary":" Deep co-training has been introduced to semi-supervised segmentation and\nachieves impressive results, yet few studies have explored the working\nmechanism behind it. In this work, we revisit the core assumption that supports\nco-training: multiple compatible and conditionally independent views. By\ntheoretically deriving the generalization upper bound, we prove the prediction\nsimilarity between two models negatively impacts the model's generalization\nability. However, most current co-training models are tightly coupled together\nand violate this assumption. Such coupling leads to the homogenization of\nnetworks and confirmation bias which consequently limits the performance. To\nthis end, we explore different dimensions of co-training and systematically\nincrease the diversity from the aspects of input domains, different\naugmentations and model architectures to counteract homogenization. Our Diverse\nCo-training outperforms the state-of-the-art (SOTA) methods by a large margin\nacross different evaluation protocols on the Pascal and Cityscapes. For\nexample. 
we achieve the best mIoU of 76.2%, 77.7% and 80.2% on Pascal with only\n92, 183 and 366 labeled images, surpassing the previous best results by more\nthan 5%.\n","authors":["Yijiang Li","Xinjiang Wang","Lihe Yang","Litong Feng","Wayne Zhang","Ying Gao"],"pdf_url":"https://arxiv.org/pdf/2308.09281v1.pdf","comment":"ICCV2023, Camera Ready Version, Code:\n \\url{https://github.com/williamium3000/diverse-cotraining}"},{"id":"http://arxiv.org/abs/2308.09279v1","updated":"2023-08-18T03:40:40Z","published":"2023-08-18T03:40:40Z","title":"DiffLLE: Diffusion-guided Domain Calibration for Unsupervised Low-light\n Image Enhancement","summary":" Existing unsupervised low-light image enhancement methods lack enough\neffectiveness and generalization in practical applications. We suppose this is\nbecause of the absence of explicit supervision and the inherent gap between\nreal-world scenarios and the training data domain. In this paper, we develop\nDiffusion-based domain calibration to realize more robust and effective\nunsupervised Low-Light Enhancement, called DiffLLE. Since the diffusion model\nperforms impressive denoising capability and has been trained on massive clean\nimages, we adopt it to bridge the gap between the real low-light domain and\ntraining degradation domain, while providing efficient priors of real-world\ncontent for unsupervised models. Specifically, we adopt a naive unsupervised\nenhancement algorithm to realize preliminary restoration and design two\nzero-shot plug-and-play modules based on diffusion model to improve\ngeneralization and effectiveness. The Diffusion-guided Degradation Calibration\n(DDC) module narrows the gap between real-world and training low-light\ndegradation through diffusion-based domain calibration and a lightness\nenhancement curve, which makes the enhancement model perform robustly even in\nsophisticated wild degradation. Due to the limited enhancement effect of the\nunsupervised model, we further develop the Fine-grained Target domain\nDistillation (FTD) module to find a more visual-friendly solution space. It\nexploits the priors of the pre-trained diffusion model to generate\npseudo-references, which shrinks the preliminary restored results from a coarse\nnormal-light domain to a finer high-quality clean field, addressing the lack of\nstrong explicit supervision for unsupervised methods. Benefiting from these,\nour approach even outperforms some supervised methods by using only a simple\nunsupervised baseline. Extensive experiments demonstrate the superior\neffectiveness of the proposed DiffLLE.\n","authors":["Shuzhou Yang","Xuanyu Zhang","Yinhuai Wang","Jiwen Yu","Yuhan Wang","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09278v1","updated":"2023-08-18T03:40:38Z","published":"2023-08-18T03:40:38Z","title":"MATLABER: Material-Aware Text-to-3D via LAtent BRDF auto-EncodeR","summary":" Based on powerful text-to-image diffusion models, text-to-3D generation has\nmade significant progress in generating compelling geometry and appearance.\nHowever, existing methods still struggle to recover high-fidelity object\nmaterials, either only considering Lambertian reflectance, or failing to\ndisentangle BRDF materials from the environment lights. In this work, we\npropose Material-Aware Text-to-3D via LAtent BRDF auto-EncodeR\n(\\textbf{MATLABER}) that leverages a novel latent BRDF auto-encoder for\nmaterial generation. 
We train this auto-encoder with large-scale real-world\nBRDF collections and ensure the smoothness of its latent space, which\nimplicitly acts as a natural distribution of materials. During appearance\nmodeling in text-to-3D generation, the latent BRDF embeddings, rather than BRDF\nparameters, are predicted via a material network. Through exhaustive\nexperiments, our approach demonstrates the superiority over existing ones in\ngenerating realistic and coherent object materials. Moreover, high-quality\nmaterials naturally enable multiple downstream tasks such as relighting and\nmaterial editing. Code and model will be publicly available at\n\\url{https://sheldontsui.github.io/projects/Matlaber}.\n","authors":["Xudong Xu","Zhaoyang Lyu","Xingang Pan","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2308.09278v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04356v2","updated":"2023-08-18T03:28:59Z","published":"2023-07-10T05:49:20Z","title":"InfLoR-SNN: Reducing Information Loss for Spiking Neural Networks","summary":" The Spiking Neural Network (SNN) has attracted more and more attention\nrecently. It adopts binary spike signals to transmit information. Benefitting\nfrom the information passing paradigm of SNNs, the multiplications of\nactivations and weights can be replaced by additions, which are more\nenergy-efficient. However, its \"Hard Reset\" mechanism for the firing activity\nwould ignore the difference among membrane potentials when the membrane\npotential is above the firing threshold, causing information loss. Meanwhile,\nquantifying the membrane potential to 0/1 spikes at the firing instants will\ninevitably introduce the quantization error thus bringing about information\nloss too. To address these problems, we propose to use the \"Soft Reset\"\nmechanism for the supervised training-based SNNs, which will drive the membrane\npotential to a dynamic reset potential according to its magnitude, and Membrane\nPotential Rectifier (MPR) to reduce the quantization error via redistributing\nthe membrane potential to a range close to the spikes. Results show that the\nSNNs with the \"Soft Reset\" mechanism and MPR outperform their vanilla\ncounterparts on both static and dynamic datasets.\n","authors":["Yufei Guo","Yuanpei Chen","Liwen Zhang","Xiaode Liu","Xinyi Tong","Yuanyuan Ou","Xuhui Huang","Zhe Ma"],"pdf_url":"https://arxiv.org/pdf/2307.04356v2.pdf","comment":"Accepted by ECCV2022"},{"id":"http://arxiv.org/abs/2308.06087v2","updated":"2023-08-18T03:19:52Z","published":"2023-08-11T11:57:58Z","title":"Audio-Visual Spatial Integration and Recursive Attention for Robust\n Sound Source Localization","summary":" The objective of the sound source localization task is to enable machines to\ndetect the location of sound-making objects within a visual scene. While the\naudio modality provides spatial cues to locate the sound source, existing\napproaches only use audio as an auxiliary role to compare spatial regions of\nthe visual modality. Humans, on the other hand, utilize both audio and visual\nmodalities as spatial cues to locate sound sources. In this paper, we propose\nan audio-visual spatial integration network that integrates spatial cues from\nboth modalities to mimic human behavior when detecting sound-making objects.\nAdditionally, we introduce a recursive attention network to mimic human\nbehavior of iterative focusing on objects, resulting in more accurate attention\nregions. 
To effectively encode spatial information from both modalities, we\npropose audio-visual pair matching loss and spatial region alignment loss. By\nutilizing the spatial cues of audio-visual modalities and recursively focusing\nobjects, our method can perform more robust sound source localization.\nComprehensive experimental results on the Flickr SoundNet and VGG-Sound Source\ndatasets demonstrate the superiority of our proposed method over existing\napproaches. Our code is available at: https://github.com/VisualAIKHU/SIRA-SSL\n","authors":["Sung Jin Um","Dongjin Kim","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2308.06087v2.pdf","comment":"Camera-Ready, ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.09268v1","updated":"2023-08-18T03:14:05Z","published":"2023-08-18T03:14:05Z","title":"Progression-Guided Temporal Action Detection in Videos","summary":" We present a novel framework, Action Progression Network (APN), for temporal\naction detection (TAD) in videos. The framework locates actions in videos by\ndetecting the action evolution process. To encode the action evolution, we\nquantify a complete action process into 101 ordered stages (0\\%, 1\\%, ...,\n100\\%), referred to as action progressions. We then train a neural network to\nrecognize the action progressions. The framework detects action boundaries by\ndetecting complete action processes in the videos, e.g., a video segment with\ndetected action progressions closely follow the sequence 0\\%, 1\\%, ..., 100\\%.\nThe framework offers three major advantages: (1) Our neural networks are\ntrained end-to-end, contrasting conventional methods that optimize modules\nseparately; (2) The APN is trained using action frames exclusively, enabling\nmodels to be trained on action classification datasets and robust to videos\nwith temporal background styles differing from those in training; (3) Our\nframework effectively avoids detecting incomplete actions and excels in\ndetecting long-lasting actions due to the fine-grained and explicit encoding of\nthe temporal structure of actions. Leveraging these advantages, the APN\nachieves competitive performance and significantly surpasses its counterparts\nin detecting long-lasting actions. With an IoU threshold of 0.5, the APN\nachieves a mean Average Precision (mAP) of 58.3\\% on the THUMOS14 dataset and\n98.9\\% mAP on the DFMAD70 dataset.\n","authors":["Chongkai Lu","Man-Wai Mak","Ruimin Li","Zheru Chi","Hong Fu"],"pdf_url":"https://arxiv.org/pdf/2308.09268v1.pdf","comment":"Under Review. Code available at https://github.com/makecent/APN"},{"id":"http://arxiv.org/abs/2307.08199v2","updated":"2023-08-18T03:06:16Z","published":"2023-07-17T02:03:17Z","title":"Unbiased Image Synthesis via Manifold-Driven Sampling in Diffusion\n Models","summary":" Diffusion models are a potent class of generative models capable of producing\nhigh-quality images. However, they can face challenges related to data bias,\nfavoring specific modes of data, especially when the training data does not\naccurately represent the true data distribution and exhibits skewed or\nimbalanced patterns. For instance, the CelebA dataset contains more female\nimages than male images, leading to biased generation results and impacting\ndownstream applications. To address this issue, we propose a novel method that\nleverages manifold guidance to mitigate data bias in diffusion models. 
Our key\nidea is to estimate the manifold of the training data using an unsupervised\napproach, and then use it to guide the sampling process of diffusion models.\nThis encourages the generated images to be uniformly distributed on the data\nmanifold without altering the model architecture or necessitating labels or\nretraining. Theoretical analysis and empirical evidence demonstrate the\neffectiveness of our method in improving the quality and unbiasedness of image\ngeneration compared to standard diffusion models.\n","authors":["Xingzhe Su","Yi Ren","Wenwen Qiang","Zeen Song","Hang Gao","Fengge Wu","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.08199v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09495v3","updated":"2023-08-18T02:40:18Z","published":"2023-03-16T17:15:25Z","title":"Among Us: Adversarially Robust Collaborative Perception by Consensus","summary":" Multiple robots could perceive a scene (e.g., detect objects) collaboratively\nbetter than individuals, although easily suffer from adversarial attacks when\nusing deep learning. This could be addressed by the adversarial defense, but\nits training requires the often-unknown attacking mechanism. Differently, we\npropose ROBOSAC, a novel sampling-based defense strategy generalizable to\nunseen attackers. Our key idea is that collaborative perception should lead to\nconsensus rather than dissensus in results compared to individual perception.\nThis leads to our hypothesize-and-verify framework: perception results with and\nwithout collaboration from a random subset of teammates are compared until\nreaching a consensus. In such a framework, more teammates in the sampled subset\noften entail better perception performance but require longer sampling time to\nreject potential attackers. Thus, we derive how many sampling trials are needed\nto ensure the desired size of an attacker-free subset, or equivalently, the\nmaximum size of such a subset that we can successfully sample within a given\nnumber of trials. We validate our method on the task of collaborative 3D object\ndetection in autonomous driving scenarios.\n","authors":["Yiming Li","Qi Fang","Jiamu Bai","Siheng Chen","Felix Juefei-Xu","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2303.09495v3.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2304.01168v3","updated":"2023-08-18T02:38:06Z","published":"2023-04-03T17:37:00Z","title":"DeepAccident: A Motion and Accident Prediction Benchmark for V2X\n Autonomous Driving","summary":" Safety is the primary priority of autonomous driving. Nevertheless, no\npublished dataset currently supports the direct and explainable safety\nevaluation for autonomous driving. In this work, we propose DeepAccident, a\nlarge-scale dataset generated via a realistic simulator containing diverse\naccident scenarios that frequently occur in real-world driving. The proposed\nDeepAccident dataset includes 57K annotated frames and 285K annotated samples,\napproximately 7 times more than the large-scale nuScenes dataset with 40k\nannotated samples. In addition, we propose a new task, end-to-end motion and\naccident prediction, which can be used to directly evaluate the accident\nprediction ability for different autonomous driving algorithms. Furthermore,\nfor each scenario, we set four vehicles along with one infrastructure to record\ndata, thus providing diverse viewpoints for accident scenarios and enabling V2X\n(vehicle-to-everything) research on perception and prediction tasks. 
Finally,\nwe present a baseline V2X model named V2XFormer that demonstrates superior\nperformance for motion and accident prediction and 3D object detection compared\nto the single-vehicle model.\n","authors":["Tianqi Wang","Sukmin Kim","Wenxuan Ji","Enze Xie","Chongjian Ge","Junsong Chen","Zhenguo Li","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2304.01168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07567v2","updated":"2023-08-18T02:32:17Z","published":"2023-04-15T14:08:47Z","title":"CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure\n for Vision-Language Retrieval","summary":" Current vision-language retrieval aims to perform cross-modal instance\nsearch, in which the core idea is to learn consistent vision-language\nrepresentations. Although the performance of cross-modal retrieval has greatly\nimproved with the development of deep models, we unfortunately find that\ntraditional hard consistency may destroy the original relationships among\nsingle-modal instances, leading to performance degradation for single-modal\nretrieval. To address this challenge, in this paper, we experimentally observe\nthat the vision-language divergence may cause the existence of strong and weak\nmodalities, and the hard cross-modal consistency cannot guarantee that strong\nmodal instances' relationships are not affected by the weak modality, resulting in\nthe strong modal instances' relationships being perturbed despite learned consistent\nrepresentations. To this end, we propose a novel and directly Coordinated\nVision-Language Retrieval method (dubbed CoVLR), which aims to study and\nalleviate the desynchrony problem between the cross-modal alignment and\nsingle-modal cluster-preserving tasks. CoVLR addresses this challenge by\ndeveloping an effective meta-optimization based strategy, in which the\ncross-modal consistency objective and the intra-modal relation preserving\nobjective are treated as the meta-train and meta-test tasks, thereby CoVLR\nencourages both tasks to be optimized in a coordinated way. Consequently, we\ncan simultaneously ensure cross-modal consistency and intra-modal structure.\nExperiments on different datasets validate that CoVLR can improve single-modal\nretrieval accuracy whilst preserving cross-modal retrieval capacity compared\nwith the baselines.\n","authors":["Yang Yang","Zhongtian Fu","Xiangyu Wu","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2304.07567v2.pdf","comment":"I apologize for my operational mistake, which has resulted in the\n absence of a revised version of the manuscript. Furthermore, I am concerned\n that the submission process of this paper may potentially lead to conflicts.\n Therefore, I kindly request the withdrawal of the manuscript"},{"id":"http://arxiv.org/abs/2308.09247v1","updated":"2023-08-18T02:17:47Z","published":"2023-08-18T02:17:47Z","title":"Point Contrastive Prediction with Semantic Clustering for\n Self-Supervised Learning on Point Cloud Videos","summary":" We propose a unified point cloud video self-supervised learning framework for\nobject-centric and scene-centric data. Previous methods commonly conduct\nrepresentation learning at the clip or frame level and cannot well capture\nfine-grained semantics. Instead of contrasting the representations of clips or\nframes, in this paper, we propose a unified self-supervised framework by\nconducting contrastive learning at the point level. 
Moreover, we introduce a\nnew pretext task by achieving semantic alignment of superpoints, which further\nfacilitates the representations to capture semantic cues at multiple scales. In\naddition, due to the high redundancy in the temporal dimension of dynamic point\nclouds, directly conducting contrastive learning at the point level usually\nleads to massive undesired negatives and insufficient modeling of positive\nrepresentations. To remedy this, we propose a selection strategy to retain\nproper negatives and make use of high-similarity samples from other instances\nas positive supplements. Extensive experiments show that our method outperforms\nsupervised counterparts on a wide range of downstream tasks and demonstrates\nthe superior transferability of the learned representations.\n","authors":["Xiaoxiao Sheng","Zhiqiang Shen","Gang Xiao","Longguang Wang","Yulan Guo","Hehe Fan"],"pdf_url":"https://arxiv.org/pdf/2308.09247v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09245v1","updated":"2023-08-18T02:12:54Z","published":"2023-08-18T02:12:54Z","title":"Masked Spatio-Temporal Structure Prediction for Self-supervised Learning\n on Point Cloud Videos","summary":" Recently, the community has made tremendous progress in developing effective\nmethods for point cloud video understanding that learn from massive amounts of\nlabeled data. However, annotating point cloud videos is usually notoriously\nexpensive. Moreover, training via one or only a few traditional tasks (e.g.,\nclassification) may be insufficient to learn subtle details of the\nspatio-temporal structure existing in point cloud videos. In this paper, we\npropose a Masked Spatio-Temporal Structure Prediction (MaST-Pre) method to\ncapture the structure of point cloud videos without human annotations. MaST-Pre\nis based on spatio-temporal point-tube masking and consists of two\nself-supervised learning tasks. First, by reconstructing masked point tubes,\nour method is able to capture the appearance information of point cloud videos.\nSecond, to learn motion, we propose a temporal cardinality difference\nprediction task that estimates the change in the number of points within a\npoint tube. In this way, MaST-Pre is forced to model the spatial and temporal\nstructure in point cloud videos. Extensive experiments on MSRAction-3D,\nNTU-RGBD, NvGesture, and SHREC'17 demonstrate the effectiveness of the proposed\nmethod.\n","authors":["Zhiqiang Shen","Xiaoxiao Sheng","Hehe Fan","Longguang Wang","Yulan Guo","Qiong Liu","Hao Wen","Xi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09245v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09244v1","updated":"2023-08-18T02:11:01Z","published":"2023-08-18T02:11:01Z","title":"SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera\n Videos","summary":" Camera-based 3D object detection in BEV (Bird's Eye View) space has drawn\ngreat attention over the past few years. Dense detectors typically follow a\ntwo-stage pipeline by first constructing a dense BEV feature and then\nperforming object detection in BEV space, which suffers from complex view\ntransformations and high computation cost. On the other side, sparse detectors\nfollow a query-based paradigm without explicit dense BEV feature construction,\nbut achieve worse performance than the dense counterparts. In this paper, we\nfind that the key to mitigate this performance gap is the adaptability of the\ndetector in both BEV and image space. 
To achieve this goal, we propose\nSparseBEV, a fully sparse 3D object detector that outperforms the dense\ncounterparts. SparseBEV contains three key designs, which are (1)\nscale-adaptive self attention to aggregate features with adaptive receptive\nfield in BEV space, (2) adaptive spatio-temporal sampling to generate sampling\nlocations under the guidance of queries, and (3) adaptive mixing to decode the\nsampled features with dynamic weights from the queries. On the test split of\nnuScenes, SparseBEV achieves the state-of-the-art performance of 67.5 NDS. On\nthe val split, SparseBEV achieves 55.8 NDS while maintaining a real-time\ninference speed of 23.5 FPS. Code is available at\nhttps://github.com/MCG-NJU/SparseBEV.\n","authors":["Haisong Liu","Yao Teng","Tao Lu","Haiguang Wang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09244v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09242v1","updated":"2023-08-18T02:06:49Z","published":"2023-08-18T02:06:49Z","title":"ASAG: Building Strong One-Decoder-Layer Sparse Detectors via Adaptive\n Sparse Anchor Generation","summary":" Recent sparse detectors with multiple, e.g. six, decoder layers achieve\npromising performance but much inference time due to complex heads. Previous\nworks have explored using dense priors as initialization and built\none-decoder-layer detectors. Although they gain remarkable acceleration, their\nperformance still lags behind their six-decoder-layer counterparts by a large\nmargin. In this work, we aim to bridge this performance gap while retaining\nfast speed. We find that the architecture discrepancy between dense and sparse\ndetectors leads to feature conflict, hampering the performance of\none-decoder-layer detectors. Thus we propose Adaptive Sparse Anchor Generator\n(ASAG) which predicts dynamic anchors on patches rather than grids in a sparse\nway so that it alleviates the feature conflict problem. For each image, ASAG\ndynamically selects which feature maps and which locations to predict, forming\na fully adaptive way to generate image-specific anchors. Further, a simple and\neffective Query Weighting method eases the training instability from\nadaptiveness. Extensive experiments show that our method outperforms\ndense-initialized ones and achieves a better speed-accuracy trade-off. The code\nis available at \\url{https://github.com/iSEE-Laboratory/ASAG}.\n","authors":["Shenghao Fu","Junkai Yan","Yipeng Gao","Xiaohua Xie","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.09242v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09238v1","updated":"2023-08-18T01:53:47Z","published":"2023-08-18T01:53:47Z","title":"Improving Buoy Detection with Deep Transfer Learning for Mussel Farm\n Automation","summary":" The aquaculture sector in New Zealand is experiencing rapid expansion, with a\nparticular emphasis on mussel exports. As the demands of mussel farming\noperations continue to evolve, the integration of artificial intelligence and\ncomputer vision techniques, such as intelligent object detection, is emerging\nas an effective approach to enhance operational efficiency. This study delves\ninto advancing buoy detection by leveraging deep learning methodologies for\nintelligent mussel farm monitoring and management. The primary objective\ncenters on improving accuracy and robustness in detecting buoys across a\nspectrum of real-world scenarios. 
A diverse dataset sourced from mussel farms\nis captured and labeled for training, encompassing imagery taken from cameras\nmounted on both floating platforms and traversing vessels, capturing various\nlighting and weather conditions. To establish an effective deep learning model\nfor buoy detection with a limited number of labeled data, we employ transfer\nlearning techniques. This involves adapting a pre-trained object detection\nmodel to create a specialized deep learning buoy detection model. We explore\ndifferent pre-trained models, including YOLO and its variants, alongside data\ndiversity to investigate their effects on model performance. Our investigation\ndemonstrates a significant enhancement in buoy detection performance through\ndeep learning, accompanied by improved generalization across diverse weather\nconditions, highlighting the practical effectiveness of our approach.\n","authors":["Carl McMillan","Junhong Zhao","Bing Xue","Ross Vennell","Mengjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09238v1.pdf","comment":"7 pages, 5 figures, submitted to ICVNZ 2023 conference\n https://ivcnz2023.massey.ac.nz/"},{"id":"http://arxiv.org/abs/2303.09650v2","updated":"2023-08-18T01:52:34Z","published":"2023-03-16T21:06:13Z","title":"Iterative Soft Shrinkage Learning for Efficient Image Super-Resolution","summary":" Image super-resolution (SR) has witnessed extensive neural network designs\nfrom CNN to transformer architectures. However, prevailing SR models suffer\nfrom prohibitive memory footprint and intensive computations, which limits\nfurther deployment on edge devices. This work investigates the potential of\nnetwork pruning for super-resolution to take advantage of off-the-shelf network\ndesigns and reduce the underlying computational overhead. Two main challenges\nremain in applying pruning methods for SR. First, the widely-used filter\npruning technique reflects limited granularity and restricted adaptability to\ndiverse network structures. Second, existing pruning methods generally operate\nupon a pre-trained network for the sparse structure determination, hard to get\nrid of dense model training in the traditional SR paradigm. To address these\nchallenges, we adopt unstructured pruning with sparse models directly trained\nfrom scratch. Specifically, we propose a novel Iterative Soft\nShrinkage-Percentage (ISS-P) method by optimizing the sparse structure of a\nrandomly initialized network at each iteration and tweaking unimportant weights\nwith a small amount proportional to the magnitude scale on-the-fly. We observe\nthat the proposed ISS-P can dynamically learn sparse structures adapting to the\noptimization process and preserve the sparse model's trainability by yielding a\nmore regularized gradient throughput. Experiments on benchmark datasets\ndemonstrate the effectiveness of the proposed ISS-P over diverse network\narchitectures. 
Code is available at\nhttps://github.com/Jiamian-Wang/Iterative-Soft-Shrinkage-SR\n","authors":["Jiamian Wang","Huan Wang","Yulun Zhang","Yun Fu","Zhiqiang Tao"],"pdf_url":"https://arxiv.org/pdf/2303.09650v2.pdf","comment":"Accepted by ICCV 2023, code released at\n https://github.com/Jiamian-Wang/Iterative-Soft-Shrinkage-SR"},{"id":"http://arxiv.org/abs/2308.09234v1","updated":"2023-08-18T01:44:54Z","published":"2023-08-18T01:44:54Z","title":"Deep Boosting Multi-Modal Ensemble Face Recognition with Sample-Level\n Weighting","summary":" Deep convolutional neural networks have achieved remarkable success in face\nrecognition (FR), partly due to the abundant data availability. However, the\ncurrent training benchmarks exhibit an imbalanced quality distribution; most\nimages are of high quality. This poses issues for generalization on hard\nsamples since they are underrepresented during training. In this work, we\nemploy the multi-model boosting technique to deal with this issue. Inspired by\nthe well-known AdaBoost, we propose a sample-level weighting approach to\nincorporate the importance of different samples into the FR loss. Individual\nmodels of the proposed framework are experts at distinct levels of sample\nhardness. Therefore, the combination of models leads to a robust feature\nextractor without losing the discriminability on the easy samples. Also, for\nincorporating the sample hardness into the training criterion, we analytically\nshow the effect of sample mining on the important aspects of current angular\nmargin loss functions, i.e., margin and scale. The proposed method shows\nsuperior performance in comparison with the state-of-the-art algorithms in\nextensive experiments on the CFP-FP, LFW, CPLFW, CALFW, AgeDB, TinyFace, IJB-B,\nand IJB-C evaluation datasets.\n","authors":["Sahar Rahimi Malakshan","Mohammad Saeed Ebrahimi Saadabadi","Nima Najafzadeh","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2308.09234v1.pdf","comment":"2023 IEEE International Joint Conference on Biometrics (IJCB)"},{"id":"http://arxiv.org/abs/2308.08316v2","updated":"2023-08-18T01:31:24Z","published":"2023-08-16T12:22:29Z","title":"Dual-Stream Diffusion Net for Text-to-Video Generation","summary":" With the emergence of diffusion models, text-to-video generation has\nrecently attracted increasing attention. But an important bottleneck therein is that\ngenerated videos often tend to carry some flickers and artifacts. In this\nwork, we propose a dual-stream diffusion net (DSDN) to improve the consistency\nof content variations in generating videos. In particular, the two designed\ndiffusion streams, video content and motion branches, could not only run\nseparately in their private spaces for producing personalized video variations\nas well as content, but also be well-aligned between the content and motion\ndomains through leveraging our designed cross-transformer interaction module,\nwhich would benefit the smoothness of generated videos. Besides, we also\nintroduce a motion decomposer and combiner to facilitate the operation on video\nmotion. 
Qualitative and quantitative experiments demonstrate that our method\ncould produce amazing continuous videos with fewer flickers.\n","authors":["Binhui Liu","Xin Liu","Anbo Dai","Zhiyong Zeng","Zhen Cui","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2308.08316v2.pdf","comment":"8pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.09230v1","updated":"2023-08-18T01:24:52Z","published":"2023-08-18T01:24:52Z","title":"CCFace: Classification Consistency for Low-Resolution Face Recognition","summary":" In recent years, deep face recognition methods have demonstrated impressive\nresults on in-the-wild datasets. However, these methods have shown a\nsignificant decline in performance when applied to real-world low-resolution\nbenchmarks like TinyFace or SCFace. To address this challenge, we propose a\nnovel classification consistency knowledge distillation approach that transfers\nthe learned classifier from a high-resolution model to a low-resolution\nnetwork. This approach helps in finding discriminative representations for\nlow-resolution instances. To further improve the performance, we designed a\nknowledge distillation loss using the adaptive angular penalty inspired by the\nsuccess of the popular angular margin loss function. The adaptive penalty\nreduces overfitting on low-resolution samples and alleviates the convergence\nissue of the model integrated with data augmentation. Additionally, we utilize\nan asymmetric cross-resolution learning approach based on the state-of-the-art\nsemi-supervised representation learning paradigm to improve discriminability on\nlow-resolution instances and prevent them from forming a cluster. Our proposed\nmethod outperforms state-of-the-art approaches on low-resolution benchmarks,\nwith a three percent improvement on TinyFace while maintaining performance on\nhigh-resolution benchmarks.\n","authors":["Mohammad Saeed Ebrahimi Saadabadi","Sahar Rahimi Malakshan","Hossein Kashiani","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2308.09230v1.pdf","comment":"2023 IEEE International Joint Conference on Biometrics (IJCB)"},{"id":"http://arxiv.org/abs/2303.04991v2","updated":"2023-08-18T01:20:35Z","published":"2023-03-09T02:24:30Z","title":"Deformer: Dynamic Fusion Transformer for Robust Hand Pose Estimation","summary":" Accurately estimating 3D hand pose is crucial for understanding how humans\ninteract with the world. Despite remarkable progress, existing methods often\nstruggle to generate plausible hand poses when the hand is heavily occluded or\nblurred. In videos, the movements of the hand allow us to observe various parts\nof the hand that may be occluded or blurred in a single frame. To adaptively\nleverage the visual clue before and after the occlusion or blurring for robust\nhand pose estimation, we propose the Deformer: a framework that implicitly\nreasons about the relationship between hand parts within the same image\n(spatial dimension) and different timesteps (temporal dimension). We show that\na naive application of the transformer self-attention mechanism is not\nsufficient because motion blur or occlusions in certain frames can lead to\nheavily distorted hand features and generate imprecise keys and queries. 
To\naddress this challenge, we incorporate a Dynamic Fusion Module into Deformer,\nwhich predicts the deformation of the hand and warps the hand mesh predictions\nfrom nearby frames to explicitly support the current frame estimation.\nFurthermore, we have observed that errors are unevenly distributed across\ndifferent hand parts, with vertices around fingertips having disproportionately\nhigher errors than those around the palm. We mitigate this issue by introducing\na new loss function called maxMSE that automatically adjusts the weight of\nevery vertex to focus the model on critical hand parts. Extensive experiments\nshow that our method significantly outperforms state-of-the-art methods by 10%,\nand is more robust to occlusions (over 14%).\n","authors":["Qichen Fu","Xingyu Liu","Ran Xu","Juan Carlos Niebles","Kris M. Kitani"],"pdf_url":"https://arxiv.org/pdf/2303.04991v2.pdf","comment":"In ICCV 2023. Project: https://fuqichen1998.github.io/Deformer/"},{"id":"http://arxiv.org/abs/2308.09228v1","updated":"2023-08-18T01:20:25Z","published":"2023-08-18T01:20:25Z","title":"Generalized Sum Pooling for Metric Learning","summary":" A common architectural choice for deep metric learning is a convolutional\nneural network followed by global average pooling (GAP). Albeit simple, GAP is\na highly effective way to aggregate information. One possible explanation for\nthe effectiveness of GAP is considering each feature vector as representing a\ndifferent semantic entity and GAP as a convex combination of them. Following\nthis perspective, we generalize GAP and propose a learnable generalized sum\npooling method (GSP). GSP improves GAP with two distinct abilities: i) the\nability to choose a subset of semantic entities, effectively learning to ignore\nnuisance information, and ii) learning the weights corresponding to the\nimportance of each entity. Formally, we propose an entropy-smoothed optimal\ntransport problem and show that it is a strict generalization of GAP, i.e., a\nspecific realization of the problem gives back GAP. We show that this\noptimization problem enjoys analytical gradients enabling us to use it as a\ndirect learnable replacement for GAP. We further propose a zero-shot loss to\nease the learning of GSP. We show the effectiveness of our method with\nextensive evaluations on 4 popular metric learning benchmarks. Code is\navailable at: GSP-DML Framework\n","authors":["Yeti Z. Gurbuz","Ozan Sener","A. Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2308.09228v1.pdf","comment":"Accepted as a conference paper at International Conference on\n Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.09223v1","updated":"2023-08-18T00:48:30Z","published":"2023-08-18T00:48:30Z","title":"DMCVR: Morphology-Guided Diffusion Model for 3D Cardiac Volume\n Reconstruction","summary":" Accurate 3D cardiac reconstruction from cine magnetic resonance imaging\n(cMRI) is crucial for improved cardiovascular disease diagnosis and\nunderstanding of the heart's motion. However, current cardiac MRI-based\nreconstruction technology used in clinical settings is 2D with limited\nthrough-plane resolution, resulting in low-quality reconstructed cardiac\nvolumes. To better reconstruct 3D cardiac volumes from sparse 2D image stacks,\nwe propose a morphology-guided diffusion model for 3D cardiac volume\nreconstruction, DMCVR, that synthesizes high-resolution 2D images and\ncorresponding 3D reconstructed volumes. 
Our method outperforms previous\napproaches by conditioning the cardiac morphology on the generative model,\neliminating the time-consuming iterative optimization process of the latent\ncode, and improving generation quality. The learned latent spaces provide\nglobal semantics, local cardiac morphology and details of each 2D cMRI slice\nwith highly interpretable value to reconstruct 3D cardiac shape. Our\nexperiments show that DMCVR is highly effective in several aspects, such as 2D\ngeneration and 3D reconstruction performance. With DMCVR, we can produce\nhigh-resolution 3D cardiac MRI reconstructions, surpassing current techniques.\nOur proposed framework has great potential for improving the accuracy of\ncardiac disease diagnosis and treatment planning. Code can be accessed at\nhttps://github.com/hexiaoxiao-cs/DMCVR.\n","authors":["Xiaoxiao He","Chaowei Tan","Ligong Han","Bo Liu","Leon Axel","Kang Li","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2308.09223v1.pdf","comment":"Accepted in MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.09221v1","updated":"2023-08-18T00:40:42Z","published":"2023-08-18T00:40:42Z","title":"A review of technical factors to consider when designing neural networks\n for semantic segmentation of Earth Observation imagery","summary":" Semantic segmentation (classification) of Earth Observation imagery is a\ncrucial task in remote sensing. This paper presents a comprehensive review of\ntechnical factors to consider when designing neural networks for this purpose.\nThe review focuses on Convolutional Neural Networks (CNNs), Recurrent Neural\nNetworks (RNNs), Generative Adversarial Networks (GANs), and transformer\nmodels, discussing prominent design patterns for these ANN families and their\nimplications for semantic segmentation. Common pre-processing techniques for\nensuring optimal data preparation are also covered. These include methods for\nimage normalization and chipping, as well as strategies for addressing data\nimbalance in training samples, and techniques for overcoming limited data,\nincluding augmentation techniques, transfer learning, and domain adaptation. By\nencompassing both the technical aspects of neural network design and the\ndata-related considerations, this review provides researchers and practitioners\nwith a comprehensive and up-to-date understanding of the factors involved in\ndesigning effective neural networks for semantic segmentation of Earth\nObservation imagery.\n","authors":["Sam Khallaghi","J. Ronald Eastman","Lyndon D. Estes"],"pdf_url":"https://arxiv.org/pdf/2308.09221v1.pdf","comment":"145 pages with 32 figures"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.09649v1","updated":"2023-08-18T16:10:13Z","published":"2023-08-18T16:10:13Z","title":"MUSE: Music Recommender System with Shuffle Play Recommendation\n Enhancement","summary":" Recommender systems have become indispensable in music streaming services,\nenhancing user experiences by personalizing playlists and facilitating the\nserendipitous discovery of new music. However, the existing recommender systems\noverlook the unique challenges inherent in the music domain, specifically\nshuffle play, which provides subsequent tracks in a random sequence. Based on\nour observation that the shuffle play sessions hinder the overall training\nprocess of music recommender systems mainly due to the high unique transition\nrates of shuffle play sessions, we propose a Music Recommender System with\nShuffle Play Recommendation Enhancement (MUSE). 
MUSE employs the\nself-supervised learning framework that maximizes the agreement between the\noriginal session and the augmented session, which is augmented by our novel\nsession augmentation method, called transition-based augmentation. To further\nfacilitate the alignment of the representations between the two views, we\ndevise two fine-grained matching strategies, i.e., item- and similarity-based\nmatching strategies. Through rigorous experiments conducted across diverse\nenvironments, we demonstrate MUSE's efficacy over 12 baseline models on a\nlarge-scale Music Streaming Sessions Dataset (MSSD) from Spotify. The source\ncode of MUSE is available at \\url{https://github.com/yunhak0/MUSE}.\n","authors":["Yunhak Oh","Sukwon Yun","Dongmin Hyun","Sein Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2308.09649v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.09516v1","updated":"2023-08-18T12:49:25Z","published":"2023-08-18T12:49:25Z","title":"ReCon: Reducing Congestion in Job Recommendation using Optimal Transport","summary":" Recommender systems may suffer from congestion, meaning that there is an\nunequal distribution of the items in how often they are recommended. Some items\nmay be recommended much more than others. Recommenders are increasingly used in\ndomains where items have limited availability, such as the job market, where\ncongestion is especially problematic: Recommending a vacancy -- for which\ntypically only one person will be hired -- to a large number of job seekers may\nlead to frustration for job seekers, as they may be applying for jobs where\nthey are not hired. This may also leave vacancies unfilled and result in job\nmarket inefficiency.\n We propose a novel approach to job recommendation called ReCon, accounting\nfor the congestion problem. Our approach is to use an optimal transport\ncomponent to ensure a more equal spread of vacancies over job seekers, combined\nwith a job recommendation model in a multi-objective optimization problem. We\nevaluated our approach on two real-world job market datasets. The evaluation\nresults show that ReCon has good performance on both congestion-related (e.g.,\nCongestion) and desirability (e.g., NDCG) measures.\n","authors":["Yoosof Mashayekhi","Bo Kang","Jefrey Lijffijt","Tijl De Bie"],"pdf_url":"https://arxiv.org/pdf/2308.09516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09419v1","updated":"2023-08-18T09:38:57Z","published":"2023-08-18T09:38:57Z","title":"Attention Calibration for Transformer-based Sequential Recommendation","summary":" Transformer-based sequential recommendation (SR) has been booming in recent\nyears, with the self-attention mechanism as its key component. Self-attention\nhas been widely believed to be able to effectively select those informative and\nrelevant items from a sequence of interacted items for next-item prediction via\nlearning larger attention weights for these items. However, this may not always\nbe true in reality. Our empirical analysis of some representative\nTransformer-based SR models reveals that it is not uncommon for large attention\nweights to be assigned to less relevant items, which can result in inaccurate\nrecommendations. Through further in-depth analysis, we find two factors that\nmay contribute to such inaccurate assignment of attention weights: sub-optimal\nposition encoding and noisy input. To this end, in this paper, we aim to\naddress this significant yet challenging gap in existing works. 
To be specific,\nwe propose a simple yet effective framework called Attention Calibration for\nTransformer-based Sequential Recommendation (AC-TSR). In AC-TSR, a novel\nspatial calibrator and adversarial calibrator are designed respectively to\ndirectly calibrate those incorrectly assigned attention weights. The former is\ndevised to explicitly capture the spatial relationships (i.e., order and\ndistance) among items for more precise calculation of attention weights. The\nlatter aims to redistribute the attention weights based on each item's\ncontribution to the next-item prediction. AC-TSR is readily adaptable and can\nbe seamlessly integrated into various existing transformer-based SR models.\nExtensive experimental results on four benchmark real-world datasets\ndemonstrate the superiority of our proposed AC-TSR via significant\nrecommendation performance enhancements. The source code is available at\nhttps://github.com/AIM-SE/AC-TSR.\n","authors":["Peilin Zhou","Qichen Ye","Yueqi Xie","Jingqi Gao","Shoujin Wang","Jae Boum Kim","Chenyu You","Sunghun Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09419v1.pdf","comment":"Accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2308.09395v1","updated":"2023-08-18T08:52:29Z","published":"2023-08-18T08:52:29Z","title":"SHARK: A Lightweight Model Compression Approach for Large-scale\n Recommender Systems","summary":" Increasing the size of embedding layers has been shown to be effective in\nimproving the performance of recommendation models, yet gradually causing their\nsizes to exceed terabytes in industrial recommender systems, and hence the\nincrease of computing and storage costs. To save resources while maintaining\nmodel performances, we propose SHARK, the model compression practice we have\nsummarized in the recommender system of industrial scenarios. SHARK consists of\ntwo main components. First, we use the novel first-order component of Taylor\nexpansion as importance scores to prune the number of embedding tables (feature\nfields). Second, we introduce a new row-wise quantization method to apply\ndifferent quantization strategies to each embedding. We conduct extensive\nexperiments on both public and industrial datasets, demonstrating that each\ncomponent of our proposed SHARK framework outperforms previous approaches. We\nconduct A/B tests in multiple models on Kuaishou, such as short video,\ne-commerce, and advertising recommendation models. The results of the online\nA/B test showed that SHARK can effectively reduce the memory footprint of the\nembedded layer. For the short-video scenarios, the compressed model, without any\nperformance drop, significantly saves 70% storage and thousands of machines,\nimproves queries per second (QPS) by 30\%, and has been deployed to serve hundreds\nof millions of users and process tens of billions of requests every day.\n","authors":["Beichuan Zhang","Chenggen Sun","Jianchao Tan","Xinjun Cai","Jun Zhao","Mengqi Miao","Kang Yin","Chengru Song","Na Mou","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2308.09395v1.pdf","comment":"accepted by cikm 2023"},{"id":"http://arxiv.org/abs/2308.03333v2","updated":"2023-08-18T07:05:10Z","published":"2023-08-07T06:29:20Z","title":"Heterogeneous Knowledge Fusion: A Novel Approach for Personalized\n Recommendation via LLM","summary":" The analysis and mining of user heterogeneous behavior are of paramount\nimportance in recommendation systems. 
However, the conventional approach of\nincorporating various types of heterogeneous behavior into recommendation\nmodels leads to feature sparsity and knowledge fragmentation issues. To address\nthis challenge, we propose a novel approach for personalized recommendation via\nLarge Language Model (LLM), by extracting and fusing heterogeneous knowledge\nfrom user heterogeneous behavior information. In addition, by combining\nheterogeneous knowledge and recommendation tasks, instruction tuning is\nperformed on LLM for personalized recommendations. The experimental results\ndemonstrate that our method can effectively integrate user heterogeneous\nbehavior and significantly improve recommendation performance.\n","authors":["Bin Yin","Junjie Xie","Yu Qin","Zixiang Ding","Zhichao Feng","Xiang Li","Wei Lin"],"pdf_url":"https://arxiv.org/pdf/2308.03333v2.pdf","comment":"Accepted at RecSys 2023"},{"id":"http://arxiv.org/abs/2308.09340v1","updated":"2023-08-18T06:52:07Z","published":"2023-08-18T06:52:07Z","title":"How Discriminative Are Your Qrels? How To Study the Statistical\n Significance of Document Adjudication Methods","summary":" Creating test collections for offline retrieval evaluation requires human\neffort to judge documents' relevance. This expensive activity motivated much\nwork in developing methods for constructing benchmarks with fewer assessment\ncosts. In this respect, adjudication methods actively decide both which\ndocuments and the order in which experts review them, in order to better\nexploit the assessment budget or to lower it. Researchers evaluate the quality\nof those methods by measuring the correlation between the known gold ranking of\nsystems under the full collection and the observed ranking of systems under the\nlower-cost one. This traditional analysis ignores whether and how the low-cost\njudgements impact on the statistically significant differences among systems\nwith respect to the full collection. We fill this void by proposing a novel\nmethodology to evaluate how the low-cost adjudication methods preserve the\npairwise significant differences between systems as the full collection. In\nother terms, while traditional approaches look for stability in answering the\nquestion \"is system A better than system B?\", our proposed approach looks for\nstability in answering the question \"is system A significantly better than\nsystem B?\", which is the ultimate questions researchers need to answer to\nguarantee the generalisability of their results. Among other results, we found\nthat the best methods in terms of ranking of systems correlation do not always\nmatch those preserving statistical significance.\n","authors":["David Otero","Javier Parapar","Nicola Ferro"],"pdf_url":"https://arxiv.org/pdf/2308.09340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19860v4","updated":"2023-08-18T05:56:05Z","published":"2023-05-31T13:51:26Z","title":"A Survey on Large Language Models for Recommendation","summary":" Large Language Models (LLMs) have emerged as powerful tools in the field of\nNatural Language Processing (NLP) and have recently gained significant\nattention in the domain of Recommendation Systems (RS). These models, trained\non massive amounts of data using self-supervised learning, have demonstrated\nremarkable success in learning universal representations and have the potential\nto enhance various aspects of recommendation systems by some effective transfer\ntechniques such as fine-tuning and prompt tuning, and so on. 
The crucial aspect\nof harnessing the power of language models in enhancing recommendation quality\nis the utilization of their high-quality representations of textual features\nand their extensive coverage of external knowledge to establish correlations\nbetween items and users. To provide a comprehensive understanding of the\nexisting LLM-based recommendation systems, this survey presents a taxonomy that\ncategorizes these models into two major paradigms, respectively Discriminative\nLLM for Recommendation (DLLM4Rec) and Generative LLM for Recommendation\n(GLLM4Rec), with the latter being systematically sorted out for the first time.\nFurthermore, we systematically review and analyze existing LLM-based\nrecommendation systems within each paradigm, providing insights into their\nmethodologies, techniques, and performance. Additionally, we identify key\nchallenges and several valuable findings to provide researchers and\npractitioners with inspiration. We have also created a GitHub repository to\nindex relevant papers on LLMs for recommendation,\nhttps://github.com/WLiK/LLM4Rec.\n","authors":["Likang Wu","Zhi Zheng","Zhaopeng Qiu","Hao Wang","Hongchao Gu","Tingjia Shen","Chuan Qin","Chen Zhu","Hengshu Zhu","Qi Liu","Hui Xiong","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19860v4.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.09309v1","updated":"2023-08-18T05:07:41Z","published":"2023-08-18T05:07:41Z","title":"Meta-learning enhanced next POI recommendation by leveraging check-ins\n from auxiliary cities","summary":" Most existing point-of-interest (POI) recommenders aim to capture user\npreference by employing city-level user historical check-ins, thus facilitating\nusers' exploration of the city. However, the scarcity of city-level user\ncheck-ins brings a significant challenge to user preference learning. Although\nprior studies attempt to mitigate this challenge by exploiting various context\ninformation, e.g., spatio-temporal information, they ignore to transfer the\nknowledge (i.e., common behavioral pattern) from other relevant cities (i.e.,\nauxiliary cities). In this paper, we investigate the effect of knowledge\ndistilled from auxiliary cities and thus propose a novel Meta-learning Enhanced\nnext POI Recommendation framework (MERec). The MERec leverages the correlation\nof check-in behaviors among various cities into the meta-learning paradigm to\nhelp infer user preference in the target city, by holding the principle of\n\"paying more attention to more correlated knowledge\". Particularly, a\ncity-level correlation strategy is devised to attentively capture common\npatterns among cities, so as to transfer more relevant knowledge from more\ncorrelated cities. Extensive experiments verify the superiority of the proposed\nMERec against state-of-the-art algorithms.\n","authors":["Jinze Wang","Lu Zhang","Zhu Sun","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2308.09309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09308v1","updated":"2023-08-18T05:05:35Z","published":"2023-08-18T05:05:35Z","title":"Differentiable Retrieval Augmentation via Generative Language Modeling\n for E-commerce Query Intent Classification","summary":" Retrieval augmentation, which enhances downstream models by a knowledge\nretriever and an external corpus instead of by merely increasing the number of\nmodel parameters, has been successfully applied to many natural language\nprocessing (NLP) tasks such as text classification, question answering and so\non. 
However, existing methods that separately or asynchronously train the\nretriever and downstream model mainly due to the non-differentiability between\nthe two parts, usually lead to degraded performance compared to end-to-end\njoint training.\n","authors":["Chenyu Zhao","Yunjiang Jiang","Yiming Qiu","Han Zhang","Wen-Yun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09308v1.pdf","comment":"5 pages, 2 figures; accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2308.09292v1","updated":"2023-08-18T04:33:36Z","published":"2023-08-18T04:33:36Z","title":"Graph-based Alignment and Uniformity for Recommendation","summary":" Collaborative filtering-based recommender systems (RecSys) rely on learning\nrepresentations for users and items to predict preferences accurately.\nRepresentation learning on the hypersphere is a promising approach due to its\ndesirable properties, such as alignment and uniformity. However, the sparsity\nissue arises when it encounters RecSys. To address this issue, we propose a\nnovel approach, graph-based alignment and uniformity (GraphAU), that explicitly\nconsiders high-order connectivities in the user-item bipartite graph. GraphAU\naligns the user/item embedding to the dense vector representations of\nhigh-order neighbors using a neighborhood aggregator, eliminating the need to\ncompute the burdensome alignment to high-order neighborhoods individually. To\naddress the discrepancy in alignment losses, GraphAU includes a layer-wise\nalignment pooling module to integrate alignment losses layer-wise. Experiments\non four datasets show that GraphAU significantly alleviates the sparsity issue\nand achieves state-of-the-art performance. We open-source GraphAU at\nhttps://github.com/YangLiangwei/GraphAU.\n","authors":["Liangwei Yang","Zhiwei Liu","Chen Wang","Mingdai Yang","Xiaolong Liu","Jing Ma","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.09292v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2306.15905v2","updated":"2023-08-18T04:24:24Z","published":"2023-06-28T04:03:31Z","title":"Dimension Independent Mixup for Hard Negative Sample in Collaborative\n Filtering","summary":" Collaborative filtering (CF) is a widely employed technique that predicts\nuser preferences based on past interactions. Negative sampling plays a vital\nrole in training CF-based models with implicit feedback. In this paper, we\npropose a novel perspective based on the sampling area to revisit existing\nsampling methods. We point out that current sampling methods mainly focus on\nPoint-wise or Line-wise sampling, lacking flexibility and leaving a significant\nportion of the hard sampling area un-explored. To address this limitation, we\npropose Dimension Independent Mixup for Hard Negative Sampling (DINS), which is\nthe first Area-wise sampling method for training CF-based models. DINS\ncomprises three modules: Hard Boundary Definition, Dimension Independent Mixup,\nand Multi-hop Pooling. Experiments with real-world datasets on both matrix\nfactorization and graph-based models demonstrate that DINS outperforms other\nnegative sampling methods, establishing its effectiveness and superiority. Our\nwork contributes a new perspective, introduces Area-wise sampling, and presents\nDINS as a novel approach that achieves state-of-the-art performance for\nnegative sampling. Our implementations are available in PyTorch.\n","authors":["Xi Wu","Liangwei Yang","Jibing Gong","Chao Zhou","Tianyu Lin","Xiaolong Liu","Philip S. 
Yu"],"pdf_url":"https://arxiv.org/pdf/2306.15905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09765v1","updated":"2023-08-18T18:18:55Z","published":"2023-08-18T18:18:55Z","title":"Taken by Surprise: Contrast effect for Similarity Scores","summary":" Accurately evaluating the similarity of object vector embeddings is of\ncritical importance for natural language processing, information retrieval and\nclassification tasks. Popular similarity scores (e.g., cosine similarity) are\nbased on pairs of embedding vectors and disregard the distribution of the\nensemble from which objects are drawn. Human perception of object similarity\nsignificantly depends on the context in which the objects appear. In this work\nwe propose the \emph{surprise score}, an ensemble-normalized similarity metric\nthat encapsulates the contrast effect of human perception and significantly\nimproves the classification performance on zero- and few-shot document\nclassification tasks. This score quantifies the surprise of finding a given\nsimilarity between two elements relative to the pairwise ensemble similarities.\nWe evaluate this metric on zero/few-shot classification and clustering tasks\nand typically find 10-15\% better performance compared to raw cosine\nsimilarity. Our code is available at\nhttps://github.com/MeetElise/surprise-similarity.\n","authors":["Thomas C. Bachlechner","Mario Martone","Marjorie Schillo"],"pdf_url":"https://arxiv.org/pdf/2308.09765v1.pdf","comment":"9 pages, 2 figures and 4 tables"},{"id":"http://arxiv.org/abs/2308.10999v1","updated":"2023-08-18T13:42:41Z","published":"2023-08-18T13:42:41Z","title":"Eigenvalue-based Incremental Spectral Clustering","summary":" Our previous experiments demonstrated that subsets of collections of (short)\ndocuments (with several hundred entries) share a common (normalized in some way)\neigenvalue spectrum of the combinatorial Laplacian. Based on this insight, we\npropose a method of incremental spectral clustering. The method consists of the\nfollowing steps: (1) split the data into manageable subsets, (2) cluster each\nof the subsets, (3) merge clusters from different subsets based on the\neigenvalue spectrum similarity to form clusters of the entire set. This method\ncan be especially useful for clustering methods whose complexity strongly\nincreases with the size of the data sample, like in the case of typical spectral\nclustering. Experiments were performed showing that, in fact, clustering and\nmerging the subsets yields clusters close to clustering the entire dataset.\n","authors":["Mieczysław A. Kłopotek","Bartłomiej Starosta","Sławomir T. Wierzchoń"],"pdf_url":"https://arxiv.org/pdf/2308.10999v1.pdf","comment":"14 tables, 6 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.09711v1","updated":"2023-08-18T17:59:01Z","published":"2023-08-18T17:59:01Z","title":"Robust Monocular Depth Estimation under Challenging Conditions","summary":" While state-of-the-art monocular depth estimation approaches achieve\nimpressive results in ideal settings, they are highly unreliable under\nchallenging illumination and weather conditions, such as at nighttime or in the\npresence of rain. In this paper, we uncover these safety-critical issues and\ntackle them with md4all: a simple and effective solution that works reliably\nunder both adverse and ideal conditions, as well as for different types of\nlearning supervision. We achieve this by exploiting the efficacy of existing\nmethods under perfect settings. 
Therefore, we provide valid training signals\nindependently of what is in the input. First, we generate a set of complex\nsamples corresponding to the normal training ones. Then, we train the model by\nguiding its self- or full-supervision by feeding the generated samples and\ncomputing the standard losses on the corresponding original images. Doing so\nenables a single model to recover information across diverse conditions without\nmodifications at inference time. Extensive experiments on two challenging\npublic datasets, namely nuScenes and Oxford RobotCar, demonstrate the\neffectiveness of our techniques, outperforming prior works by a large margin in\nboth standard and challenging conditions. Source code and data are available\nat: https://md4all.github.io.\n","authors":["Stefano Gasperini","Nils Morbitzer","HyunJun Jung","Nassir Navab","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2308.09711v1.pdf","comment":"ICCV 2023. Source code and data: https://md4all.github.io"},{"id":"http://arxiv.org/abs/2308.09709v1","updated":"2023-08-18T17:58:36Z","published":"2023-08-18T17:58:36Z","title":"Neural-network quantum state study of the long-range antiferromagnetic\n Ising chain","summary":" We investigate quantum phase transitions in the transverse field Ising chain\nwith algebraically decaying long-range antiferromagnetic interactions by using\nthe variational Monte Carlo method with the restricted Boltzmann machine being\nemployed as a trial wave function ansatz. In the finite-size scaling analysis\nwith the order parameter and the second R\\'enyi entropy, we find that the\ncentral charge deviates from 1/2 at a small decay exponent $\\alpha_\\mathrm{LR}$\nin contrast to the critical exponents staying very close to the short-range\n(SR) Ising values regardless of $\\alpha_\\mathrm{LR}$ examined, supporting the\npreviously proposed scenario of conformal invariance breakdown. To identify the\nthreshold of the Ising universality and the conformal symmetry, we perform two\nadditional tests for the universal Binder ratio and the conformal field theory\n(CFT) description of the correlation function. It turns out that both indicate\na noticeable deviation from the SR Ising class at $\\alpha_\\mathrm{LR} < 2$.\nHowever, a closer look at the scaled correlation function for\n$\\alpha_\\mathrm{LR} \\ge 2$ shows a gradual change from the asymptotic line of\nthe CFT verified at $\\alpha_\\mathrm{LR} = 3$, providing a rough estimate of the\nthreshold being in the range of $2 \\lesssim \\alpha_\\mathrm{LR} < 3$.\n","authors":["Jicheol Kim","Dongkyu Kim","Dong-Hee Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05407v4","updated":"2023-08-18T17:57:13Z","published":"2022-09-12T16:59:36Z","title":"Segmenting Known Objects and Unseen Unknowns without Prior Knowledge","summary":" Panoptic segmentation methods assign a known class to each pixel given in\ninput. Even for state-of-the-art approaches, this inevitably enforces decisions\nthat systematically lead to wrong predictions for objects outside the training\ncategories. However, robustness against out-of-distribution samples and corner\ncases is crucial in safety-critical settings to avoid dangerous consequences.\nSince real-world datasets cannot contain enough data points to adequately\nsample the long tail of the underlying distribution, models must be able to\ndeal with unseen and unknown scenarios as well. Previous methods targeted this\nby re-identifying already-seen unlabeled objects. 
In this work, we propose the\nnecessary step to extend segmentation with a new setting which we term holistic\nsegmentation. Holistic segmentation aims to identify and separate objects of\nunseen, unknown categories into instances without any prior knowledge about\nthem while performing panoptic segmentation of known classes. We tackle this\nnew problem with U3HS, which finds unknowns as highly uncertain regions and\nclusters their corresponding instance-aware embeddings into individual objects.\nBy doing so, for the first time in panoptic segmentation with unknown objects,\nour U3HS is trained without unknown categories, reducing assumptions and\nleaving the settings as unconstrained as in real-life scenarios. Extensive\nexperiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate\nthe effectiveness of U3HS for this new, challenging, and assumptions-free\nsetting called holistic segmentation. Project page:\nhttps://holisticseg.github.io.\n","authors":["Stefano Gasperini","Alvaro Marcos-Ramiro","Michael Schmidt","Nassir Navab","Benjamin Busam","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2209.05407v4.pdf","comment":"ICCV 2023. Project page: https://holisticseg.github.io"},{"id":"http://arxiv.org/abs/2308.09701v1","updated":"2023-08-18T17:52:12Z","published":"2023-08-18T17:52:12Z","title":"Do you know what q-means?","summary":" Clustering is one of the most important tools for analysis of large datasets,\nand perhaps the most popular clustering algorithm is Lloyd's iteration for\n$k$-means. This iteration takes $N$ vectors $v_1,\\dots,v_N\\in\\mathbb{R}^d$ and\noutputs $k$ centroids $c_1,\\dots,c_k\\in\\mathbb{R}^d$; these partition the\nvectors into clusters based on which centroid is closest to a particular\nvector. We present an overall improved version of the \"$q$-means\" algorithm,\nthe quantum algorithm originally proposed by Kerenidis, Landman, Luongo, and\nPrakash (2019) which performs $\\varepsilon$-$k$-means, an approximate version\nof $k$-means clustering. This algorithm does not rely on the quantum linear\nalgebra primitives of prior work, instead only using its QRAM to prepare and\nmeasure simple states based on the current iteration's clusters. The time\ncomplexity is $O\\big(\\frac{k^{2}}{\\varepsilon^2}(\\sqrt{k}d + \\log(Nd))\\big)$\nand maintains the polylogarithmic dependence on $N$ while improving the\ndependence on most of the other parameters. We also present a \"dequantized\"\nalgorithm for $\\varepsilon$-$k$-means which runs in\n$O\\big(\\frac{k^{2}}{\\varepsilon^2}(kd + \\log(Nd))\\big)$ time. Notably, this\nclassical algorithm matches the polylogarithmic dependence on $N$ attained by\nthe quantum algorithms.\n","authors":["João F. Doriguello","Alessandro Luongo","Ewin Tang"],"pdf_url":"https://arxiv.org/pdf/2308.09701v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2304.11726v2","updated":"2023-08-18T17:46:38Z","published":"2023-04-23T19:00:17Z","title":"End-to-End Feasible Optimization Proxies for Large-Scale Economic\n Dispatch","summary":" The paper proposes a novel End-to-End Learning and Repair (E2ELR)\narchitecture for training optimization proxies for economic dispatch problems.\nE2ELR combines deep neural networks with closed-form, differentiable repair\nlayers, thereby integrating learning and feasibility in an end-to-end fashion.\nE2ELR is also trained with self-supervised learning, removing the need for\nlabeled data and the solving of numerous optimization problems offline. 
E2ELR\nis evaluated on industry-size power grids with tens of thousands of buses using\nan economic dispatch that co-optimizes energy and reserves. The results\ndemonstrate that the self-supervised E2ELR achieves state-of-the-art\nperformance, with optimality gaps that outperform other baselines by at least\nan order of magnitude.\n","authors":["Wenbo Chen","Mathieu Tanneau","Pascal Van Hentenryck"],"pdf_url":"https://arxiv.org/pdf/2304.11726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09693v1","updated":"2023-08-18T17:41:39Z","published":"2023-08-18T17:41:39Z","title":"A Lightweight Transformer for Faster and Robust EBSD Data Collection","summary":" Three dimensional electron back-scattered diffraction (EBSD) microscopy is a\ncritical tool in many applications in materials science, yet its data quality\ncan fluctuate greatly during the arduous collection process, particularly via\nserial-sectioning. Fortunately, 3D EBSD data is inherently sequential, opening\nup the opportunity to use transformers, state-of-the-art deep learning\narchitectures that have made breakthroughs in a plethora of domains, for data\nprocessing and recovery. To be more robust to errors and accelerate this 3D\nEBSD data collection, we introduce a two step method that recovers missing\nslices in an 3D EBSD volume, using an efficient transformer model and a\nprojection algorithm to process the transformer's outputs. Overcoming the\ncomputational and practical hurdles of deep learning with scarce high\ndimensional data, we train this model using only synthetic 3D EBSD data with\nself-supervision and obtain superior recovery accuracy on real 3D EBSD data,\ncompared to existing methods.\n","authors":["Harry Dong","Sean Donegan","Megna Shah","Yuejie Chi"],"pdf_url":"https://arxiv.org/pdf/2308.09693v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09691v1","updated":"2023-08-18T17:38:00Z","published":"2023-08-18T17:38:00Z","title":"Reduced Order Modeling of a MOOSE-based Advanced Manufacturing Model\n with Operator Learning","summary":" Advanced Manufacturing (AM) has gained significant interest in the nuclear\ncommunity for its potential application on nuclear materials. One challenge is\nto obtain desired material properties via controlling the manufacturing process\nduring runtime. Intelligent AM based on deep reinforcement learning (DRL)\nrelies on an automated process-level control mechanism to generate optimal\ndesign variables and adaptive system settings for improved end-product\nproperties. A high-fidelity thermo-mechanical model for direct energy\ndeposition has recently been developed within the MOOSE framework at the Idaho\nNational Laboratory (INL). The goal of this work is to develop an accurate and\nfast-running reduced order model (ROM) for this MOOSE-based AM model that can\nbe used in a DRL-based process control and optimization method. Operator\nlearning (OL)-based methods will be employed due to their capability to learn a\nfamily of differential equations, in this work, produced by changing process\nvariables in the Gaussian point heat source for the laser. We will develop\nOL-based ROM using Fourier neural operator, and perform a benchmark comparison\nof its performance with a conventional deep neural network-based ROM.\n","authors":["Mahmoud Yaseen","Dewen Yushu","Peter German","Xu Wu"],"pdf_url":"https://arxiv.org/pdf/2308.09691v1.pdf","comment":"10 Pages, 7 Figures, 2 Tables. 
arXiv admin note: text overlap with\n arXiv:2308.02462"},{"id":"http://arxiv.org/abs/2306.07622v2","updated":"2023-08-18T17:33:15Z","published":"2023-06-13T08:43:13Z","title":"Human-Like Intuitive Behavior and Reasoning Biases Emerged in Language\n Models -- and Disappeared in GPT-4","summary":" Large language models (LLMs) are currently at the forefront of intertwining\nAI systems with human communication and everyday life. Therefore, it is of\ngreat importance to evaluate their emerging abilities. In this study, we show\nthat LLMs, most notably GPT-3, exhibit behavior that strikingly resembles\nhuman-like intuition -- and the cognitive errors that come with it. However,\nLLMs with higher cognitive capabilities, in particular ChatGPT and GPT-4,\nlearned to avoid succumbing to these errors and perform in a hyperrational\nmanner. For our experiments, we probe LLMs with the Cognitive Reflection Test\n(CRT) as well as semantic illusions that were originally designed to\ninvestigate intuitive decision-making in humans. Moreover, we probe how sturdy\nthe inclination for intuitive-like decision-making is. Our study demonstrates\nthat investigating LLMs with methods from psychology has the potential to\nreveal otherwise unknown emergent traits.\n","authors":["Thilo Hagendorff","Sarah Fabi"],"pdf_url":"https://arxiv.org/pdf/2306.07622v2.pdf","comment":"Overlap with arXiv:2212.05206"},{"id":"http://arxiv.org/abs/2308.09687v1","updated":"2023-08-18T17:29:23Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over the state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Michal Podstawski","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09685v1","updated":"2023-08-18T17:13:45Z","published":"2023-08-18T17:13:45Z","title":"Audiovisual Moments in Time: A Large-Scale Annotated Dataset of\n Audiovisual Actions","summary":" We present Audiovisual Moments in Time (AVMIT), a large-scale dataset of\naudiovisual action events. In an extensive annotation task 11 participants\nlabelled a subset of 3-second audiovisual videos from the Moments in Time\ndataset (MIT). 
For each trial, participants assessed whether the labelled\naudiovisual action event was present and whether it was the most prominent\nfeature of the video. The dataset includes the annotation of 57,177 audiovisual\nvideos, each independently evaluated by 3 of 11 trained participants. From this\ninitial collection, we created a curated test set of 16 distinct action\nclasses, with 60 videos each (960 videos). We also offer 2 sets of pre-computed\naudiovisual feature embeddings, using VGGish/YamNet for audio data and\nVGG16/EfficientNetB0 for visual data, thereby lowering the barrier to entry for\naudiovisual DNN research. We explored the advantages of AVMIT annotations and\nfeature embeddings to improve performance on audiovisual event recognition. A\nseries of 6 Recurrent Neural Networks (RNNs) were trained on either\nAVMIT-filtered audiovisual events or modality-agnostic events from MIT, and\nthen tested on our audiovisual test set. In all RNNs, top 1 accuracy was\nincreased by 2.71-5.94\\% by training exclusively on audiovisual events, even\noutweighing a three-fold increase in training data. We anticipate that the\nnewly annotated AVMIT dataset will serve as a valuable resource for research\nand comparative experiments involving computational models and human\nparticipants, specifically when addressing research questions where audiovisual\ncorrespondence is of critical importance.\n","authors":["Michael Joannou","Pia Rotshtein","Uta Noppeney"],"pdf_url":"https://arxiv.org/pdf/2308.09685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06836v3","updated":"2023-08-18T16:33:39Z","published":"2023-07-13T15:56:23Z","title":"PC-Droid: Faster diffusion and improved quality for particle cloud\n generation","summary":" Building on the success of PC-JeDi we introduce PC-Droid, a substantially\nimproved diffusion model for the generation of jet particle clouds. By\nleveraging a new diffusion formulation, studying more recent integration\nsolvers, and training on all jet types simultaneously, we are able to achieve\nstate-of-the-art performance for all types of jets across all evaluation\nmetrics. We study the trade-off between generation speed and quality by\ncomparing two attention based architectures, as well as the potential of\nconsistency distillation to reduce the number of diffusion steps. Both the\nfaster architecture and consistency models demonstrate performance surpassing\nmany competing models, with generation time up to two orders of magnitude\nfaster than PC-JeDi and three orders of magnitude faster than Delphes.\n","authors":["Matthew Leigh","Debajyoti Sengupta","John Andrew Raine","Guillaume Quétant","Tobias Golling"],"pdf_url":"https://arxiv.org/pdf/2307.06836v3.pdf","comment":"21 pages, 8 tables, 13 figures"},{"id":"http://arxiv.org/abs/2308.09664v1","updated":"2023-08-18T16:30:57Z","published":"2023-08-18T16:30:57Z","title":"Variational optimization of the amplitude of neural-network quantum\n many-body ground states","summary":" Neural-network quantum states (NQSs), variationally optimized by combining\ntraditional methods and deep learning techniques, is a new way to find quantum\nmany-body ground states and gradually becomes a competitor of traditional\nvariational methods. However, there are still some difficulties in the\noptimization of NQSs, such as local minima, slow convergence, and sign\nstructure optimization. 
Here, we split a quantum many-body variational wave\nfunction into a multiplication of a real-valued amplitude neural network and a\nsign structure, and focus on the optimization of the amplitude network while\nkeeping the sign structure fixed. The amplitude network is a convolutional\nneural network (CNN) with residual blocks, namely a ResNet. Our method is\ntested on three typical quantum many-body systems. The obtained ground state\nenergies are lower than or comparable to those from traditional variational\nMonte Carlo (VMC) methods and density matrix renormalization group (DMRG).\nSurprisingly, for the frustrated Heisenberg $J_1$-$J_2$ model, our results are\nbetter than those of the complex-valued CNN in the literature, implying that\nthe sign structure of the complex-valued NQS is difficult to be optimized. We\nwill study the optimization of the sign structure of NQSs in the future.\n","authors":["Jia-Qi Wang","Rong-Qiang He","Zhong-Yi Lu"],"pdf_url":"https://arxiv.org/pdf/2308.09664v1.pdf","comment":"7 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.09663v1","updated":"2023-08-18T16:30:51Z","published":"2023-08-18T16:30:51Z","title":"GiGaMAE: Generalizable Graph Masked Autoencoder via Collaborative Latent\n Space Reconstruction","summary":" Self-supervised learning with masked autoencoders has recently gained\npopularity for its ability to produce effective image or textual\nrepresentations, which can be applied to various downstream tasks without\nretraining. However, we observe that the current masked autoencoder models lack\ngood generalization ability on graph data. To tackle this issue, we propose a\nnovel graph masked autoencoder framework called GiGaMAE. Different from\nexisting masked autoencoders that learn node presentations by explicitly\nreconstructing the original graph components (e.g., features or edges), in this\npaper, we propose to collaboratively reconstruct informative and integrated\nlatent embeddings. By considering embeddings encompassing graph topology and\nattribute information as reconstruction targets, our model could capture more\ngeneralized and comprehensive knowledge. Furthermore, we introduce a mutual\ninformation based reconstruction loss that enables the effective reconstruction\nof multiple targets. This learning objective allows us to differentiate between\nthe exclusive knowledge learned from a single target and common knowledge\nshared by multiple targets. We evaluate our method on three downstream tasks\nwith seven datasets as benchmarks. Extensive experiments demonstrate the\nsuperiority of GiGaMAE against state-of-the-art baselines. We hope our results\nwill shed light on the design of foundation models on graph-structured data.\nOur code is available at: https://github.com/sycny/GiGaMAE.\n","authors":["Yucheng Shi","Yushun Dong","Qiaoyu Tan","Jundong Li","Ninghao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09663v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2011.14033v6","updated":"2023-08-18T16:10:22Z","published":"2020-11-28T00:20:36Z","title":"A Tractable Online Learning Algorithm for the Multinomial Logit\n Contextual Bandit","summary":" In this paper, we consider the contextual variant of the MNL-Bandit problem.\nMore specifically, we consider a dynamic set optimization problem, where a\ndecision-maker offers a subset (assortment) of products to a consumer and\nobserves the response in every round. Consumers purchase products to maximize\ntheir utility. 
We assume that a set of attributes describe the products, and\nthe mean utility of a product is linear in the values of these attributes. We\nmodel consumer choice behavior using the widely used Multinomial Logit (MNL)\nmodel and consider the decision maker problem of dynamically learning the model\nparameters while optimizing cumulative revenue over the selling horizon $T$.\nThough this problem has attracted considerable attention in recent times, many\nexisting methods often involve solving an intractable non-convex optimization\nproblem. Their theoretical performance guarantees depend on a problem-dependent\nparameter which could be prohibitively large. In particular, existing\nalgorithms for this problem have regret bounded by $O(\\sqrt{\\kappa d T})$,\nwhere $\\kappa$ is a problem-dependent constant that can have an exponential\ndependency on the number of attributes. In this paper, we propose an optimistic\nalgorithm and show that the regret is bounded by $O(\\sqrt{dT} + \\kappa)$,\nsignificantly improving the performance over existing methods. Further, we\npropose a convex relaxation of the optimization step, which allows for\ntractable decision-making while retaining the favourable regret guarantee.\n","authors":["Priyank Agrawal","Theja Tulabandhula","Vashist Avadhanula"],"pdf_url":"https://arxiv.org/pdf/2011.14033v6.pdf","comment":"There is a technical issue on how Theorem 9 is used, an update is\n underway"},{"id":"http://arxiv.org/abs/2308.09647v1","updated":"2023-08-18T16:07:01Z","published":"2023-08-18T16:07:01Z","title":"Robust Uncertainty Quantification using Conformalised Monte Carlo\n Prediction","summary":" Deploying deep learning models in safety-critical applications remains a very\nchallenging task, mandating the provision of assurances for the dependable\noperation of these models. Uncertainty quantification (UQ) methods estimate the\nmodel's confidence per prediction, informing decision-making by considering the\neffect of randomness and model misspecification. Despite the advances of\nstate-of-the-art UQ methods, they are computationally expensive or produce\nconservative prediction sets/intervals. We introduce MC-CP, a novel hybrid UQ\nmethod that combines a new adaptive Monte Carlo (MC) dropout method with\nconformal prediction (CP). MC-CP adaptively modulates the traditional MC\ndropout at runtime to save memory and computation resources, enabling\npredictions to be consumed by CP, yielding robust prediction sets/intervals.\nThroughout comprehensive experiments, we show that MC-CP delivers significant\nimprovements over advanced UQ methods, like MC dropout, RAPS and CQR, both in\nclassification and regression benchmarks. MC-CP can be easily added to existing\nmodels, making its deployment simple.\n","authors":["Daniel Bethell","Simos Gerasimou","Radu Calinescu"],"pdf_url":"https://arxiv.org/pdf/2308.09647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10841v2","updated":"2023-08-18T16:04:12Z","published":"2023-06-19T10:40:30Z","title":"Blockchain-Enabled Federated Learning: A Reference Architecture Design,\n Implementation, and Verification","summary":" This paper presents an innovative reference architecture for\nblockchain-enabled federated learning (BCFL), a state-of-the-art approach that\namalgamates the strengths of federated learning and blockchain technology. This\nresults in a decentralized, collaborative machine learning system that respects\ndata privacy and user-controlled identity. 
Our architecture strategically\nemploys a decentralized identifier (DID)-based authentication system, allowing\nparticipants to authenticate and then gain access to the federated learning\nplatform securely using their self-sovereign DIDs, which are recorded on the\nblockchain. Ensuring robust security and efficient decentralization through the\nexecution of smart contracts is a key aspect of our approach. Moreover, our\nBCFL reference architecture provides significant extensibility, accommodating\nthe integration of various additional elements, as per specific requirements\nand use cases, thereby rendering it an adaptable solution for a wide range of\nBCFL applications. Participants can authenticate and then gain access to the\nfederated learning platform securely using their self-sovereign DIDs, which are\nsecurely recorded on the blockchain. The pivotal contribution of this study is\nthe successful implementation and validation of a realistic BCFL reference\narchitecture, marking a significant milestone in the field. We intend to make\nthe source code publicly accessible shortly, fostering further advancements and\nadaptations within the community. This research not only bridges a crucial gap\nin the current literature but also lays a solid foundation for future\nexplorations in the realm of BCFL.\n","authors":["Eunsu Goh","Daeyeol Kim","Kwangkee Lee","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2306.10841v2.pdf","comment":"14 pages, 15 figures, 3 tables"},{"id":"http://arxiv.org/abs/2307.12644v2","updated":"2023-08-18T16:03:06Z","published":"2023-07-24T09:35:47Z","title":"Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation\n of rPPG","summary":" rPPG (Remote photoplethysmography) is a technology that measures and analyzes\nBVP (Blood Volume Pulse) by using the light absorption characteristics of\nhemoglobin captured through a camera. Analyzing the measured BVP can derive\nvarious physiological signals such as heart rate, stress level, and blood\npressure, which can be applied to various applications such as telemedicine,\nremote patient monitoring, and early prediction of cardiovascular disease. rPPG\nis rapidly evolving and attracting great attention from both academia and\nindustry by providing great usability and convenience as it can measure\nbiosignals using a camera-equipped device without medical or wearable devices.\nDespite extensive efforts and advances in this field, serious challenges\nremain, including issues related to skin color, camera characteristics, ambient\nlighting, and other sources of noise and artifacts, which degrade accuracy\nperformance. We argue that fair and evaluable benchmarking is urgently required\nto overcome these challenges and make meaningful progress from both academic\nand commercial perspectives. In most existing work, models are trained, tested,\nand validated only on limited datasets. Even worse, some studies lack available\ncode or reproducibility, making it difficult to fairly evaluate and compare\nperformance. Therefore, the purpose of this study is to provide a benchmarking\nframework to evaluate various rPPG techniques across a wide range of datasets\nfor fair evaluation and comparison, including both conventional non-deep neural\nnetwork (non-DNN) and deep neural network (DNN) methods. 
GitHub URL:\nhttps://github.com/remotebiosensing/rppg\n","authors":["Dae-Yeol Kim","Eunsu Goh","KwangKee Lee","JongEui Chae","JongHyeon Mun","Junyeong Na","Chae-bong Sohn","Do-Yup Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12644v2.pdf","comment":"20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.09643v1","updated":"2023-08-18T16:01:18Z","published":"2023-08-18T16:01:18Z","title":"biquality-learn: a Python library for Biquality Learning","summary":" The democratization of Data Mining has been widely successful thanks in part\nto powerful and easy-to-use Machine Learning libraries. These libraries have\nbeen particularly tailored to tackle Supervised Learning. However, strong\nsupervision signals are scarce in practice, and practitioners must resort to\nweak supervision. In addition to weaknesses of supervision, dataset shifts are\nanother kind of phenomenon that occurs when deploying machine learning models\nin the real world. That is why Biquality Learning has been proposed as a\nmachine learning framework to design algorithms capable of handling multiple\nweaknesses of supervision and dataset shifts without assumptions on their\nnature and level by relying on the availability of a small trusted dataset\ncomposed of cleanly labeled and representative samples. Thus we propose\nbiquality-learn: a Python library for Biquality Learning with an intuitive and\nconsistent API to learn machine learning models from biquality data, with\nwell-proven algorithms, accessible and easy to use for everyone, and enabling\nresearchers to experiment in a reproducible way on biquality data.\n","authors":["Pierre Nodet","Vincent Lemaire","Alexis Bondu","Antoine Cornuéjols"],"pdf_url":"https://arxiv.org/pdf/2308.09643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09640v1","updated":"2023-08-18T15:59:55Z","published":"2023-08-18T15:59:55Z","title":"Revisiting Skin Tone Fairness in Dermatological Lesion Classification","summary":" Addressing fairness in lesion classification from dermatological images is\ncrucial due to variations in how skin diseases manifest across skin tones.\nHowever, the absence of skin tone labels in public datasets hinders building a\nfair classifier. To date, such skin tone labels have been estimated prior to\nfairness analysis in independent studies using the Individual Typology Angle\n(ITA). Briefly, ITA calculates an angle based on pixels extracted from skin\nimages taking into account the lightness and yellow-blue tints. These angles\nare then categorised into skin tones that are subsequently used to analyse\nfairness in skin cancer classification. In this work, we review and compare\nfour ITA-based approaches of skin tone classification on the ISIC18 dataset, a\ncommon benchmark for assessing skin cancer classification fairness in the\nliterature. Our analyses reveal a high disagreement among previously published\nstudies demonstrating the risks of ITA-based skin tone estimation methods.\nMoreover, we investigate the causes of such large discrepancy among these\napproaches and find that the lack of diversity in the ISIC18 dataset limits its\nuse as a testbed for fairness analysis. Finally, we recommend further research\non robust ITA estimation and diverse dataset acquisition with skin tone\nannotation to facilitate conclusive fairness assessments of artificial\nintelligence tools in dermatology. 
Our code is available at\nhttps://github.com/tkalbl/RevisitingSkinToneFairness.\n","authors":["Thorsten Kalb","Kaisar Kushibar","Celia Cintas","Karim Lekadir","Oliver Diaz","Richard Osuala"],"pdf_url":"https://arxiv.org/pdf/2308.09640v1.pdf","comment":"Accepted at 2023 MICCAI FAIMI Workshop"},{"id":"http://arxiv.org/abs/2308.09635v1","updated":"2023-08-18T15:53:40Z","published":"2023-08-18T15:53:40Z","title":"Development of a Neural Network-based Method for Improved Imputation of\n Missing Values in Time Series Data by Repurposing DataWig","summary":" Time series data are observations collected over time intervals. Successful\nanalysis of time series data captures patterns such as trends, cyclicity and\nirregularity, which are crucial for decision making in research, business, and\ngovernance. However, missing values in time series data occur often and present\nobstacles to successful analysis, thus they need to be filled with alternative\nvalues, a process called imputation. Although various approaches have been\nattempted for robust imputation of time series data, even the most advanced\nmethods still face challenges including limited scalability, poor capacity to\nhandle heterogeneous data types and inflexibility due to requiring strong\nassumptions of data missing mechanisms. Moreover, the imputation accuracy of\nthese methods still has room for improvement. In this study, I developed\ntsDataWig (time-series DataWig) by modifying DataWig, a neural network-based\nmethod that possesses the capacity to process large datasets and heterogeneous\ndata types but was designed for non-time series data imputation. Unlike the\noriginal DataWig, tsDataWig can directly handle values of time variables and\nimpute missing values in complex time series datasets. Using one simulated and\nthree different complex real-world time series datasets, I demonstrated that\ntsDataWig outperforms the original DataWig and the current state-of-the-art\nmethods for time series data imputation and potentially has broad application\ndue to not requiring strong assumptions of data missing mechanisms. This study\nprovides a valuable solution for robustly imputing missing values in\nchallenging time series datasets, which often contain millions of samples, high\ndimensional variables, and heterogeneous data types.\n","authors":["Daniel Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09635v1.pdf","comment":"16 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.09632v1","updated":"2023-08-18T15:44:45Z","published":"2023-08-18T15:44:45Z","title":"VALERIE22 -- A photorealistic, richly metadata annotated dataset of\n urban environments","summary":" The VALERIE tool pipeline is a synthetic data generator developed with the\ngoal to contribute to the understanding of domain-specific factors that\ninfluence perception performance of DNNs (deep neural networks). This work was\ncarried out under the German research project KI Absicherung in order to\ndevelop a methodology for the validation of DNNs in the context of pedestrian\ndetection in urban environments for automated driving. The VALERIE22 dataset\nwas generated with the VALERIE procedural tools pipeline providing a\nphotorealistic sensor simulation rendered from automatically synthesized\nscenes. 
The dataset provides a uniquely rich set of metadata, allowing\nextraction of specific scene and semantic features (like pixel-accurate\nocclusion rates, positions in the scene and distance + angle to the camera).\nThis enables a multitude of possible tests on the data and we hope to stimulate\nresearch on understanding the performance of DNNs. Based on a performance metric, a\ncomparison with several other publicly available datasets is provided,\ndemonstrating that VALERIE22 is one of the best-performing synthetic datasets\ncurrently available in the open domain.\n","authors":["Oliver Grau","Korbinian Hagn"],"pdf_url":"https://arxiv.org/pdf/2308.09632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09629v1","updated":"2023-08-18T15:43:31Z","published":"2023-08-18T15:43:31Z","title":"Learning Computational Efficient Bots with Costly Features","summary":" Deep reinforcement learning (DRL) techniques have become increasingly used in\nvarious fields for decision-making processes. However, a challenge that often\narises is the trade-off between both the computational efficiency of the\ndecision-making process and the ability of the learned agent to solve a\nparticular task. This is particularly critical in real-time settings such as\nvideo games where the agent needs to take relevant decisions at a very high\nfrequency, with a very limited inference time.\n In this work, we propose a generic offline learning approach where the\ncomputation cost of the input features is taken into account. We derive the\nBudgeted Decision Transformer as an extension of the Decision Transformer that\nincorporates cost constraints to limit its cost at inference. As a result, the\nmodel can dynamically choose the best input features at each timestep. We\ndemonstrate the effectiveness of our method on several tasks, including D4RL\nbenchmarks and complex 3D environments similar to those found in video games,\nand show that it can achieve similar performance while using significantly\nfewer computational resources compared to classical approaches.\n","authors":["Anthony Kobanda","Valliappan C. A.","Joshua Romoff","Ludovic Denoyer"],"pdf_url":"https://arxiv.org/pdf/2308.09629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05681v2","updated":"2023-08-18T15:34:40Z","published":"2023-08-10T16:34:20Z","title":"Hard No-Box Adversarial Attack on Skeleton-Based Human Action\n Recognition with Skeleton-Motion-Informed Gradient","summary":" Recently, methods for skeleton-based human activity recognition have been\nshown to be vulnerable to adversarial attacks. However, these attack methods\nrequire either the full knowledge of the victim (i.e. white-box attacks),\naccess to training data (i.e. transfer-based attacks) or frequent model queries\n(i.e. black-box attacks). All their requirements are highly restrictive,\nraising the question of how detrimental the vulnerability is. In this paper, we\nshow that the vulnerability indeed exists. To this end, we consider a new\nattack task: the attacker has no access to the victim model or the training\ndata or labels, where we coin the term hard no-box attack. Specifically, we\nfirst learn a motion manifold where we define an adversarial loss to compute a\nnew gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our\ngradient contains information of the motion dynamics, which is different from\nexisting gradient-based attack methods that compute the loss gradient assuming\neach dimension in the data is independent. 
The SMI gradient can augment many\ngradient-based attack methods, leading to a new family of no-box attack\nmethods. Extensive evaluation and comparison show that our method imposes a\nreal threat to existing classifiers. They also show that the SMI gradient\nimproves the transferability and imperceptibility of adversarial samples in\nboth no-box and transfer-based black-box settings.\n","authors":["Zhengzhi Lu","He Wang","Ziyi Chang","Guoan Yang","Hubert P. H. Shum"],"pdf_url":"https://arxiv.org/pdf/2308.05681v2.pdf","comment":"Camera-ready version for ICCV 2023"},{"id":"http://arxiv.org/abs/2307.11792v2","updated":"2023-08-18T15:22:21Z","published":"2023-07-20T20:45:44Z","title":"Quantum Convolutional Neural Networks with Interaction Layers for\n Classification of Classical Data","summary":" Quantum Machine Learning (QML) has come into the limelight due to the\nexceptional computational abilities of quantum computers. With the promises of\nnear error-free quantum computers in the not-so-distant future, it is important\nthat the effect of multi-qubit interactions on quantum neural networks is\nstudied extensively. This paper introduces a Quantum Convolutional Network with\nnovel Interaction layers exploiting three-qubit interactions increasing the\nnetwork's expressibility and entangling capability, for classifying both image\nand one-dimensional data. The proposed approach is tested on three publicly\navailable datasets namely MNIST, Fashion MNIST, and Iris datasets, to perform\nbinary and multiclass classifications and is found to supersede the performance\nof the existing state-of-the-art methods.\n","authors":["Jishnu Mahmud","Raisa Mashtura","Shaikh Anowarul Fattah","Mohammad Saquib"],"pdf_url":"https://arxiv.org/pdf/2307.11792v2.pdf","comment":"20 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.09612v1","updated":"2023-08-18T15:14:31Z","published":"2023-08-18T15:14:31Z","title":"Constrained Bayesian Optimization Using a Lagrange Multiplier Applied to\n Power Transistor Design","summary":" We propose a novel constrained Bayesian Optimization (BO) algorithm\noptimizing the design process of Laterally-Diffused Metal-Oxide-Semiconductor\n(LDMOS) transistors while realizing a target Breakdown Voltage (BV). We convert\nthe constrained BO problem into a conventional BO problem using a Lagrange\nmultiplier. Instead of directly optimizing the traditional Figure-of-Merit\n(FOM), we set the Lagrangian as the objective function of BO. This adaptive\nobjective function with a changeable Lagrange multiplier can address\nconstrained BO problems which have constraints that require costly evaluations,\nwithout the need for additional surrogate models to approximate constraints.\nOur algorithm enables a device designer to set the target BV in the design\nspace, and obtain a device that satisfies the optimized FOM and the target BV\nconstraint automatically. Utilizing this algorithm, we have also explored the\nphysical limits of the FOM for our devices in 30 - 50 V range within the\ndefined design space.\n","authors":["Ping-Ju Chuang","Ali Saadat","Sara Ghazvini","Hal Edwards","William G. 
Vandenberghe"],"pdf_url":"https://arxiv.org/pdf/2308.09612v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.09605v1","updated":"2023-08-18T14:58:23Z","published":"2023-08-18T14:58:23Z","title":"Solving PDEs on Spheres with Physics-Informed Convolutional Neural\n Networks","summary":" Physics-informed neural networks (PINNs) have been demonstrated to be\nefficient in solving partial differential equations (PDEs) from a variety of\nexperimental perspectives. Some recent studies have also proposed PINN\nalgorithms for PDEs on surfaces, including spheres. However, theoretical\nunderstanding of the numerical performance of PINNs, especially PINNs on\nsurfaces or manifolds, is still lacking. In this paper, we establish rigorous\nanalysis of the physics-informed convolutional neural network (PICNN) for\nsolving PDEs on the sphere. By using and improving the latest approximation\nresults of deep convolutional neural networks and spherical harmonic analysis,\nwe prove an upper bound for the approximation error with respect to the Sobolev\nnorm. Subsequently, we integrate this with innovative localization complexity\nanalysis to establish fast convergence rates for PICNN. Our theoretical results\nare also confirmed and supplemented by our experiments. In light of these\nfindings, we explore potential strategies for circumventing the curse of\ndimensionality that arises when solving high-dimensional PDEs.\n","authors":["Guanhang Lei","Zhen Lei","Lei Shi","Chenyu Zeng","Ding-Xuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09604v1","updated":"2023-08-18T14:57:21Z","published":"2023-08-18T14:57:21Z","title":"Breaking the Complexity Barrier in Compositional Minimax Optimization","summary":" Compositional minimax optimization is a pivotal yet under-explored challenge\nacross machine learning, including distributionally robust training and policy\nevaluation for reinforcement learning. Current techniques exhibit suboptimal\ncomplexity or rely heavily on large batch sizes. This paper proposes Nested\nSTOchastic Recursive Momentum (NSTORM), attaining the optimal sample complexity\nof $O(\\kappa^3/\\epsilon^3)$ for finding an $\\epsilon$-accurate solution.\nHowever, NSTORM requires low learning rates, potentially limiting\napplicability. Thus we introduce ADAptive NSTORM (ADA-NSTORM) with adaptive\nlearning rates, proving it achieves the same sample complexity while\nexperiments demonstrate greater effectiveness. Our methods match lower bounds\nfor minimax optimization without large batch requirements, validated through\nextensive experiments. This work significantly advances compositional minimax\noptimization, a crucial capability for distributional robustness and policy\nevaluation\n","authors":["Jin Liu","Xiaokang Pan","Junwen Duan","Hongdong Li","Youqi Li","Zhe Qu"],"pdf_url":"https://arxiv.org/pdf/2308.09604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08010v2","updated":"2023-08-18T14:51:30Z","published":"2023-03-14T15:57:54Z","title":"Window-Based Early-Exit Cascades for Uncertainty Estimation: When Deep\n Ensembles are More Efficient than Single Models","summary":" Deep Ensembles are a simple, reliable, and effective method of improving both\nthe predictive performance and uncertainty estimates of deep learning\napproaches. However, they are widely criticised as being computationally\nexpensive, due to the need to deploy multiple independent models. 
Recent work\nhas challenged this view, showing that for predictive accuracy, ensembles can\nbe more computationally efficient (at inference) than scaling single models\nwithin an architecture family. This is achieved by cascading ensemble members\nvia an early-exit approach. In this work, we investigate extending these\nefficiency gains to tasks related to uncertainty estimation. As many such\ntasks, e.g. selective classification, are binary classification, our key novel\ninsight is to only pass samples within a window close to the binary decision\nboundary to later cascade stages. Experiments on ImageNet-scale data across a\nnumber of network architectures and uncertainty tasks show that the proposed\nwindow-based early-exit approach is able to achieve a superior\nuncertainty-computation trade-off compared to scaling single models. For\nexample, a cascaded EfficientNet-B2 ensemble is able to achieve similar\ncoverage at 5% risk as a single EfficientNet-B4 with <30% the number of MACs.\nWe also find that cascades/ensembles give more reliable improvements on OOD\ndata vs scaling models up. Code for this work is available at:\nhttps://github.com/Guoxoug/window-early-exit.\n","authors":["Guoxuan Xia","Christos-Savvas Bouganis"],"pdf_url":"https://arxiv.org/pdf/2303.08010v2.pdf","comment":"Accepted to ICCV 2023 (camera-ready version, 9 pages)"},{"id":"http://arxiv.org/abs/2308.09596v1","updated":"2023-08-18T14:45:28Z","published":"2023-08-18T14:45:28Z","title":"Disparity, Inequality, and Accuracy Tradeoffs in Graph Neural Networks\n for Node Classification","summary":" Graph neural networks (GNNs) are increasingly used in critical human\napplications for predicting node labels in attributed graphs. Their ability to\naggregate features from nodes' neighbors for accurate classification also has\nthe capacity to exacerbate existing biases in data or to introduce new ones\ntowards members from protected demographic groups. Thus, it is imperative to\nquantify how GNNs may be biased and to what extent their harmful effects may be\nmitigated. To this end, we propose two new GNN-agnostic interventions namely,\n(i) PFR-AX which decreases the separability between nodes in protected and\nnon-protected groups, and (ii) PostProcess which updates model predictions\nbased on a blackbox policy to minimize differences between error rates across\ndemographic groups. Through a large set of experiments on four datasets, we\nframe the efficacies of our approaches (and three variants) in terms of their\nalgorithmic fairness-accuracy tradeoff and benchmark our results against three\nstrong baseline interventions on three state-of-the-art GNN models. Our results\nshow that no single intervention offers a universally optimal tradeoff, but\nPFR-AX and PostProcess provide granular control and improve model confidence\nwhen correctly predicting positive outcomes for nodes in protected groups.\n","authors":["Arpit Merchant","Carlos Castillo"],"pdf_url":"https://arxiv.org/pdf/2308.09596v1.pdf","comment":"Accepted to CIKM 2023"},{"id":"http://arxiv.org/abs/2308.02535v2","updated":"2023-08-18T14:42:58Z","published":"2023-08-01T10:02:26Z","title":"Learning to Generate Training Datasets for Robust Semantic Segmentation","summary":" Semantic segmentation techniques have shown significant progress in recent\nyears, but their robustness to real-world perturbations and data samples not\nseen during training remains a challenge, particularly in safety-critical\napplications. 
In this paper, we propose a novel approach to improve the\nrobustness of semantic segmentation techniques by leveraging the synergy\nbetween label-to-image generators and image-to-label segmentation models.\nSpecifically, we design and train Robusta, a novel robust conditional\ngenerative adversarial network to generate realistic and plausible perturbed or\noutlier images that can be used to train reliable segmentation models. We\nconduct in-depth studies of the proposed generative model, assess the\nperformance and robustness of the downstream segmentation network, and\ndemonstrate that our approach can significantly enhance the robustness of\nsemantic segmentation techniques in the face of real-world perturbations,\ndistribution shifts, and out-of-distribution samples. Our results suggest that\nthis approach could be valuable in safety-critical applications, where the\nreliability of semantic segmentation techniques is of utmost importance and\ncomes with a limited computational budget in inference. We will release our\ncode shortly.\n","authors":["Marwane Hariat","Olivier Laurent","Rémi Kazmierczak","Shihao Zhang","Andrei Bursuc","Angela Yao","Gianni Franchi"],"pdf_url":"https://arxiv.org/pdf/2308.02535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03295v2","updated":"2023-08-18T14:40:20Z","published":"2023-08-07T04:44:12Z","title":"DOMINO: Domain-invariant Hyperdimensional Classification for\n Multi-Sensor Time Series Data","summary":" With the rapid evolution of the Internet of Things, many real-world\napplications utilize heterogeneously connected sensors to capture time-series\ninformation. Edge-based machine learning (ML) methodologies are often employed\nto analyze locally collected data. However, a fundamental issue across\ndata-driven ML approaches is distribution shift. It occurs when a model is\ndeployed on a data distribution different from what it was trained on, and can\nsubstantially degrade model performance. Additionally, increasingly\nsophisticated deep neural networks (DNNs) have been proposed to capture spatial\nand temporal dependencies in multi-sensor time series data, requiring intensive\ncomputational resources beyond the capacity of today's edge devices. While\nbrain-inspired hyperdimensional computing (HDC) has been introduced as a\nlightweight solution for edge-based learning, existing HDCs are also vulnerable\nto the distribution shift challenge. In this paper, we propose DOMINO, a novel\nHDC learning framework addressing the distribution shift problem in noisy\nmulti-sensor time-series data. DOMINO leverages efficient and parallel matrix\noperations on high-dimensional space to dynamically identify and filter out\ndomain-variant dimensions. 
Our evaluation on a wide range of multi-sensor time\nseries classification tasks shows that DOMINO achieves on average 2.04% higher\naccuracy than state-of-the-art (SOTA) DNN-based domain generalization\ntechniques, and delivers 16.34x faster training and 2.89x faster inference.\nMore importantly, DOMINO performs notably better when learning from partially\nlabeled and highly imbalanced data, providing 10.93x higher robustness against\nhardware noises than SOTA DNNs.\n","authors":["Junyao Wang","Luke Chen","Mohammad Abdullah Al Faruque"],"pdf_url":"https://arxiv.org/pdf/2308.03295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.00700v2","updated":"2023-08-18T14:31:08Z","published":"2022-06-01T18:18:18Z","title":"RoCourseNet: Distributionally Robust Training of a Prediction Aware\n Recourse Model","summary":" Counterfactual (CF) explanations for machine learning (ML) models are\npreferred by end-users, as they explain the predictions of ML models by\nproviding a recourse (or contrastive) case to individuals who are adversely\nimpacted by predicted outcomes. Existing CF explanation methods generate\nrecourses under the assumption that the underlying target ML model remains\nstationary over time. However, due to commonly occurring distributional shifts\nin training data, ML models constantly get updated in practice, which might\nrender previously generated recourses invalid and diminish end-users trust in\nour algorithmic framework. To address this problem, we propose RoCourseNet, a\ntraining framework that jointly optimizes predictions and recourses that are\nrobust to future data shifts. This work contains four key contributions: (1) We\nformulate the robust recourse generation problem as a tri-level optimization\nproblem which consists of two sub-problems: (i) a bi-level problem that finds\nthe worst-case adversarial shift in the training data, and (ii) an outer\nminimization problem to generate robust recourses against this worst-case\nshift. (2) We leverage adversarial training to solve this tri-level\noptimization problem by: (i) proposing a novel virtual data shift (VDS)\nalgorithm to find worst-case shifted ML models via explicitly considering the\nworst-case data shift in the training dataset, and (ii) a block-wise coordinate\ndescent procedure to optimize for prediction and corresponding robust\nrecourses. (3) We evaluate RoCourseNet's performance on three real-world\ndatasets, and show that RoCourseNet consistently achieves more than 96% robust\nvalidity and outperforms state-of-the-art baselines by at least 10% in\ngenerating robust CF explanations. (4) Finally, we generalize the RoCourseNet\nframework to accommodate any parametric post-hoc methods for improving robust\nvalidity.\n","authors":["Hangzhi Guo","Feiran Jia","Jinghui Chen","Anna Squicciarini","Amulya Yadav"],"pdf_url":"https://arxiv.org/pdf/2206.00700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09583v1","updated":"2023-08-18T14:23:21Z","published":"2023-08-18T14:23:21Z","title":"WizardMath: Empowering Mathematical Reasoning for Large Language Models\n via Reinforced Evol-Instruct","summary":" Large language models (LLMs), such as GPT-4, have shown remarkable\nperformance in natural language processing (NLP) tasks, including challenging\nmathematical reasoning. 
However, most existing open-source models are only\npre-trained on large-scale internet data and without math-related optimization.\nIn this paper, we present WizardMath, which enhances the mathematical reasoning\nabilities of Llama-2, by applying our proposed Reinforcement Learning from\nEvol-Instruct Feedback (RLEIF) method to the domain of math. Through extensive\nexperiments on two mathematical reasoning benchmarks, namely GSM8k and MATH, we\nreveal the extraordinary capabilities of our model. WizardMath surpasses all\nother open-source LLMs by a substantial margin. Furthermore, our model even\noutperforms ChatGPT-3.5, Claude Instant-1, PaLM-2 and Minerva on GSM8k,\nsimultaneously surpasses Text-davinci-002, PaLM-1 and GPT-3 on MATH. More\ndetails and model weights are public at https://github.com/nlpxucan/WizardLM\nand https://huggingface.co/WizardLM.\n","authors":["Haipeng Luo","Qingfeng Sun","Can Xu","Pu Zhao","Jianguang Lou","Chongyang Tao","Xiubo Geng","Qingwei Lin","Shifeng Chen","Dongmei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09583v1.pdf","comment":"LLM, Mathematical Reasoning"},{"id":"http://arxiv.org/abs/2308.06668v2","updated":"2023-08-18T14:16:37Z","published":"2023-08-13T02:59:36Z","title":"Foundation Models in Smart Agriculture: Basics, Opportunities, and\n Challenges","summary":" The past decade has witnessed the rapid development of ML and DL\nmethodologies in agricultural systems, showcased by great successes in variety\nof agricultural applications. However, these conventional ML/DL models have\ncertain limitations: They heavily rely on large, costly-to-acquire labeled\ndatasets for training, require specialized expertise for development and\nmaintenance, and are mostly tailored for specific tasks, thus lacking\ngeneralizability. Recently, foundation models have demonstrated remarkable\nsuccesses in language and vision tasks across various domains. These models are\ntrained on a vast amount of data from multiple domains and modalities. Once\ntrained, they can accomplish versatile tasks with just minor fine-tuning and\nminimal task-specific labeled data. Despite their proven effectiveness and huge\npotential, there has been little exploration of applying FMs to agriculture\nfields. Therefore, this study aims to explore the potential of FMs in the field\nof smart agriculture. In particular, we present conceptual tools and technical\nbackground to facilitate the understanding of the problem space and uncover new\nresearch directions in this field. To this end, we first review recent FMs in\nthe general computer science domain and categorize them into four categories:\nlanguage FMs, vision FMs, multimodal FMs, and reinforcement learning FMs.\nSubsequently, we outline the process of developing agriculture FMs and discuss\ntheir potential applications in smart agriculture. We also discuss the unique\nchallenges associated with developing AFMs, including model training,\nvalidation, and deployment. 
Through this study, we contribute to the\nadvancement of AI in agriculture by introducing AFMs as a promising paradigm\nthat can significantly mitigate the reliance on extensive labeled datasets and\nenhance the efficiency, effectiveness, and generalization of agricultural AI\nsystems.\n","authors":["Jiajia Li","Mingle Xu","Lirong Xiang","Dong Chen","Weichao Zhuang","Xunyuan Yin","Zhaojian Li"],"pdf_url":"https://arxiv.org/pdf/2308.06668v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2203.05920v4","updated":"2023-08-18T14:16:12Z","published":"2022-03-11T13:45:42Z","title":"Generalized Bandit Regret Minimizer Framework in Imperfect Information\n Extensive-Form Game","summary":" Regret minimization methods are a powerful tool for learning approximate Nash\nequilibrium (NE) in two-player zero-sum imperfect information extensive-form\ngames (IIEGs). We consider the problem in the interactive bandit-feedback\nsetting where we don't know the dynamics of the IIEG. In general, only the\ninteractive trajectory and the reached terminal node value $v(z^t)$ are\nrevealed. To learn NE, the regret minimizer is required to estimate the\nfull-feedback loss gradient $\\ell^t$ by $v(z^t)$ and minimize the regret. In\nthis paper, we propose a generalized framework for this learning setting. It\npresents a theoretical framework for the design and the modular analysis of the\nbandit regret minimization methods. We demonstrate that the most recent bandit\nregret minimization methods can be analyzed as a particular case of our\nframework. Following this framework, we describe a novel method SIX-OMD to\nlearn approximate NE. It is model-free and extremely improves the best existing\nconvergence rate from the order of $O(\\sqrt{X B/T}+\\sqrt{Y C/T})$ to $O(\\sqrt{\nM_{\\mathcal{X}}/T} +\\sqrt{ M_{\\mathcal{Y}}/T})$. Moreover, SIX-OMD is\ncomputationally efficient as it needs to perform the current strategy and\naverage strategy updates only along the sampled trajectory.\n","authors":["Linjian Meng","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2203.05920v4.pdf","comment":"The proof of this paper includes many errors, especially for SIX-OMD,\n the regret bound of this algorithm is not right since this regret is lower\n than the lowest theoretical regret bound obtained by information theory"},{"id":"http://arxiv.org/abs/2308.09571v1","updated":"2023-08-18T14:03:34Z","published":"2023-08-18T14:03:34Z","title":"Physics-Informed Boundary Integral Networks (PIBI-Nets): A Data-Driven\n Approach for Solving Partial Differential Equations","summary":" Partial differential equations (PDEs) can describe many relevant phenomena in\ndynamical systems. In real-world applications, we commonly need to combine\nformal PDE models with (potentially noisy) observations. This is especially\nrelevant in settings where we lack information about boundary or initial\nconditions, or where we need to identify unknown model parameters. In recent\nyears, Physics-informed neural networks (PINNs) have become a popular tool for\nproblems of this kind. In high-dimensional settings, however, PINNs often\nsuffer from computational problems because they usually require dense\ncollocation points over the entire computational domain. To address this\nproblem, we present Physics-Informed Boundary Integral Networks (PIBI-Nets) as\na data-driven approach for solving PDEs in one dimension less than the original\nproblem space. 
PIBI-Nets only need collocation points at the computational\ndomain boundary, while still achieving highly accurate results, and in several\npractical settings, they clearly outperform PINNs. Exploiting elementary\nproperties of fundamental solutions of linear differential operators, we\npresent a principled and simple way to handle point sources in inverse\nproblems. We demonstrate the excellent performance of PIBI-Nets for the Laplace\nand Poisson equations, both on artificial data sets and within a real-world\napplication concerning the reconstruction of groundwater flows.\n","authors":["Monika Nagy-Huber","Volker Roth"],"pdf_url":"https://arxiv.org/pdf/2308.09571v1.pdf","comment":"Preprint. Submitted to Journal of Computational Science, Elsevier,\n for special issue \"Machine Learning and Data Assimilation for Dynamical\n Systems\""},{"id":"http://arxiv.org/abs/2308.09570v1","updated":"2023-08-18T14:02:56Z","published":"2023-08-18T14:02:56Z","title":"Investigating the Interplay between Features and Structures in Graph\n Learning","summary":" In the past, the dichotomy between homophily and heterophily has inspired\nresearch contributions toward a better understanding of Deep Graph Networks'\ninductive bias. In particular, it was believed that homophily strongly\ncorrelates with better node classification predictions of message-passing\nmethods. More recently, however, researchers pointed out that such dichotomy is\ntoo simplistic as we can construct node classification tasks where graphs are\ncompletely heterophilic but the performances remain high. Most of these works\nhave also proposed new quantitative metrics to understand when a graph\nstructure is useful, which implicitly or explicitly assume the correlation\nbetween node features and target labels. Our work empirically investigates what\nhappens when this strong assumption does not hold, by formalising two\ngenerative processes for node classification tasks that allow us to build and\nstudy ad-hoc problems. To quantitatively measure the influence of the node\nfeatures on the target labels, we also use a metric we call Feature\nInformativeness. We construct six synthetic tasks and evaluate the performance\nof six models, including structure-agnostic ones. Our findings reveal that\npreviously defined metrics are not adequate when we relax the above assumption.\nOur contribution to the workshop aims at presenting novel research findings\nthat could help advance our understanding of the field.\n","authors":["Daniele Castellana","Federico Errica"],"pdf_url":"https://arxiv.org/pdf/2308.09570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09565v1","updated":"2023-08-18T13:57:04Z","published":"2023-08-18T13:57:04Z","title":"Normalization Is All You Need: Understanding Layer-Normalized Federated\n Learning under Extreme Label Shift","summary":" Layer normalization (LN) is a widely adopted deep learning technique\nespecially in the era of foundation models. Recently, LN has been shown to be\nsurprisingly effective in federated learning (FL) with non-i.i.d. data.\nHowever, exactly why and how it works remains mysterious. In this work, we\nreveal the profound connection between layer normalization and the label shift\nproblem in federated learning. To understand layer normalization better in FL,\nwe identify the key contributing mechanism of normalization methods in FL,\ncalled feature normalization (FN), which applies normalization to the latent\nfeature representation before the classifier head. 
Although LN and FN do not\nimprove expressive power, they control feature collapse and local overfitting\nto heavily skewed datasets, and thus accelerates global training. Empirically,\nwe show that normalization leads to drastic improvements on standard benchmarks\nunder extreme label shift. Moreover, we conduct extensive ablation studies to\nunderstand the critical factors of layer normalization in FL. Our results\nverify that FN is an essential ingredient inside LN to significantly improve\nthe convergence of FL while remaining robust to learning rate choices,\nespecially under extreme label shift where each client has access to few\nclasses.\n","authors":["Guojun Zhang","Mahdi Beitollahi","Alex Bie","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2308.09565v1.pdf","comment":"23 pages, 9 figures"},{"id":"http://arxiv.org/abs/2305.05293v2","updated":"2023-08-18T13:35:05Z","published":"2023-05-09T09:31:04Z","title":"On the Limitations of Model Stealing with Uncertainty Quantification\n Models","summary":" Model stealing aims at inferring a victim model's functionality at a fraction\nof the original training cost. While the goal is clear, in practice the model's\narchitecture, weight dimension, and original training data can not be\ndetermined exactly, leading to mutual uncertainty during stealing. In this\nwork, we explicitly tackle this uncertainty by generating multiple possible\nnetworks and combining their predictions to improve the quality of the stolen\nmodel. For this, we compare five popular uncertainty quantification models in a\nmodel stealing task. Surprisingly, our results indicate that the considered\nmodels only lead to marginal improvements in terms of label agreement (i.e.,\nfidelity) to the stolen model. To find the cause of this, we inspect the\ndiversity of the model's prediction by looking at the prediction variance as a\nfunction of training iterations. We realize that during training, the models\ntend to have similar predictions, indicating that the network diversity we\nwanted to leverage using uncertainty quantification models is not (high) enough\nfor improvements on the model stealing task.\n","authors":["David Pape","Sina Däubener","Thorsten Eisenhofer","Antonio Emanuele Cinà","Lea Schönherr"],"pdf_url":"https://arxiv.org/pdf/2305.05293v2.pdf","comment":"6 pages, 1 figure, 2 table, paper submitted to European Symposium on\n Artificial Neural Networks, Computational Intelligence and Machine Learning"},{"id":"http://arxiv.org/abs/2308.09552v1","updated":"2023-08-18T13:33:02Z","published":"2023-08-18T13:33:02Z","title":"Attesting Distributional Properties of Training Data for Machine\n Learning","summary":" The success of machine learning (ML) has been accompanied by increased\nconcerns about its trustworthiness. Several jurisdictions are preparing ML\nregulatory frameworks. One such concern is ensuring that model training data\nhas desirable distributional properties for certain sensitive attributes. For\nexample, draft regulations indicate that model trainers are required to show\nthat training datasets have specific distributional properties, such as\nreflecting diversity of the population.\n We propose the notion of property attestation allowing a prover (e.g., model\ntrainer) to demonstrate relevant distributional properties of training data to\na verifier (e.g., a customer) without revealing the data. 
We present an\neffective hybrid property attestation combining property inference with\ncryptographic mechanisms.\n","authors":["Vasisht Duddu","Anudeep Das","Nora Khayata","Hossein Yalame","Thomas Schneider","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2308.09552v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1902.10664v5","updated":"2023-08-18T13:32:13Z","published":"2019-02-27T17:55:06Z","title":"Local Function Complexity for Active Learning via Mixture of Gaussian\n Processes","summary":" Inhomogeneities in real-world data, e.g., due to changes in the observation\nnoise level or variations in the structural complexity of the source function,\npose a unique set of challenges for statistical inference. Accounting for them\ncan greatly improve predictive power when physical resources or computation\ntime is limited. In this paper, we draw on recent theoretical results on the\nestimation of local function complexity (LFC), derived from the domain of local\npolynomial smoothing (LPS), to establish a notion of local structural\ncomplexity, which is used to develop a model-agnostic active learning (AL)\nframework. Due to its reliance on pointwise estimates, the LPS model class is\nnot robust and scalable concerning large input space dimensions that typically\ncome along with real-world problems. Here, we derive and estimate the Gaussian\nprocess regression (GPR)-based analog of the LPS-based LFC and use it as a\nsubstitute in the above framework to make it robust and scalable. We assess the\neffectiveness of our LFC estimate in an AL application on a prototypical\nlow-dimensional synthetic dataset, before taking on the challenging real-world\ntask of reconstructing a quantum chemical force field for a small organic\nmolecule and demonstrating state-of-the-art performance with a significantly\nreduced training demand.\n","authors":["Danny Panknin","Stefan Chmiela","Klaus-Robert Müller","Shinichi Nakajima"],"pdf_url":"https://arxiv.org/pdf/1902.10664v5.pdf","comment":"27 pages (+16 pages of references and appendices), 19 figures"},{"id":"http://arxiv.org/abs/2308.09544v1","updated":"2023-08-18T13:22:59Z","published":"2023-08-18T13:22:59Z","title":"Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free\n Continual Learning","summary":" In this work, we investigate exemplar-free class incremental learning (CIL)\nwith knowledge distillation (KD) as a regularization strategy, aiming to\nprevent forgetting. KD-based methods are successfully used in CIL, but they\noften struggle to regularize the model without access to exemplars of the\ntraining data from previous tasks. Our analysis reveals that this issue\noriginates from substantial representation shifts in the teacher network when\ndealing with out-of-distribution data. This causes large errors in the KD loss\ncomponent, leading to performance degradation in CIL. Inspired by recent\ntest-time adaptation methods, we introduce Teacher Adaptation (TA), a method\nthat concurrently updates the teacher and the main model during incremental\ntraining. 
Our method seamlessly integrates with KD-based CIL approaches and\nallows for consistent enhancement of their performance across multiple\nexemplar-free CIL benchmarks.\n","authors":["Filip Szatkowski","Mateusz Pyla","Marcin Przewięźlikowski","Sebastian Cygert","Bartłomiej Twardowski","Tomasz Trzciński"],"pdf_url":"https://arxiv.org/pdf/2308.09544v1.pdf","comment":"VCL workshop at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09543v1","updated":"2023-08-18T13:20:08Z","published":"2023-08-18T13:20:08Z","title":"Latent State Models of Training Dynamics","summary":" The impact of randomness on model training is poorly understood. How do\ndifferences in data order and initialization actually manifest in the model,\nsuch that some training runs outperform others or converge faster? Furthermore,\nhow can we interpret the resulting training dynamics and the phase transitions\nthat characterize different trajectories? To understand the effect of\nrandomness on the dynamics and outcomes of neural network training, we train\nmodels multiple times with different random seeds and compute a variety of\nmetrics throughout training, such as the $L_2$ norm, mean, and variance of the\nneural network's weights. We then fit a hidden Markov model (HMM) over the\nresulting sequences of metrics. The HMM represents training as a stochastic\nprocess of transitions between latent states, providing an intuitive overview\nof significant changes during training. Using our method, we produce a\nlow-dimensional, discrete representation of training dynamics on grokking\ntasks, image classification, and masked language modeling. We use the HMM\nrepresentation to study phase transitions and identify latent \"detour\" states\nthat slow down convergence.\n","authors":["Michael Y. Hu","Angelica Chen","Naomi Saphra","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2308.09543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09542v1","updated":"2023-08-18T13:19:26Z","published":"2023-08-18T13:19:26Z","title":"Decoupled conditional contrastive learning with variable metadata for\n prostate lesion detection","summary":" Early diagnosis of prostate cancer is crucial for efficient treatment.\nMulti-parametric Magnetic Resonance Images (mp-MRI) are widely used for lesion\ndetection. The Prostate Imaging Reporting and Data System (PI-RADS) has\nstandardized interpretation of prostate MRI by defining a score for lesion\nmalignancy. PI-RADS data is readily available from radiology reports but is\nsubject to high inter-reports variability. 
We propose a new contrastive loss\nfunction that leverages weak metadata with multiple annotators per sample and\ntakes advantage of inter-report variability by defining metadata confidence.\nBy combining metadata of varying confidence with unannotated data into a single\nconditional contrastive loss function, we report a 3% AUC increase on lesion\ndetection on the public PI-CAI challenge dataset.\n Code is available at: https://github.com/camilleruppli/decoupled_ccl\n","authors":["Camille Ruppli","Pietro Gori","Roberto Ardon","Isabelle Bloch"],"pdf_url":"https://arxiv.org/pdf/2308.09542v1.pdf","comment":"Accepted at MILLanD workshop (MICCAI)"},{"id":"http://arxiv.org/abs/2308.09531v1","updated":"2023-08-18T13:11:23Z","published":"2023-08-18T13:11:23Z","title":"Privacy-Preserving 3-Layer Neural Network Training using Mere\n Homomorphic Encryption Technique","summary":" In this manuscript, we consider the problem of privacy-preserving training of\nneural networks in the mere homomorphic encryption setting. We combine several\nexisting techniques, extend some of them, and finally enable the\ntraining of 3-layer neural networks for both regression and classification\nproblems using the mere homomorphic encryption technique.\n","authors":["John Chiang"],"pdf_url":"https://arxiv.org/pdf/2308.09531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06619v2","updated":"2023-08-18T13:00:23Z","published":"2023-08-12T17:27:49Z","title":"Can Unstructured Pruning Reduce the Depth in Deep Neural Networks?","summary":" Pruning is a widely used technique for reducing the size of deep neural\nnetworks while maintaining their performance. However, such a technique,\ndespite being able to massively compress deep models, is hardly able to remove\nentire layers from a model (even when structured): is this an addressable task?\nIn this study, we introduce EGP, an innovative Entropy Guided Pruning algorithm\naimed at reducing the size of deep neural networks while preserving their\nperformance. The key focus of EGP is to prioritize pruning connections in\nlayers with low entropy, ultimately leading to their complete removal. Through\nextensive experiments conducted on popular models like ResNet-18 and Swin-T,\nour findings demonstrate that EGP effectively compresses deep neural networks\nwhile maintaining competitive performance levels. Our results not only shed\nlight on the underlying mechanism behind the advantages of unstructured\npruning, but also pave the way for further investigations into the intricate\nrelationship between entropy, pruning techniques, and deep learning\nperformance. The EGP algorithm and its insights hold great promise for\nadvancing the field of network compression and optimization. The source code\nfor EGP is released open-source.\n","authors":["Zhu Liao","Victor Quétu","Van-Tam Nguyen","Enzo Tartaglione"],"pdf_url":"https://arxiv.org/pdf/2308.06619v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09517v1","updated":"2023-08-18T12:49:57Z","published":"2023-08-18T12:49:57Z","title":"Transitivity-Preserving Graph Representation Learning for Bridging Local\n Connectivity and Role-based Similarity","summary":" Graph representation learning (GRL) methods, such as graph neural networks\nand graph transformer models, have been successfully used to analyze\ngraph-structured data, mainly focusing on node classification and link\nprediction tasks. 
However, the existing studies mostly only consider local\nconnectivity while ignoring long-range connectivity and the roles of nodes. In\nthis paper, we propose Unified Graph Transformer Networks (UGT) that\neffectively integrate local and global structural information into fixed-length\nvector representations. First, UGT learns local structure by identifying the\nlocal substructures and aggregating features of the $k$-hop neighborhoods of\neach node. Second, we construct virtual edges, bridging distant nodes with\nstructural similarity to capture the long-range dependencies. Third, UGT learns\nunified representations through self-attention, encoding structural distance\nand $p$-step transition probability between node pairs. Furthermore, we propose\na self-supervised learning task that effectively learns transition probability\nto fuse local and global structural features, which could then be transferred\nto other downstream tasks. Experimental results on real-world benchmark\ndatasets over various downstream tasks showed that UGT significantly\noutperformed baselines that consist of state-of-the-art models. In addition,\nUGT reaches the expressive power of the third-order Weisfeiler-Lehman\nisomorphism test (3d-WL) in distinguishing non-isomorphic graph pairs. The\nsource code is available at\nhttps://github.com/NSLab-CUK/Unified-Graph-Transformer.\n","authors":["Van Thuy Hoang","O-Joun Lee"],"pdf_url":"https://arxiv.org/pdf/2308.09517v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2308.09514v1","updated":"2023-08-18T12:45:32Z","published":"2023-08-18T12:45:32Z","title":"Spatial LibriSpeech: An Augmented Dataset for Spatial Audio Learning","summary":" We present Spatial LibriSpeech, a spatial audio dataset with over 650 hours\nof 19-channel audio, first-order ambisonics, and optional distractor noise.\nSpatial LibriSpeech is designed for machine learning model training, and it\nincludes labels for source position, speaking direction, room acoustics and\ngeometry. Spatial LibriSpeech is generated by augmenting LibriSpeech samples\nwith 200k+ simulated acoustic conditions across 8k+ synthetic rooms. To\ndemonstrate the utility of our dataset, we train models on four spatial audio\ntasks, resulting in a median absolute error of 6.60{\\deg} on 3D source\nlocalization, 0.43m on distance, 90.66ms on T30, and 2.74dB on DRR estimation.\nWe show that the same models generalize well to widely-used evaluation\ndatasets, e.g., obtaining a median absolute error of 12.43{\\deg} on 3D source\nlocalization on TUT Sound Events 2018, and 157.32ms on T30 estimation on ACE\nChallenge.\n","authors":["Miguel Sarabia","Elena Menyaylenko","Alessandro Toso","Skyler Seto","Zakaria Aldeneh","Shadi Pirhosseinloo","Luca Zappella","Barry-John Theobald","Nicholas Apostoloff","Jonathan Sheaffer"],"pdf_url":"https://arxiv.org/pdf/2308.09514v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09499v1","updated":"2023-08-18T12:14:51Z","published":"2023-08-18T12:14:51Z","title":"Bridged-GNN: Knowledge Bridge Learning for Effective Knowledge Transfer","summary":" The data-hungry problem, characterized by insufficiency and low-quality of\ndata, poses obstacles for deep learning models. Transfer learning has been a\nfeasible way to transfer knowledge from high-quality external data of source\ndomains to limited data of target domains, which follows a domain-level\nknowledge transfer to learn a shared posterior distribution. 
However, they are\nusually built on strong assumptions, e.g., the domain invariant posterior\ndistribution, which is usually unsatisfied and may introduce noise, resulting\nin poor generalization ability on target domains. Inspired by Graph Neural\nNetworks (GNNs) that aggregate information from neighboring nodes, we redefine\nthe paradigm as learning a knowledge-enhanced posterior distribution for target\ndomains, namely Knowledge Bridge Learning (KBL). KBL first learns the scope of\nknowledge transfer by constructing a Bridged-Graph that connects knowledgeable\nsamples to each target sample and then performs sample-wise knowledge transfer\nvia GNNs. KBL is free from strong assumptions and is robust to noise in the\nsource data. Guided by KBL, we propose the Bridged-GNN, including an Adaptive\nKnowledge Retrieval module to build the Bridged-Graph and a Graph Knowledge\nTransfer module. Comprehensive experiments on both un-relational and relational\ndata-hungry scenarios demonstrate the significant improvements of Bridged-GNN\ncompared with SOTA methods.\n","authors":["Wendong Bi","Xueqi Cheng","Bingbing Xu","Xiaoqian Sun","Li Xu","Huawei Shen"],"pdf_url":"https://arxiv.org/pdf/2308.09499v1.pdf","comment":"Accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2308.09497v1","updated":"2023-08-18T12:14:25Z","published":"2023-08-18T12:14:25Z","title":"Predictive Authoring for Brazilian Portuguese Augmentative and\n Alternative Communication","summary":" Individuals with complex communication needs (CCN) often rely on augmentative\nand alternative communication (AAC) systems to have conversations and\ncommunicate their wants. Such systems allow message authoring by arranging\npictograms in sequence. However, the difficulty of finding the desired item to\ncomplete a sentence can increase as the user's vocabulary increases. This paper\nproposes using BERTimbau, a Brazilian Portuguese version of BERT, for pictogram\nprediction in AAC systems. To finetune BERTimbau, we constructed an AAC corpus\nfor Brazilian Portuguese to use as a training corpus. We tested different\napproaches to representing a pictogram for prediction: as a word (using\npictogram captions), as a concept (using a dictionary definition), and as a set\nof synonyms (using related terms). We also evaluated the usage of images for\npictogram prediction. The results demonstrate that using embeddings computed\nfrom the pictograms' captions, synonyms, or definitions yields similar\nperformance. Using synonyms leads to lower perplexity, but using captions leads\nto the highest accuracies. This paper provides insight into how to represent a\npictogram for prediction using a BERT-like model and the potential of using\nimages for pictogram prediction.\n","authors":["Jayr Pereira","Rodrigo Nogueira","Cleber Zanchettin","Robson Fidalgo"],"pdf_url":"https://arxiv.org/pdf/2308.09497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09490v1","updated":"2023-08-18T11:59:15Z","published":"2023-08-18T11:59:15Z","title":"Balancing Transparency and Risk: The Security and Privacy Risks of\n Open-Source Machine Learning Models","summary":" The field of artificial intelligence (AI) has experienced remarkable progress\nin recent years, driven by the widespread adoption of open-source machine\nlearning models in both research and industry. Considering the\nresource-intensive nature of training on vast datasets, many applications opt\nfor models that have already been trained. 
Hence, a small number of key players\nundertake the responsibility of training and publicly releasing large\npre-trained models, providing a crucial foundation for a wide range of\napplications. However, the adoption of these open-source models carries\ninherent privacy and security risks that are often overlooked. To provide a\nconcrete example, an inconspicuous model may conceal hidden functionalities\nthat, when triggered by specific input patterns, can manipulate the behavior of\nthe system, such as instructing self-driving cars to ignore the presence of\nother vehicles. The implications of successful privacy and security attacks\nencompass a broad spectrum, ranging from relatively minor damage like service\ninterruptions to highly alarming scenarios, including physical harm or the\nexposure of sensitive user data. In this work, we present a comprehensive\noverview of common privacy and security threats associated with the use of\nopen-source models. By raising awareness of these dangers, we strive to promote\nthe responsible and secure use of AI systems.\n","authors":["Dominik Hintersdorf","Lukas Struppek","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2308.09490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09051v2","updated":"2023-08-18T11:34:10Z","published":"2023-03-16T02:47:59Z","title":"Robust Evaluation of Diffusion-Based Adversarial Purification","summary":" We question the current evaluation practice on diffusion-based purification\nmethods. Diffusion-based purification methods aim to remove adversarial effects\nfrom an input data point at test time. The approach gains increasing attention\nas an alternative to adversarial training due to the disentangling between\ntraining and testing. Well-known white-box attacks are often employed to\nmeasure the robustness of the purification. However, it is unknown whether\nthese attacks are the most effective for the diffusion-based purification since\nthe attacks are often tailored for adversarial training. We analyze the current\npractices and provide a new guideline for measuring the robustness of\npurification methods against adversarial attacks. Based on our analysis, we\nfurther propose a new purification strategy improving robustness compared to\nthe current diffusion-based purification methods.\n","authors":["Minjong Lee","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2303.09051v2.pdf","comment":"Accepted by ICCV 2023, Oral presentation"},{"id":"http://arxiv.org/abs/2305.17246v2","updated":"2023-08-18T11:32:44Z","published":"2023-05-26T20:19:09Z","title":"NASimEmu: Network Attack Simulator & Emulator for Training Agents\n Generalizing to Novel Scenarios","summary":" Current frameworks for training offensive penetration testing agents with\ndeep reinforcement learning struggle to produce agents that perform well in\nreal-world scenarios, due to the reality gap in simulation-based frameworks and\nthe lack of scalability in emulation-based frameworks. Additionally, existing\nframeworks often use an unrealistic metric that measures the agents'\nperformance on the training data. NASimEmu, a new framework introduced in this\npaper, addresses these issues by providing both a simulator and an emulator\nwith a shared interface. This approach allows agents to be trained in\nsimulation and deployed in the emulator, thus verifying the realism of the used\nabstraction. Our framework promotes the development of general agents that can\ntransfer to novel scenarios unseen during their training. 
For the simulation\npart, we adopt an existing simulator NASim and enhance its realism. The\nemulator is implemented with industry-level tools, such as Vagrant, VirtualBox,\nand Metasploit. Experiments demonstrate that a simulation-trained agent can be\ndeployed in emulation, and we show how to use the framework to train a general\nagent that transfers into novel, structurally different scenarios. NASimEmu is\navailable as open-source.\n","authors":["Jaromír Janisch","Tomáš Pevný","Viliam Lisý"],"pdf_url":"https://arxiv.org/pdf/2305.17246v2.pdf","comment":"NASimEmu is available at https://github.com/jaromiru/NASimEmu and the\n baseline agents at https://github.com/jaromiru/NASimEmu-agents"},{"id":"http://arxiv.org/abs/2308.09464v1","updated":"2023-08-18T11:02:27Z","published":"2023-08-18T11:02:27Z","title":"Data augmentation and explainability for bias discovery and mitigation\n in deep learning","summary":" This dissertation explores the impact of bias in deep neural networks and\npresents methods for reducing its influence on model performance. The first\npart begins by categorizing and describing potential sources of bias and errors\nin data and models, with a particular focus on bias in machine learning\npipelines. The next chapter outlines a taxonomy and methods of Explainable AI\nas a way to justify predictions and control and improve the model. Then, as an\nexample of a laborious manual data inspection and bias discovery process, a\nskin lesion dataset is manually examined. A Global Explanation for the Bias\nIdentification method is proposed as an alternative semi-automatic approach to\nmanual data exploration for discovering potential biases in data. Relevant\nnumerical methods and metrics are discussed for assessing the effects of the\nidentified biases on the model. Whereas identifying errors and bias is\ncritical, improving the model and reducing the number of flaws in the future is\nan absolute priority. Hence, the second part of the thesis focuses on\nmitigating the influence of bias on ML models. Three approaches are proposed\nand discussed: Style Transfer Data Augmentation, Targeted Data Augmentations,\nand Attribution Feedback. Style Transfer Data Augmentation aims to address\nshape and texture bias by merging a style of a malignant lesion with a\nconflicting shape of a benign one. Targeted Data Augmentations randomly insert\npossible biases into all images in the dataset during the training, as a way to\nmake the process random and, thus, destroy spurious correlations. Lastly,\nAttribution Feedback is used to fine-tune the model to improve its accuracy by\neliminating obvious mistakes and teaching it to ignore insignificant input\nparts via an attribution loss. The goal of these approaches is to reduce the\ninfluence of bias on machine learning models, rather than eliminate it\nentirely.\n","authors":["Agnieszka Mikołajczyk-Bareła"],"pdf_url":"https://arxiv.org/pdf/2308.09464v1.pdf","comment":"A PhD Thesis"},{"id":"http://arxiv.org/abs/2212.05680v2","updated":"2023-08-18T10:46:35Z","published":"2022-12-12T03:35:05Z","title":"REAP: A Large-Scale Realistic Adversarial Patch Benchmark","summary":" Machine learning models are known to be susceptible to adversarial\nperturbation. One famous attack is the adversarial patch, a sticker with a\nparticularly crafted pattern that makes the model incorrectly predict the\nobject it is placed on. This attack presents a critical threat to\ncyber-physical systems that rely on cameras such as autonomous cars. 
Despite\nthe significance of the problem, conducting research in this setting has been\ndifficult; evaluating attacks and defenses in the real world is exceptionally\ncostly while synthetic data are unrealistic. In this work, we propose the REAP\n(REalistic Adversarial Patch) benchmark, a digital benchmark that allows the\nuser to evaluate patch attacks on real images, and under real-world conditions.\nBuilt on top of the Mapillary Vistas dataset, our benchmark contains over\n14,000 traffic signs. Each sign is augmented with a pair of geometric and\nlighting transformations, which can be used to apply a digitally generated\npatch realistically onto the sign. Using our benchmark, we perform the first\nlarge-scale assessments of adversarial patch attacks under realistic\nconditions. Our experiments suggest that adversarial patch attacks may present\na smaller threat than previously believed and that the success rate of an\nattack on simpler digital simulations is not predictive of its actual\neffectiveness in practice. We release our benchmark publicly at\nhttps://github.com/wagner-group/reap-benchmark.\n","authors":["Nabeel Hingun","Chawin Sitawarin","Jerry Li","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2212.05680v2.pdf","comment":"ICCV 2023. Code and benchmark can be found at\n https://github.com/wagner-group/reap-benchmark"},{"id":"http://arxiv.org/abs/2308.09451v1","updated":"2023-08-18T10:29:26Z","published":"2023-08-18T10:29:26Z","title":"Reconstructing $S$-matrix Phases with Machine Learning","summary":" An important element of the $S$-matrix bootstrap program is the relationship\nbetween the modulus of an $S$-matrix element and its phase. Unitarity relates\nthem by an integral equation. Even in the simplest case of elastic scattering,\nthis integral equation cannot be solved analytically and numerical approaches\nare required. We apply modern machine learning techniques to studying the\nunitarity constraint. We find that for a given modulus, when a phase exists it\ncan generally be reconstructed to good accuracy with machine learning.\nMoreover, the loss of the reconstruction algorithm provides a good proxy for\nwhether a given modulus can be consistent with unitarity at all. In addition,\nwe study the question of whether multiple phases can be consistent with a\nsingle modulus, finding novel phase-ambiguous solutions. In particular, we find\na new phase-ambiguous solution which pushes the known limit on such solutions\nsignificantly beyond the previous bound.\n","authors":["Aurélien Dersy","Matthew D. Schwartz","Alexander Zhiboedov"],"pdf_url":"https://arxiv.org/pdf/2308.09451v1.pdf","comment":"43 pages, 21 figures"},{"id":"http://arxiv.org/abs/2308.09448v1","updated":"2023-08-18T10:22:31Z","published":"2023-08-18T10:22:31Z","title":"Defending Label Inference Attacks in Split Learning under Regression\n Setting","summary":" As a privacy-preserving method for implementing Vertical Federated Learning,\nSplit Learning has been extensively researched. However, numerous studies have\nindicated that the privacy-preserving capability of Split Learning is\ninsufficient. In this paper, we primarily focus on label inference attacks in\nSplit Learning under regression setting, which are mainly implemented through\nthe gradient inversion method. 
To defend against label inference attacks, we\npropose Random Label Extension (RLE), where labels are extended to obfuscate\nthe label information contained in the gradients, thereby preventing the\nattacker from utilizing gradients to train an attack model that can infer the\noriginal labels. To further minimize the impact on the original task, we\npropose Model-based adaptive Label Extension (MLE), where original labels are\npreserved in the extended labels and dominate the training process. The\nexperimental results show that compared to the basic defense methods, our\nproposed defense methods can significantly reduce the attack model's\nperformance while preserving the original task's performance.\n","authors":["Haoze Qiu","Fei Zheng","Chaochao Chen","Xiaolin Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.09448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09444v1","updated":"2023-08-18T10:17:59Z","published":"2023-08-18T10:17:59Z","title":"An Efficient 1 Iteration Learning Algorithm for Gaussian Mixture Model\n And Gaussian Mixture Embedding For Neural Network","summary":" We propose a Gaussian Mixture Model (GMM) learning algorithm based on our\nprevious work on the GMM expansion idea. The new algorithm brings more robustness\nand simplicity than the classic Expectation Maximization (EM) algorithm. It also\nimproves accuracy and takes only one iteration for learning. We theoretically\nprove that the new algorithm is guaranteed to converge regardless of the\nparameter initialisation. Comparing our GMM expansion method with classic\nprobability layers in neural networks shows a demonstrably better capability to\novercome data uncertainty and inverse problems. Finally, we test a GMM-based\ngenerator, which shows potential for further applications that\nutilize distributional random sampling for stochastic variation as well as\nvariation control.\n","authors":["Weiguo Lu","Xuan Wu","Deng Ding","Gangnan Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.09444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09437v1","updated":"2023-08-18T10:07:46Z","published":"2023-08-18T10:07:46Z","title":"From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the\n Right Reasons in Latent Space","summary":" Deep Neural Networks are prone to learning spurious correlations embedded in\nthe training data, leading to potentially biased predictions. This poses risks\nwhen deploying these models for high-stakes decision-making, such as in medical\napplications. Current methods for post-hoc model correction either require\ninput-level annotations, which are only possible for spatially localized\nbiases, or augment the latent feature space, thereby hoping to enforce the\nright reasons. We present a novel method ensuring the right reasons on the\nconcept level by reducing the model's sensitivity towards biases through the\ngradient. When modeling biases via Concept Activation Vectors, we highlight the\nimportance of choosing robust directions, as traditional regression-based\napproaches such as Support Vector Machines tend to result in diverging\ndirections. We effectively mitigate biases in controlled and real-world\nsettings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet\nand EfficientNet architectures.\n","authors":["Maximilian Dreyer","Frederik Pahde","Christopher J. 
Anders","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2308.09437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09433v1","updated":"2023-08-18T10:07:17Z","published":"2023-08-18T10:07:17Z","title":"Can ultrasound confidence maps predict sonographers' labeling\n variability?","summary":" Measuring cross-sectional areas in ultrasound images is a standard tool to\nevaluate disease progress or treatment response. Often addressed today with\nsupervised deep-learning segmentation approaches, existing solutions highly\ndepend upon the quality of experts' annotations. However, the annotation\nquality in ultrasound is anisotropic and position-variant due to the inherent\nphysical imaging principles, including attenuation, shadows, and missing\nboundaries, commonly exacerbated with depth. This work proposes a novel\napproach that guides ultrasound segmentation networks to account for\nsonographers' uncertainties and generate predictions with variability similar\nto the experts. We claim that realistic variability can reduce overconfident\npredictions and improve physicians' acceptance of deep-learning cross-sectional\nsegmentation solutions. Our method provides CM's certainty for each pixel for\nminimal computational overhead as it can be precalculated directly from the\nimage. We show that there is a correlation between low values in the confidence\nmaps and expert's label uncertainty. Therefore, we propose to give the\nconfidence maps as additional information to the networks. We study the effect\nof the proposed use of ultrasound CMs in combination with four state-of-the-art\nneural networks and in two configurations: as a second input channel and as\npart of the loss. We evaluate our method on 3D ultrasound datasets of the\nthyroid and lower limb muscles. Our results show ultrasound CMs increase the\nDice score, improve the Hausdorff and Average Surface Distances, and decrease\nthe number of isolated pixel predictions. Furthermore, our findings suggest\nthat ultrasound CMs improve the penalization of uncertain areas in the ground\ntruth data, thereby improving problematic interpolations. Our code and example\ndata will be made public at\nhttps://github.com/IFL-CAMP/Confidence-segmentation.\n","authors":["Vanessa Gonzalez Duque","Leonhard Zirus","Yordanka Velikova","Nassir Navab","Diana Mateus"],"pdf_url":"https://arxiv.org/pdf/2308.09433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09431v1","updated":"2023-08-18T10:03:51Z","published":"2023-08-18T10:03:51Z","title":"End-to-end topographic networks as models of cortical map formation and\n human visual behaviour: moving beyond convolutions","summary":" Computational models are an essential tool for understanding the origin and\nfunctions of the topographic organisation of the primate visual system. Yet,\nvision is most commonly modelled by convolutional neural networks that ignore\ntopography by learning identical features across space. Here, we overcome this\nlimitation by developing All-Topographic Neural Networks (All-TNNs). Trained on\nvisual input, several features of primate topography emerge in All-TNNs: smooth\norientation maps and cortical magnification in their first layer, and\ncategory-selective areas in their final layer. In addition, we introduce a\nnovel dataset of human spatial biases in object recognition, which enables us\nto directly link models to behaviour. 
We demonstrate that All-TNNs\nsignificantly better align with human behaviour than previous state-of-the-art\nconvolutional models due to their topographic nature. All-TNNs thereby mark an\nimportant step forward in understanding the spatial organisation of the visual\nbrain and how it mediates visual behaviour.\n","authors":["Zejin Lu","Adrien Doerig","Victoria Bosch","Bas Krahmer","Daniel Kaiser","Radoslaw M Cichy","Tim C Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2308.09431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09430v1","updated":"2023-08-18T10:00:27Z","published":"2023-08-18T10:00:27Z","title":"Towards Understanding the Generalizability of Delayed Stochastic\n Gradient Descent","summary":" Stochastic gradient descent (SGD) performed in an asynchronous manner plays a\ncrucial role in training large-scale machine learning models. However, the\ngeneralization performance of asynchronous delayed SGD, which is an essential\nmetric for assessing machine learning algorithms, has rarely been explored.\nExisting generalization error bounds are rather pessimistic and cannot reveal\nthe correlation between asynchronous delays and generalization. In this paper,\nwe investigate sharper generalization error bound for SGD with asynchronous\ndelay $\\tau$. Leveraging the generating function analysis tool, we first\nestablish the average stability of the delayed gradient algorithm. Based on\nthis algorithmic stability, we provide upper bounds on the generalization error\nof $\\tilde{\\mathcal{O}}(\\frac{T-\\tau}{n\\tau})$ and\n$\\tilde{\\mathcal{O}}(\\frac{1}{n})$ for quadratic convex and strongly convex\nproblems, respectively, where $T$ refers to the iteration number and $n$ is the\namount of training data. Our theoretical results indicate that asynchronous\ndelays reduce the generalization error of the delayed SGD algorithm. Analogous\nanalysis can be generalized to the random delay setting, and the experimental\nresults validate our theoretical findings.\n","authors":["Xiaoge Deng","Li Shen","Shengwei Li","Tao Sun","Dongsheng Li","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.09430v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09426v1","updated":"2023-08-18T09:51:11Z","published":"2023-08-18T09:51:11Z","title":"Self-Supervised Single-Image Deconvolution with Siamese Neural Networks","summary":" Inverse problems in image reconstruction are fundamentally complicated by\nunknown noise properties. Classical iterative deconvolution approaches amplify\nnoise and require careful parameter selection for an optimal trade-off between\nsharpness and grain. Deep learning methods allow for flexible parametrization\nof the noise and learning its properties directly from the data. Recently,\nself-supervised blind-spot neural networks were successfully adopted for image\ndeconvolution by including a known point-spread function in the end-to-end\ntraining. However, their practical application has been limited to 2D images in\nthe biomedical domain because it implies large kernels that are poorly\noptimized. We tackle this problem with Fast Fourier Transform convolutions that\nprovide training speed-up in 3D microscopy deconvolution tasks. Further, we\npropose to adopt a Siamese invariance loss for deconvolution and empirically\nidentify its optimal position in the neural network between blind-spot and full\nimage branches. 
The experimental results show that our improved framework\noutperforms the previous state-of-the-art deconvolution methods with a known\npoint spread function.\n","authors":["Mikhail Papkov","Kaupo Palo","Leopold Parts"],"pdf_url":"https://arxiv.org/pdf/2308.09426v1.pdf","comment":"Accepted for DALI @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2305.15111v2","updated":"2023-08-18T09:33:59Z","published":"2023-05-24T13:01:51Z","title":"Reconstruction, forecasting, and stability of chaotic dynamics from\n partial data","summary":" The forecasting and computation of the stability of chaotic systems from\npartial observations are tasks for which traditional equation-based methods may\nnot be suitable. In this computational paper, we propose data-driven methods to\n(i) infer the dynamics of unobserved (hidden) chaotic variables (full-state\nreconstruction); (ii) time forecast the evolution of the full state; and (iii)\ninfer the stability properties of the full state. The tasks are performed with\nlong short-term memory (LSTM) networks, which are trained with observations\n(data) limited to only part of the state: (i) the low-to-high resolution LSTM\n(LH-LSTM), which takes partial observations as training input, and requires\naccess to the full system state when computing the loss; and (ii) the\nphysics-informed LSTM (PI-LSTM), which is designed to combine partial\nobservations with the integral formulation of the dynamical system's evolution\nequations. First, we derive the Jacobian of the LSTMs. Second, we analyse a\nchaotic partial differential equation, the Kuramoto-Sivashinsky (KS), and the\nLorenz-96 system. We show that the proposed networks can forecast the hidden\nvariables, both time-accurately and statistically. The Lyapunov exponents and\ncovariant Lyapunov vectors, which characterize the stability of the chaotic\nattractors, are correctly inferred from partial observations. Third, the\nPI-LSTM outperforms the LH-LSTM by successfully reconstructing the hidden\nchaotic dynamics when the input dimension is smaller or similar to the\nKaplan-Yorke dimension of the attractor. This work opens new opportunities for\nreconstructing the full state, inferring hidden variables, and computing the\nstability of chaotic systems from partial data.\n","authors":["Elise Özalp","Georgios Margazoglou","Luca Magri"],"pdf_url":"https://arxiv.org/pdf/2305.15111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09414v1","updated":"2023-08-18T09:29:29Z","published":"2023-08-18T09:29:29Z","title":"Machine-Learning Solutions for the Analysis of Single-Particle Diffusion\n Trajectories","summary":" Single-particle traces of the diffusive motion of molecules, cells, or\nanimals are by-now routinely measured, similar to stochastic records of stock\nprices or weather data. Deciphering the stochastic mechanism behind the\nrecorded dynamics is vital in understanding the observed systems. Typically,\nthe task is to decipher the exact type of diffusion and/or to determine system\nparameters. The tools used in this endeavor are currently revolutionized by\nmodern machine-learning techniques. In this Perspective we provide an overview\nover recently introduced methods in machine-learning for diffusive time series,\nmost notably, those successfully competing in the\nAnomalous-Diffusion-Challenge. 
As such methods are often criticized for their\nlack of interpretability, we focus on means to include uncertainty estimates\nand feature-based approaches, both improving interpretability and providing\nconcrete insight into the learning process of the machine. We expand the\ndiscussion by examining predictions on different out-of-distribution data. We\nalso comment on expected future developments.\n","authors":["Henrik Seckler","Janusz Szwabinski","Ralf Metzler"],"pdf_url":"https://arxiv.org/pdf/2308.09414v1.pdf","comment":"25 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.09411v1","updated":"2023-08-18T09:23:55Z","published":"2023-08-18T09:23:55Z","title":"Metadata Improves Segmentation Through Multitasking Elicitation","summary":" Metainformation is a common companion to biomedical images. However, this\npotentially powerful additional source of signal from image acquisition has had\nlimited use in deep learning methods, for semantic segmentation in particular.\nHere, we incorporate metadata by employing a channel modulation mechanism in\nconvolutional networks and study its effect on semantic segmentation tasks. We\ndemonstrate that metadata as additional input to a convolutional network can\nimprove segmentation results while being inexpensive in implementation as a\nnimble add-on to popular models. We hypothesize that this benefit of metadata\ncan be attributed to facilitating multitask switching. This aspect of\nmetadata-driven systems is explored and discussed in detail.\n","authors":["Iaroslav Plutenko","Mikhail Papkov","Kaupo Palo","Leopold Parts","Dmytro Fishman"],"pdf_url":"https://arxiv.org/pdf/2308.09411v1.pdf","comment":"Accepted for DART @ MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.09393v1","updated":"2023-08-18T08:49:30Z","published":"2023-08-18T08:49:30Z","title":"Learning MDL logic programs from noisy data","summary":" Many inductive logic programming approaches struggle to learn programs from\nnoisy data. To overcome this limitation, we introduce an approach that learns\nminimal description length programs from noisy data, including recursive\nprograms. Our experiments on several domains, including drug design, game\nplaying, and program synthesis, show that our approach can outperform existing\napproaches in terms of predictive accuracies and scale to moderate amounts of\nnoise.\n","authors":["Céline Hocquette","Andreas Niskanen","Matti Järvisalo","Andrew Cropper"],"pdf_url":"https://arxiv.org/pdf/2308.09393v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2206.01614"},{"id":"http://arxiv.org/abs/2307.01090v2","updated":"2023-08-18T08:31:30Z","published":"2023-07-03T15:09:10Z","title":"Streamlined Lensed Quasar Identification in Multiband Images via\n Ensemble Networks","summary":" Quasars experiencing strong lensing offer unique viewpoints on subjects\nrelated to the cosmic expansion rate, the dark matter profile within the\nforeground deflectors, and the quasar host galaxies. Unfortunately, identifying\nthem in astronomical images is challenging since they are overwhelmed by the\nabundance of non-lenses. To address this, we have developed a novel approach by\nensembling cutting-edge convolutional networks (CNNs) -- for instance, ResNet,\nInception, NASNet, MobileNet, EfficientNet, and RegNet -- along with vision\ntransformers (ViTs) trained on realistic galaxy-quasar lens simulations based\non the Hyper Suprime-Cam (HSC) multiband images. 
While the individual model\nexhibits remarkable performance when evaluated against the test dataset,\nachieving an area under the receiver operating characteristic curve of $>$97.3%\nand a median false positive rate of 3.6%, it struggles to generalize in real\ndata, indicated by numerous spurious sources picked by each classifier. A\nsignificant improvement is achieved by averaging these CNNs and ViTs, resulting\nin the impurities being downsized by factors up to 50. Subsequently, combining\nthe HSC images with the UKIRT, VISTA, and unWISE data, we retrieve\napproximately 60 million sources as parent samples and reduce this to 892,609\nafter employing a photometry preselection to discover $z>1.5$ lensed quasars\nwith Einstein radii of $\\theta_\\mathrm{E}<5$ arcsec. Afterward, the ensemble\nclassifier indicates 3080 sources with a high probability of being lenses, for\nwhich we visually inspect, yielding 210 prevailing candidates awaiting\nspectroscopic confirmation. These outcomes suggest that automated deep learning\npipelines hold great potential in effectively detecting strong lenses in vast\ndatasets with minimal manual visual inspection involved.\n","authors":["Irham Taufik Andika","Sherry H. Suyu","Raoul Cañameras","Alejandra Melo","Stefan Schuldt","Yiping Shu","Anna-Christina Eilers","Anton Timur Jaelani","Minghao Yue"],"pdf_url":"https://arxiv.org/pdf/2307.01090v2.pdf","comment":"Accepted for publication in the Astronomy & Astrophysics journal. 28\n pages, 11 figures, and 3 tables. We welcome comments from the reader"},{"id":"http://arxiv.org/abs/2307.01507v2","updated":"2023-08-18T08:30:00Z","published":"2023-07-04T06:41:50Z","title":"Relation-aware graph structure embedding with co-contrastive learning\n for drug-drug interaction prediction","summary":" Relation-aware graph structure embedding is promising for predicting\nmulti-relational drug-drug interactions (DDIs). Typically, most existing\nmethods begin by constructing a multi-relational DDI graph and then learning\nrelation-aware graph structure embeddings (RaGSEs) of drugs from the DDI graph.\nNevertheless, most existing approaches are usually limited in learning RaGSEs\nof new drugs, leading to serious over-fitting when the test DDIs involve such\ndrugs. To alleviate this issue, we propose a novel DDI prediction method based\non relation-aware graph structure embedding with co-contrastive learning,\nRaGSECo. The proposed RaGSECo constructs two heterogeneous drug graphs: a\nmulti-relational DDI graph and a multi-attribute drug-drug similarity (DDS)\ngraph. The two graphs are used respectively for learning and propagating the\nRaGSEs of drugs, aiming to ensure all drugs, including new ones, can possess\neffective RaGSEs. Additionally, we present a novel co-contrastive learning\nmodule to learn drug-pairs (DPs) representations. This mechanism learns DP\nrepresentations from two distinct views (interaction and similarity views) and\nencourages these views to supervise each other collaboratively to obtain more\ndiscriminative DP representations. We evaluate the effectiveness of our RaGSECo\non three different tasks using two real datasets. 
The experimental results\ndemonstrate that RaGSECo outperforms existing state-of-the-art prediction\nmethods.\n","authors":["Mengying Jiang","Guizhong Liu","Biao Zhao","Yuanchao Su","Weiqiang Jin"],"pdf_url":"https://arxiv.org/pdf/2307.01507v2.pdf","comment":"14pages, 23figures"},{"id":"http://arxiv.org/abs/2308.09381v1","updated":"2023-08-18T08:24:57Z","published":"2023-08-18T08:24:57Z","title":"On Gradient-like Explanation under a Black-box Setting: When Black-box\n Explanations Become as Good as White-box","summary":" Attribution methods shed light on the explainability of data-driven\napproaches such as deep learning models by revealing the most contributing\nfeatures to decisions that have been made. A widely accepted way of deriving\nfeature attributions is to analyze the gradients of the target function with\nrespect to input features. Analysis of gradients requires full access to the\ntarget system, meaning that solutions of this kind treat the target system as a\nwhite-box. However, the white-box assumption may be untenable due to security\nand safety concerns, thus limiting their practical applications. As an answer\nto the limited flexibility, this paper presents GEEX (gradient-estimation-based\nexplanation), an explanation method that delivers gradient-like explanations\nunder a black-box setting. Furthermore, we integrate the proposed method with a\npath method. The resulting approach iGEEX (integrated GEEX) satisfies the four\nfundamental axioms of attribution methods: sensitivity, insensitivity,\nimplementation invariance, and linearity. With a focus on image data, the\nexhaustive experiments empirically show that the proposed methods outperform\nstate-of-the-art black-box methods and achieve competitive performance compared\nto the ones with full access.\n","authors":["Yi Cai","Gerhard Wunder"],"pdf_url":"https://arxiv.org/pdf/2308.09381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09380v1","updated":"2023-08-18T08:23:47Z","published":"2023-08-18T08:23:47Z","title":"Deciphering knee osteoarthritis diagnostic features with explainable\n artificial intelligence: A systematic review","summary":" Existing artificial intelligence (AI) models for diagnosing knee\nosteoarthritis (OA) have faced criticism for their lack of transparency and\ninterpretability, despite achieving medical-expert-like performance. This\nopacity makes them challenging to trust in clinical practice. Recently,\nexplainable artificial intelligence (XAI) has emerged as a specialized\ntechnique that can provide confidence in the model's prediction by revealing\nhow the prediction is derived, thus promoting the use of AI systems in\nhealthcare. This paper presents the first survey of XAI techniques used for\nknee OA diagnosis. The XAI techniques are discussed from two perspectives: data\ninterpretability and model interpretability. 
The aim of this paper is to\nprovide valuable insights into XAI's potential towards a more reliable knee OA\ndiagnosis approach and encourage its adoption in clinical practice.\n","authors":["Yun Xin Teoh","Alice Othmani","Siew Li Goh","Juliana Usman","Khin Wee Lai"],"pdf_url":"https://arxiv.org/pdf/2308.09380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09375v1","updated":"2023-08-18T08:10:41Z","published":"2023-08-18T08:10:41Z","title":"Image Processing and Machine Learning for Hyperspectral Unmixing: An\n Overview and the HySUPP Python Package","summary":" Spectral pixels are often a mixture of the pure spectra of the materials,\ncalled endmembers, due to the low spatial resolution of hyperspectral sensors,\ndouble scattering, and intimate mixtures of materials in the scenes. Unmixing\nestimates the fractional abundances of the endmembers within the pixel.\nDepending on the prior knowledge of endmembers, linear unmixing can be divided\ninto three main groups: supervised, semi-supervised, and unsupervised (blind)\nlinear unmixing. Advances in Image processing and machine learning\nsubstantially affected unmixing. This paper provides an overview of advanced\nand conventional unmixing approaches. Additionally, we draw a critical\ncomparison between advanced and conventional techniques from the three\ncategories. We compare the performance of the unmixing techniques on three\nsimulated and two real datasets. The experimental results reveal the advantages\nof different unmixing categories for different unmixing scenarios. Moreover, we\nprovide an open-source Python-based package available at\nhttps://github.com/BehnoodRasti/HySUPP to reproduce the results.\n","authors":["Behnood Rasti","Alexandre Zouaoui","Julien Mairal","Jocelyn Chanussot"],"pdf_url":"https://arxiv.org/pdf/2308.09375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09374v1","updated":"2023-08-18T08:09:31Z","published":"2023-08-18T08:09:31Z","title":"Noise Sensitivity and Stability of Deep Neural Networks for Binary\n Classification","summary":" A first step is taken towards understanding often observed non-robustness\nphenomena of deep neural net (DNN) classifiers. This is done from the\nperspective of Boolean functions by asking if certain sequences of Boolean\nfunctions represented by common DNN models are noise sensitive or noise stable,\nconcepts defined in the Boolean function literature. Due to the natural\nrandomness in DNN models, these concepts are extended to annealed and quenched\nversions. Here we sort out the relation between these definitions and\ninvestigate the properties of two standard DNN architectures, the fully\nconnected and convolutional models, when initiated with Gaussian weights.\n","authors":["Johan Jonasson","Jeffrey E. Steif","Olof Zetterqvist"],"pdf_url":"https://arxiv.org/pdf/2308.09374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09372v1","updated":"2023-08-18T08:06:49Z","published":"2023-08-18T08:06:49Z","title":"Which Transformer to Favor: A Comparative Analysis of Efficiency in\n Vision Transformers","summary":" The growing popularity of Vision Transformers as the go-to models for image\nclassification has led to an explosion of architectural modifications claiming\nto be more efficient than the original ViT. However, a wide diversity of\nexperimental conditions prevents a fair comparison between all of them, based\nsolely on their reported results. 
To address this gap in comparability, we\nconduct a comprehensive analysis of more than 30 models to evaluate the\nefficiency of vision transformers and related architectures, considering\nvarious performance metrics. Our benchmark provides a comparable baseline\nacross the landscape of efficiency-oriented transformers, unveiling a plethora\nof surprising insights. For example, we discover that ViT is still Pareto\noptimal across multiple efficiency metrics, despite the existence of several\nalternative approaches claiming to be more efficient. Results also indicate\nthat hybrid attention-CNN models fare particularly well when it comes to low\ninference memory and number of parameters, and also that it is better to scale\nthe model size, than the image size. Furthermore, we uncover a strong positive\ncorrelation between the number of FLOPS and the training memory, which enables\nthe estimation of required VRAM from theoretical measurements alone.\n Thanks to our holistic evaluation, this study offers valuable insights for\npractitioners and researchers, facilitating informed decisions when selecting\nmodels for specific applications. We publicly release our code and data at\nhttps://github.com/tobna/WhatTransformerToFavor\n","authors":["Tobias Christian Nauen","Sebastian Palacio","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2308.09372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09368v1","updated":"2023-08-18T08:02:52Z","published":"2023-08-18T08:02:52Z","title":"A tailored Handwritten-Text-Recognition System for Medieval Latin","summary":" The Bavarian Academy of Sciences and Humanities aims to digitize its Medieval\nLatin Dictionary. This dictionary entails record cards referring to lemmas in\nmedieval Latin, a low-resource language. A crucial step of the digitization\nprocess is the Handwritten Text Recognition (HTR) of the handwritten lemmas\nfound on these record cards. In our work, we introduce an end-to-end pipeline,\ntailored to the medieval Latin dictionary, for locating, extracting, and\ntranscribing the lemmas. We employ two state-of-the-art (SOTA) image\nsegmentation models to prepare the initial data set for the HTR task.\nFurthermore, we experiment with different transformer-based models and conduct\na set of experiments to explore the capabilities of different combinations of\nvision encoders with a GPT-2 decoder. Additionally, we also apply extensive\ndata augmentation resulting in a highly competitive model. The best-performing\nsetup achieved a Character Error Rate (CER) of 0.015, which is even superior to\nthe commercial Google Cloud Vision model, and shows more stable performance.\n","authors":["Philipp Koch","Gilary Vera Nuñez","Esteban Garces Arias","Christian Heumann","Matthias Schöffel","Alexander Häberlin","Matthias Aßenmacher"],"pdf_url":"https://arxiv.org/pdf/2308.09368v1.pdf","comment":"This paper has been accepted at the First Workshop on Ancient\n Language Processing, co-located with RANLP 2023. This is the author's version\n of the work. The definite version of record will be published in the\n proceedings"},{"id":"http://arxiv.org/abs/2308.09367v1","updated":"2023-08-18T08:01:45Z","published":"2023-08-18T08:01:45Z","title":"On the Approximation of Bi-Lipschitz Maps by Invertible Neural Networks","summary":" Invertible neural networks (INNs) represent an important class of deep neural\nnetwork architectures that have been widely used in several applications. 
The\nuniversal approximation properties of INNs have also been established recently.\nHowever, the approximation rate of INNs is largely missing. In this work, we\nprovide an analysis of the capacity of a class of coupling-based INNs to\napproximate bi-Lipschitz continuous mappings on a compact domain, and the\nresult shows that it can well approximate both forward and inverse maps\nsimultaneously. Furthermore, we develop an approach for approximating\nbi-Lipschitz maps on infinite-dimensional spaces that simultaneously\napproximate the forward and inverse maps, by combining model reduction with\nprincipal component analysis and INNs for approximating the reduced map, and we\nanalyze the overall approximation error of the approach. Preliminary numerical\nresults show the feasibility of the approach for approximating the solution\noperator for parameterized second-order elliptic problems.\n","authors":["Bangti Jin","Zehui Zhou","Jun Zou"],"pdf_url":"https://arxiv.org/pdf/2308.09367v1.pdf","comment":"32 pages"},{"id":"http://arxiv.org/abs/2308.09360v1","updated":"2023-08-18T07:40:56Z","published":"2023-08-18T07:40:56Z","title":"Multi-feature concatenation and multi-classifier stacking: an\n interpretable and generalizable machine learning method for MDD\n discrimination with rsfMRI","summary":" Major depressive disorder is a serious and heterogeneous psychiatric disorder\nthat needs accurate diagnosis. Resting-state functional MRI (rsfMRI), which\ncaptures multiple perspectives on brain structure, function, and connectivity,\nis increasingly applied in the diagnosis and pathological research of mental\ndiseases. Different machine learning algorithms are then developed to exploit\nthe rich information in rsfMRI and discriminate MDD patients from normal\ncontrols. Despite recent advances reported, the discrimination accuracy has\nroom for further improvement. The generalizability and interpretability of the\nmethod are not sufficiently addressed either. Here, we propose a machine\nlearning method (MFMC) for MDD discrimination by concatenating multiple\nfeatures and stacking multiple classifiers. MFMC is tested on the REST-meta-MDD\ndata set that contains 2428 subjects collected from 25 different sites. MFMC\nyields 96.9% MDD discrimination accuracy, demonstrating a significant\nimprovement over existing methods. In addition, the generalizability of MFMC is\nvalidated by the good performance when the training and testing subjects are\nfrom independent sites. The use of XGBoost as the meta classifier allows us to\nprobe the decision process of MFMC. We identify 13 feature values related to 9\nbrain regions including the posterior cingulate gyrus, superior frontal gyrus\norbital part, and angular gyrus, which contribute most to the classification\nand also demonstrate significant differences at the group level. The use of\nthese 13 feature values alone can reach 87% of MFMC's full performance when\ntaking all feature values. 
These features may serve as clinically useful\ndiagnostic and prognostic biomarkers for mental disorders in the future.\n","authors":["Yunsong Luo","Wenyu Chen","Ling Zhan","Jiang Qiu","Tao Jia"],"pdf_url":"https://arxiv.org/pdf/2308.09360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09351v1","updated":"2023-08-18T07:17:09Z","published":"2023-08-18T07:17:09Z","title":"RLIPv2: Fast Scaling of Relational Language-Image Pre-training","summary":" Relational Language-Image Pre-training (RLIP) aims to align vision\nrepresentations with relational texts, thereby advancing the capability of\nrelational reasoning in computer vision tasks. However, hindered by the slow\nconvergence of RLIPv1 architecture and the limited availability of existing\nscene graph data, scaling RLIPv1 is challenging. In this paper, we propose\nRLIPv2, a fast converging model that enables the scaling of relational\npre-training to large-scale pseudo-labelled scene graph data. To enable fast\nscaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism\nthat facilitates earlier and deeper gated cross-modal fusion with sparsified\nlanguage encoding layers. ALIF leads to comparable or better performance than\nRLIPv1 in a fraction of the time for pre-training and fine-tuning. To obtain\nscene graph data at scale, we extend object detection datasets with free-form\nrelation labels by introducing a captioner (e.g., BLIP) and a designed Relation\nTagger. The Relation Tagger assigns BLIP-generated relation texts to region\npairs, thus enabling larger-scale relational pre-training. Through extensive\nexperiments conducted on Human-Object Interaction Detection and Scene Graph\nGeneration, RLIPv2 shows state-of-the-art performance on three benchmarks under\nfully-finetuning, few-shot and zero-shot settings. Notably, the largest RLIPv2\nachieves 23.29mAP on HICO-DET without any fine-tuning, yields 32.22mAP with\njust 1% data and yields 45.09mAP with 100% data. Code and models are publicly\navailable at https://github.com/JacobYuan7/RLIPv2.\n","authors":["Hangjie Yuan","Shiwei Zhang","Xiang Wang","Samuel Albanie","Yining Pan","Tao Feng","Jianwen Jiang","Dong Ni","Yingya Zhang","Deli Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.09351v1.pdf","comment":"Accepted to ICCV 2023. Code and models:\n https://github.com/JacobYuan7/RLIPv2"},{"id":"http://arxiv.org/abs/2308.03045v3","updated":"2023-08-18T07:10:29Z","published":"2023-08-06T08:14:35Z","title":"Machine learning methods for the search for L&T brown dwarfs in the data\n of modern sky surveys","summary":" According to various estimates, brown dwarfs (BD) should account for up to 25\npercent of all objects in the Galaxy. However, few of them are discovered and\nwell-studied, both individually and as a population. Homogeneous and complete\nsamples of brown dwarfs are needed for these kinds of studies. Due to their\nweakness, spectral studies of brown dwarfs are rather laborious. For this\nreason, creating a significant reliable sample of brown dwarfs, confirmed by\nspectroscopic observations, seems unattainable at the moment. Numerous attempts\nhave been made to search for and create a set of brown dwarfs using their\ncolours as a decision rule applied to a vast amount of survey data. In this\nwork, we use machine learning methods such as Random Forest Classifier,\nXGBoost, SVM Classifier and TabNet on PanStarrs DR1, 2MASS and WISE data to\ndistinguish L and T brown dwarfs from objects of other spectral and luminosity\nclasses. 
The explanation of the models is discussed. We also compare our models\nwith classical decision rules, proving their efficiency and relevance.\n","authors":["Aleksandra Avdeeva"],"pdf_url":"https://arxiv.org/pdf/2308.03045v3.pdf","comment":"12 pages, 10 figures, Accepted for publication in Astronomy and\n Computing"},{"id":"http://arxiv.org/abs/2308.09345v1","updated":"2023-08-18T07:07:15Z","published":"2023-08-18T07:07:15Z","title":"Denoising diffusion-based MR to CT image translation enables whole spine\n vertebral segmentation in 2D and 3D without manual annotations","summary":" Background: Automated segmentation of spinal MR images plays a vital role\nboth scientifically and clinically. However, accurately delineating posterior\nspine structures presents challenges.\n Methods: This retrospective study, approved by the ethical committee,\ninvolved translating T1w and T2w MR image series into CT images in a total of\nn=263 pairs of CT/MR series. Landmark-based registration was performed to align\nimage pairs. We compared 2D paired (Pix2Pix, denoising diffusion implicit\nmodels (DDIM) image mode, DDIM noise mode) and unpaired (contrastive unpaired\ntranslation, SynDiff) image-to-image translation using \"peak signal to noise\nratio\" (PSNR) as quality measure. A publicly available segmentation network\nsegmented the synthesized CT datasets, and Dice scores were evaluated on\nin-house test sets and the \"MRSpineSeg Challenge\" volumes. The 2D findings were\nextended to 3D Pix2Pix and DDIM.\n Results: 2D paired methods and SynDiff exhibited similar translation\nperformance and Dice scores on paired data. DDIM image mode achieved the\nhighest image quality. SynDiff, Pix2Pix, and DDIM image mode demonstrated\nsimilar Dice scores (0.77). For craniocaudal axis rotations, at least two\nlandmarks per vertebra were required for registration. The 3D translation\noutperformed the 2D approach, resulting in improved Dice scores (0.80) and\nanatomically accurate segmentations in a higher resolution than the original MR\nimage.\n Conclusion: Two landmarks per vertebra registration enabled paired\nimage-to-image translation from MR to CT and outperformed all unpaired\napproaches. The 3D techniques provided anatomically correct segmentations,\navoiding underprediction of small structures like the spinous process.\n","authors":["Robert Graf","Joachim Schmitt","Sarah Schlaeger","Hendrik Kristian Möller","Vasiliki Sideri-Lampretsa","Anjany Sekuboyina","Sandro Manuel Krieg","Benedikt Wiestler","Bjoern Menze","Daniel Rueckert","Jan Stefan Kirschke"],"pdf_url":"https://arxiv.org/pdf/2308.09345v1.pdf","comment":"35 pages, 7 figures, Code and a model weights available\n https://doi.org/10.5281/zenodo.8221159 and\n https://doi.org/10.5281/zenodo.8198697"},{"id":"http://arxiv.org/abs/2308.09343v1","updated":"2023-08-18T07:05:30Z","published":"2023-08-18T07:05:30Z","title":"Surprise machines: revealing Harvard Art Museums' image collection","summary":" Surprise Machines is a project of experimental museology that sets out to\nvisualize the entire image collection of the Harvard Art Museums, intending to\nopen up unexpected vistas on more than 200,000 objects usually inaccessible to\nvisitors. Part of the exhibition Curatorial A(i)gents organized by metaLAB (at)\nHarvard, the project explores the limits of artificial intelligence to display\na large set of images and create surprise among visitors. 
To achieve such a\nfeeling of surprise, a choreographic interface was designed to connect the\naudience's movement with several unique views of the collection.\n","authors":["Dario Rodighiero","Lins Derry","Douglas Duhaime","Jordan Kruguer","Maximilian C. Mueller","Christopher Pietsch","Jeffrey T. Schnapp","Jeff Steward"],"pdf_url":"https://arxiv.org/pdf/2308.09343v1.pdf","comment":"14 pages and 7 figures"},{"id":"http://arxiv.org/abs/2308.09341v1","updated":"2023-08-18T06:59:55Z","published":"2023-08-18T06:59:55Z","title":"Document Automation Architectures: Updated Survey in Light of Large\n Language Models","summary":" This paper surveys the current state of the art in document automation (DA).\nThe objective of DA is to reduce the manual effort during the generation of\ndocuments by automatically creating and integrating input from different\nsources and assembling documents conforming to defined templates. There have\nbeen reviews of commercial solutions of DA, particularly in the legal domain,\nbut to date there has been no comprehensive review of the academic research on\nDA architectures and technologies. The current survey of DA reviews the\nacademic literature and provides a clearer definition and characterization of\nDA and its features, identifies state-of-the-art DA architectures and\ntechnologies in academic research, and provides ideas that can lead to new\nresearch opportunities within the DA field in light of recent advances in\ngenerative AI and large language models.\n","authors":["Mohammad Ahmadi Achachlouei","Omkar Patil","Tarun Joshi","Vijayan N. Nair"],"pdf_url":"https://arxiv.org/pdf/2308.09341v1.pdf","comment":"The current paper is the updated version of an earlier survey on\n document automation [Ahmadi Achachlouei et al. 2021]. Updates in the current\n paper are as follows: We shortened almost all sections to reduce the size of\n the main paper (without references) from 28 pages to 10 pages, added a review\n of selected papers on large language models, removed certain sections and\n most of diagrams. arXiv admin note: substantial text overlap with\n arXiv:2109.11603"},{"id":"http://arxiv.org/abs/2308.05115v2","updated":"2023-08-18T06:35:50Z","published":"2023-08-08T07:50:38Z","title":"PTransIPs: Identification of phosphorylation sites based on protein\n pretrained language model and Transformer","summary":" Phosphorylation is central to numerous fundamental cellular processes,\ninfluencing the onset and progression of a variety of diseases. The correct\nidentification of these phosphorylation sites is of great importance to unravel\nthe intricate molecular mechanisms within cells and during viral infections,\npotentially leading to the discovery of new therapeutic targets. In this study,\nwe introduce PTransIPs, a novel deep learning model for the identification of\nphosphorylation sites. PTransIPs treat amino acids within protein sequences as\nwords, extracting unique encodings based on their type and sequential position.\nThe model also incorporates embeddings from large pretrained protein models as\nadditional data inputs. PTransIPS is further trained on a combination model of\nconvolutional neural network with residual connections and Transformer model\nequipped with multi-head attention mechanisms. At last, the model outputs\nclassification results through a fully connected layer. 
The results of\nindependent testing reveal that PTransIPs outperforms existing\nstate-of-the-art(SOTA) methods, achieving AUROCs of 0.9232 and 0.9660 for\nidentifying phosphorylated S/T and Y sites respectively. In addition, ablation\nstudies prove that pretrained model embeddings contribute to the performance of\nPTransIPs. Furthermore, PTransIPs has interpretable amino acid preference,\nvisible training process and shows generalizability on other bioactivity\nclassification tasks. To facilitate usage, our code and data are publicly\naccessible at \\url{https://github.com/StatXzy7/PTransIPs}.\n","authors":["Ziyang Xu","Haitian Zhong"],"pdf_url":"https://arxiv.org/pdf/2308.05115v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11466v2","updated":"2023-08-18T05:58:31Z","published":"2023-06-20T11:41:01Z","title":"Comprehensive Training and Evaluation on Deep Reinforcement Learning for\n Automated Driving in Various Simulated Driving Maneuvers","summary":" Developing and testing automated driving models in the real world might be\nchallenging and even dangerous, while simulation can help with this, especially\nfor challenging maneuvers. Deep reinforcement learning (DRL) has the potential\nto tackle complex decision-making and controlling tasks through learning and\ninteracting with the environment, thus it is suitable for developing automated\ndriving while not being explored in detail yet. This study carried out a\ncomprehensive study by implementing, evaluating, and comparing the two DRL\nalgorithms, Deep Q-networks (DQN) and Trust Region Policy Optimization (TRPO),\nfor training automated driving on the highway-env simulation platform.\nEffective and customized reward functions were developed and the implemented\nalgorithms were evaluated in terms of onlane accuracy (how well the car drives\non the road within the lane), efficiency (how fast the car drives), safety (how\nlikely the car is to crash into obstacles), and comfort (how much the car makes\njerks, e.g., suddenly accelerates or brakes). Results show that the TRPO-based\nmodels with modified reward functions delivered the best performance in most\ncases. Furthermore, to train a uniform driving model that can tackle various\ndriving maneuvers besides the specific ones, this study expanded the\nhighway-env and developed an extra customized training environment, namely,\nComplexRoads, integrating various driving maneuvers and multiple road scenarios\ntogether. Models trained on the designed ComplexRoads environment can adapt\nwell to other driving maneuvers with promising overall performance. Lastly,\nseveral functionalities were added to the highway-env to implement this work.\nThe codes are open on GitHub at https://github.com/alaineman/drlcarsim-paper.\n","authors":["Yongqi Dong","Tobias Datema","Vincent Wassenaar","Joris van de Weg","Cahit Tolga Kopar","Harim Suleman"],"pdf_url":"https://arxiv.org/pdf/2306.11466v2.pdf","comment":"6 pages, 3 figures, accepted by the 26th IEEE International\n Conference on Intelligent Transportation Systems (ITSC 2023)"},{"id":"http://arxiv.org/abs/2308.01050v3","updated":"2023-08-18T05:47:38Z","published":"2023-08-02T09:48:08Z","title":"A Counterfactual Safety Margin Perspective on the Scoring of Autonomous\n Vehicles' Riskiness","summary":" Autonomous Vehicles (AVs) have the potential to provide numerous societal\nbenefits, such as decreased road accidents and increased overall transportation\nefficiency. 
However, quantifying the risk associated with AVs is challenging\ndue to the lack of historical data and the rapidly evolving technology. This\npaper presents a data-driven framework for comparing the risk of different AVs'\nbehaviors in various operational design domains (ODDs), based on counterfactual\nsimulations of \"misbehaving\" road users. We introduce the concept of\ncounterfactual safety margin, which represents the minimum deviation from\nnormal behavior that could lead to a collision. This concept helps to find the\nmost critical scenarios but also to assess the frequency and severity of risk\nof AVs. We show that the proposed methodology is applicable even when the AV's\nbehavioral policy is unknown -- through worst- and best-case analyses -- making\nthe method useful also to external third-party risk assessors. Our experimental\nresults demonstrate the correlation between the safety margin, the driving\npolicy quality, and the ODD shedding light on the relative risk associated with\ndifferent AV providers. This work contributes to AV safety assessment and aids\nin addressing legislative and insurance concerns surrounding this emerging\ntechnology.\n","authors":["Alessandro Zanardi","Andrea Censi","Margherita Atzei","Luigi Di Lillo","Emilio Frazzoli"],"pdf_url":"https://arxiv.org/pdf/2308.01050v3.pdf","comment":"updated affiliations"},{"id":"http://arxiv.org/abs/2308.09318v1","updated":"2023-08-18T05:37:55Z","published":"2023-08-18T05:37:55Z","title":"Towards Attack-tolerant Federated Learning via Critical Parameter\n Analysis","summary":" Federated learning is used to train a shared model in a decentralized way\nwithout clients sharing private data with each other. Federated learning\nsystems are susceptible to poisoning attacks when malicious clients send false\nupdates to the central server. Existing defense strategies are ineffective\nunder non-IID data settings. This paper proposes a new defense strategy, FedCPA\n(Federated learning with Critical Parameter Analysis). Our attack-tolerant\naggregation method is based on the observation that benign local models have\nsimilar sets of top-k and bottom-k critical parameters, whereas poisoned local\nmodels do not. Experiments with different attack scenarios on multiple datasets\ndemonstrate that our model outperforms existing defense strategies in defending\nagainst poisoning attacks.\n","authors":["Sungwon Han","Sungwon Park","Fangzhao Wu","Sundong Kim","Bin Zhu","Xing Xie","Meeyoung Cha"],"pdf_url":"https://arxiv.org/pdf/2308.09318v1.pdf","comment":"ICCV'23 Accepted"},{"id":"http://arxiv.org/abs/2208.12489v2","updated":"2023-08-18T05:26:31Z","published":"2022-08-26T08:00:02Z","title":"GHN-Q: Parameter Prediction for Unseen Quantized Convolutional\n Architectures via Graph Hypernetworks","summary":" Deep convolutional neural network (CNN) training via iterative optimization\nhas had incredible success in finding optimal parameters. However, modern CNN\narchitectures often contain millions of parameters. Thus, any given model for a\nsingle architecture resides in a massive parameter space. Models with similar\nloss could have drastically different characteristics such as adversarial\nrobustness, generalizability, and quantization robustness. For deep learning on\nthe edge, quantization robustness is often crucial. Finding a model that is\nquantization-robust can sometimes require significant efforts. 
Recent works\nusing Graph Hypernetworks (GHN) have shown remarkable performance predicting\nhigh-performant parameters of varying CNN architectures. Inspired by these\nsuccesses, we wonder if the graph representations of GHN-2 can be leveraged to\npredict quantization-robust parameters as well, which we call GHN-Q. We conduct\nthe first-ever study exploring the use of graph hypernetworks for predicting\nparameters of unseen quantized CNN architectures. We focus on a reduced CNN\nsearch space and find that GHN-Q can in fact predict quantization-robust\nparameters for various 8-bit quantized CNNs. Decent quantized accuracies are\nobserved even with 4-bit quantization despite GHN-Q not being trained on it.\nQuantized finetuning of GHN-Q at lower bitwidths may bring further improvements\nand is currently being explored.\n","authors":["Stone Yun","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2208.12489v2.pdf","comment":"Updated Figure 1 and added additional results in Table 1. Initial\n extended abstract version accepted at Edge Intelligence Workshop 2022 for\n poster presentation"},{"id":"http://arxiv.org/abs/2308.09312v1","updated":"2023-08-18T05:19:18Z","published":"2023-08-18T05:19:18Z","title":"Path Signatures for Seizure Forecasting","summary":" Forecasting the state of a system from an observed time series is the subject\nof research in many domains, such as computational neuroscience. Here, the\nprediction of epileptic seizures from brain measurements is an unresolved\nproblem. There are neither complete models describing underlying brain\ndynamics, nor do individual patients exhibit a single seizure onset pattern,\nwhich complicates the development of a `one-size-fits-all' solution. Based on a\nlongitudinal patient data set, we address the automated discovery and\nquantification of statistical features (biomarkers) that can be used to\nforecast seizures in a patient-specific way. We use existing and novel feature\nextraction algorithms, in particular the path signature, a recent development\nin time series analysis. Of particular interest is how this set of complex,\nnonlinear features performs compared to simpler, linear features on this task.\nOur inference is based on statistical classification algorithms with in-built\nsubset selection to discern time series with and without an impending seizure\nwhile selecting only a small number of relevant features. This study may be\nseen as a step towards a generalisable pattern recognition pipeline for time\nseries in a broader context.\n","authors":["Jonas F. Haderlein","Andre D. H. Peterson","Parvin Zarei Eskikand","Mark J. Cook","Anthony N. Burkitt","Iven M. Y. Mareels","David B. Grayden"],"pdf_url":"https://arxiv.org/pdf/2308.09312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09310v1","updated":"2023-08-18T05:11:50Z","published":"2023-08-18T05:11:50Z","title":"Variance reduction techniques for stochastic proximal point algorithms","summary":" In the context of finite sums minimization, variance reduction techniques are\nwidely used to improve the performance of state-of-the-art stochastic gradient\nmethods. Their practical impact is clear, as well as their theoretical\nproperties. Stochastic proximal point algorithms have been studied as an\nalternative to stochastic gradient algorithms since they are more stable with\nrespect to the choice of the stepsize but a proper variance reduced version is\nmissing. In this work, we propose the first study of variance reduction\ntechniques for stochastic proximal point algorithms. 
We introduce a stochastic\nproximal version of SVRG, SAGA, and some of their variants for smooth and\nconvex functions. We provide several convergence results for the iterates and\nthe objective function values. In addition, under the Polyak-{\\L}ojasiewicz\n(PL) condition, we obtain linear convergence rates for the iterates and the\nfunction values. Our numerical experiments demonstrate the advantages of the\nproximal variance reduction methods over their gradient counterparts,\nespecially about the stability with respect to the choice of the step size.\n","authors":["Cheik Traoré","Vassilis Apidopoulos","Saverio Salzo","Silvia Villa"],"pdf_url":"https://arxiv.org/pdf/2308.09310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09309v1","updated":"2023-08-18T05:07:41Z","published":"2023-08-18T05:07:41Z","title":"Meta-learning enhanced next POI recommendation by leveraging check-ins\n from auxiliary cities","summary":" Most existing point-of-interest (POI) recommenders aim to capture user\npreference by employing city-level user historical check-ins, thus facilitating\nusers' exploration of the city. However, the scarcity of city-level user\ncheck-ins brings a significant challenge to user preference learning. Although\nprior studies attempt to mitigate this challenge by exploiting various context\ninformation, e.g., spatio-temporal information, they ignore to transfer the\nknowledge (i.e., common behavioral pattern) from other relevant cities (i.e.,\nauxiliary cities). In this paper, we investigate the effect of knowledge\ndistilled from auxiliary cities and thus propose a novel Meta-learning Enhanced\nnext POI Recommendation framework (MERec). The MERec leverages the correlation\nof check-in behaviors among various cities into the meta-learning paradigm to\nhelp infer user preference in the target city, by holding the principle of\n\"paying more attention to more correlated knowledge\". Particularly, a\ncity-level correlation strategy is devised to attentively capture common\npatterns among cities, so as to transfer more relevant knowledge from more\ncorrelated cities. Extensive experiments verify the superiority of the proposed\nMERec against state-of-the-art algorithms.\n","authors":["Jinze Wang","Lu Zhang","Zhu Sun","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2308.09309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09303v1","updated":"2023-08-18T04:52:56Z","published":"2023-08-18T04:52:56Z","title":"Online Class Incremental Learning on Stochastic Blurry Task Boundary via\n Mask and Visual Prompt Tuning","summary":" Continual learning aims to learn a model from a continuous stream of data,\nbut it mainly assumes a fixed number of data and tasks with clear task\nboundaries. However, in real-world scenarios, the number of input data and\ntasks is constantly changing in a statistical way, not a static way. Although\nrecently introduced incremental learning scenarios having blurry task\nboundaries somewhat address the above issues, they still do not fully reflect\nthe statistical properties of real-world situations because of the fixed ratio\nof disjoint and blurry samples. In this paper, we propose a new Stochastic\nincremental Blurry task boundary scenario, called Si-Blurry, which reflects the\nstochastic properties of the real-world. We find that there are two major\nchallenges in the Si-Blurry scenario: (1) inter- and intra-task forgettings and\n(2) class imbalance problem. To alleviate them, we introduce Mask and Visual\nPrompt tuning (MVP). 
In MVP, to address the inter- and intra-task forgetting\nissues, we propose a novel instance-wise logit masking and contrastive visual\nprompt tuning loss. Both of them help our model discern the classes to be\nlearned in the current batch. It results in consolidating the previous\nknowledge. In addition, to alleviate the class imbalance problem, we introduce\na new gradient similarity-based focal loss and adaptive feature scaling to ease\noverfitting to the major classes and underfitting to the minor classes.\nExtensive experiments show that our proposed MVP significantly outperforms the\nexisting state-of-the-art methods in our challenging Si-Blurry scenario.\n","authors":["Jun-Yeong Moon","Keon-Hee Park","Jung Uk Kim","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2308.09303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09301v1","updated":"2023-08-18T04:49:45Z","published":"2023-08-18T04:49:45Z","title":"Learning Reward Machines through Preference Queries over Sequences","summary":" Reward machines have shown great promise at capturing non-Markovian reward\nfunctions for learning tasks that involve complex action sequencing. However,\nno algorithm currently exists for learning reward machines with realistic weak\nfeedback in the form of preferences. We contribute REMAP, a novel algorithm for\nlearning reward machines from preferences, with correctness and termination\nguarantees. REMAP introduces preference queries in place of membership queries\nin the L* algorithm, and leverages a symbolic observation table along with\nunification and constraint solving to narrow the hypothesis reward machine\nsearch space. In addition to the proofs of correctness and termination for\nREMAP, we present empirical evidence measuring correctness: how frequently the\nresulting reward machine is isomorphic under a consistent yet inexact teacher,\nand the regret between the ground truth and learned reward machines.\n","authors":["Eric Hsiung","Joydeep Biswas","Swarat Chaudhuri"],"pdf_url":"https://arxiv.org/pdf/2308.09301v1.pdf","comment":"24 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.09296v1","updated":"2023-08-18T04:45:56Z","published":"2023-08-18T04:45:56Z","title":"CARLA: A Self-supervised Contrastive Representation Learning Approach\n for Time Series Anomaly Detection","summary":" We introduce a Self-supervised Contrastive Representation Learning Approach\nfor Time Series Anomaly Detection (CARLA), an innovative end-to-end\nself-supervised framework carefully developed to identify anomalous patterns in\nboth univariate and multivariate time series data. By taking advantage of\ncontrastive representation learning, We introduce an innovative end-to-end\nself-supervised deep learning framework carefully developed to identify\nanomalous patterns in both univariate and multivariate time series data. By\ntaking advantage of contrastive representation learning, CARLA effectively\ngenerates robust representations for time series windows. It achieves this by\n1) learning similar representations for temporally close windows and dissimilar\nrepresentations for windows and their equivalent anomalous windows and 2)\nemploying a self-supervised approach to classify normal/anomalous\nrepresentations of windows based on their nearest/furthest neighbours in the\nrepresentation space. Most of the existing models focus on learning normal\nbehaviour. 
The normal boundary is often tightly defined, which can result in\nslight deviations being classified as anomalies, resulting in a high false\npositive rate and limited ability to generalise normal patterns. CARLA's\ncontrastive learning methodology promotes the production of highly consistent\nand discriminative predictions, thereby empowering us to adeptly address the\ninherent challenges associated with anomaly detection in time series data.\nThrough extensive experimentation on 7 standard real-world time series anomaly\ndetection benchmark datasets, CARLA demonstrates F1 and AU-PR superior to\nexisting state-of-the-art results. Our research highlights the immense\npotential of contrastive representation learning in advancing the field of time\nseries anomaly detection, thus paving the way for novel applications and\nin-depth exploration in this domain.\n","authors":["Zahra Zamanzadeh Darban","Geoffrey I. Webb","Shirui Pan","Mahsa Salehi"],"pdf_url":"https://arxiv.org/pdf/2308.09296v1.pdf","comment":"33 pages, 9 figures, 10 tables"},{"id":"http://arxiv.org/abs/2308.09293v1","updated":"2023-08-18T04:35:13Z","published":"2023-08-18T04:35:13Z","title":"How important are specialized transforms in Neural Operators?","summary":" Simulating physical systems using Partial Differential Equations (PDEs) has\nbecome an indispensible part of modern industrial process optimization.\nTraditionally, numerical solvers have been used to solve the associated PDEs,\nhowever recently Transform-based Neural Operators such as the Fourier Neural\nOperator and Wavelet Neural Operator have received a lot of attention for their\npotential to provide fast solutions for systems of PDEs. In this work, we\ninvestigate the importance of the transform layers to the reported success of\ntransform based neural operators. In particular, we record the cost in terms of\nperformance, if all the transform layers are replaced by learnable linear\nlayers. Surprisingly, we observe that linear layers suffice to provide\nperformance comparable to the best-known transform-based layers and seem to do\nso with a compute time advantage as well. We believe that this observation can\nhave significant implications for future work on Neural Operators, and might\npoint to other sources of efficiencies for these architectures.\n","authors":["Ritam Majumdar","Shirish Karande","Lovekesh Vig"],"pdf_url":"https://arxiv.org/pdf/2308.09293v1.pdf","comment":"8 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.09292v1","updated":"2023-08-18T04:33:36Z","published":"2023-08-18T04:33:36Z","title":"Graph-based Alignment and Uniformity for Recommendation","summary":" Collaborative filtering-based recommender systems (RecSys) rely on learning\nrepresentations for users and items to predict preferences accurately.\nRepresentation learning on the hypersphere is a promising approach due to its\ndesirable properties, such as alignment and uniformity. However, the sparsity\nissue arises when it encounters RecSys. To address this issue, we propose a\nnovel approach, graph-based alignment and uniformity (GraphAU), that explicitly\nconsiders high-order connectivities in the user-item bipartite graph. GraphAU\naligns the user/item embedding to the dense vector representations of\nhigh-order neighbors using a neighborhood aggregator, eliminating the need to\ncompute the burdensome alignment to high-order neighborhoods individually. 
To\naddress the discrepancy in alignment losses, GraphAU includes a layer-wise\nalignment pooling module to integrate alignment losses layer-wise. Experiments\non four datasets show that GraphAU significantly alleviates the sparsity issue\nand achieves state-of-the-art performance. We open-source GraphAU at\nhttps://github.com/YangLiangwei/GraphAU.\n","authors":["Liangwei Yang","Zhiwei Liu","Chen Wang","Mingdai Yang","Xiaolong Liu","Jing Ma","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.09292v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2308.09290v1","updated":"2023-08-18T04:29:48Z","published":"2023-08-18T04:29:48Z","title":"HyperLoRA for PDEs","summary":" Physics-informed neural networks (PINNs) have been widely used to develop\nneural surrogates for solutions of Partial Differential Equations. A drawback\nof PINNs is that they have to be retrained with every change in\ninitial-boundary conditions and PDE coefficients. The Hypernetwork, a\nmodel-based meta learning technique, takes in a parameterized task embedding as\ninput and predicts the weights of PINN as output. Predicting weights of a\nneural network however, is a high-dimensional regression problem, and\nhypernetworks perform sub-optimally while predicting parameters for large base\nnetworks. To circumvent this issue, we use a low ranked adaptation (LoRA)\nformulation to decompose every layer of the base network into low-ranked\ntensors and use hypernetworks to predict the low-ranked tensors. Despite the\nreduced dimensionality of the resulting weight-regression problem, LoRA-based\nHypernetworks violate the underlying physics of the given task. We demonstrate\nthat the generalization capabilities of LoRA-based hypernetworks drastically\nimprove when trained with an additional physics-informed loss component\n(HyperPINN) to satisfy the governing differential equations. We observe that\nLoRA-based HyperPINN training allows us to learn fast solutions for\nparameterized PDEs like Burger's equation and Navier Stokes: Kovasznay flow,\nwhile having an 8x reduction in prediction parameters on average without\ncompromising on accuracy when compared to all other baselines.\n","authors":["Ritam Majumdar","Vishal Jadhav","Anirudh Deodhar","Shirish Karande","Lovekesh Vig","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2308.09290v1.pdf","comment":"8 pages, 4 figures, 3 Tables"},{"id":"http://arxiv.org/abs/2306.15905v2","updated":"2023-08-18T04:24:24Z","published":"2023-06-28T04:03:31Z","title":"Dimension Independent Mixup for Hard Negative Sample in Collaborative\n Filtering","summary":" Collaborative filtering (CF) is a widely employed technique that predicts\nuser preferences based on past interactions. Negative sampling plays a vital\nrole in training CF-based models with implicit feedback. In this paper, we\npropose a novel perspective based on the sampling area to revisit existing\nsampling methods. We point out that current sampling methods mainly focus on\nPoint-wise or Line-wise sampling, lacking flexibility and leaving a significant\nportion of the hard sampling area un-explored. To address this limitation, we\npropose Dimension Independent Mixup for Hard Negative Sampling (DINS), which is\nthe first Area-wise sampling method for training CF-based models. DINS\ncomprises three modules: Hard Boundary Definition, Dimension Independent Mixup,\nand Multi-hop Pooling. 
Experiments with real-world datasets on both matrix\nfactorization and graph-based models demonstrate that DINS outperforms other\nnegative sampling methods, establishing its effectiveness and superiority. Our\nwork contributes a new perspective, introduces Area-wise sampling, and presents\nDINS as a novel approach that achieves state-of-the-art performance for\nnegative sampling. Our implementations are available in PyTorch.\n","authors":["Xi Wu","Liangwei Yang","Jibing Gong","Chao Zhou","Tianyu Lin","Xiaolong Liu","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2306.15905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00214v2","updated":"2023-08-18T04:00:55Z","published":"2023-08-01T01:12:29Z","title":"Robust Single-view Cone-beam X-ray Pose Estimation with Neural Tuned\n Tomography (NeTT) and Masked Neural Radiance Fields (mNeRF)","summary":" Many tasks performed in image-guided, mini-invasive, medical procedures can\nbe cast as pose estimation problems, where an X-ray projection is utilized to\nreach a target in 3D space. Expanding on recent advances in the differentiable\nrendering of optically reflective materials, we introduce new methods for pose\nestimation of radiolucent objects using X-ray projections, and we demonstrate\nthe critical role of optimal view synthesis in performing this task. We first\ndevelop an algorithm (DiffDRR) that efficiently computes Digitally\nReconstructed Radiographs (DRRs) and leverages automatic differentiation within\nTensorFlow. Pose estimation is performed by iterative gradient descent using a\nloss function that quantifies the similarity of the DRR synthesized from a\nrandomly initialized pose and the true fluoroscopic image at the target pose.\nWe propose two novel methods for high-fidelity view synthesis, Neural Tuned\nTomography (NeTT) and masked Neural Radiance Fields (mNeRF). Both methods rely\non classic Cone-Beam Computerized Tomography (CBCT); NeTT directly optimizes\nthe CBCT densities, while the non-zero values of mNeRF are constrained by a 3D\nmask of the anatomic region segmented from CBCT. We demonstrate that both NeTT\nand mNeRF distinctly improve pose estimation within our framework. By defining\na successful pose estimate to be a 3D angle error of less than 3 deg, we find\nthat NeTT and mNeRF can achieve similar results, both with overall success\nrates more than 93%. However, the computational cost of NeTT is significantly\nlower than mNeRF in both training and pose estimation. Furthermore, we show\nthat a NeTT trained for a single subject can generalize to synthesize\nhigh-fidelity DRRs and ensure robust pose estimations for all other subjects.\nTherefore, we suggest that NeTT is an attractive option for robust pose\nestimation using fluoroscopic projections.\n","authors":["Chaochao Zhou","Syed Hasib Akhter Faruqui","Abhinav Patel","Ramez N. Abdalla","Michael C. Hurley","Ali Shaibani","Matthew B. Potts","Babak S. Jahromi","Leon Cho","Sameer A. Ansari","Donald R. Cantrell"],"pdf_url":"https://arxiv.org/pdf/2308.00214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09274v1","updated":"2023-08-18T03:32:55Z","published":"2023-08-18T03:32:55Z","title":"A hybrid Decoder-DeepONet operator regression framework for unaligned\n observation data","summary":" Deep neural operators (DNOs) have been utilized to approximate nonlinear\nmappings between function spaces. However, DNOs face the challenge of increased\ndimensionality and computational cost associated with unaligned observation\ndata. 
In this study, we propose a hybrid Decoder-DeepONet operator regression\nframework to handle unaligned data effectively. Additionally, we introduce a\nMulti-Decoder-DeepONet, which utilizes an average field of training data as\ninput augmentation. The consistencies of the frameworks with the operator\napproximation theory are provided, on the basis of the universal approximation\ntheorem. Two numerical experiments, Darcy problem and flow-field around an\nairfoil, are conducted to validate the efficiency and accuracy of the proposed\nmethods. Results illustrate the advantages of Decoder-DeepONet and\nMulti-Decoder-DeepONet in handling unaligned observation data and showcase\ntheir potentials in improving prediction accuracy.\n","authors":["Bo Chen","Chenyu Wang","Weipeng Li","Haiyang Fu"],"pdf_url":"https://arxiv.org/pdf/2308.09274v1.pdf","comment":"35 pages, 10 figures, 11 tables"},{"id":"http://arxiv.org/abs/2308.06342v2","updated":"2023-08-18T03:21:55Z","published":"2023-08-11T18:31:54Z","title":"Mirror Diffusion Models","summary":" Diffusion models have successfully been applied to generative tasks in\nvarious continuous domains. However, applying diffusion to discrete categorical\ndata remains a non-trivial task. Moreover, generation in continuous domains\noften requires clipping in practice, which motivates the need for a theoretical\nframework for adapting diffusion to constrained domains. Inspired by the mirror\nLangevin algorithm for the constrained sampling problem, in this theoretical\nreport we propose Mirror Diffusion Models (MDMs). We demonstrate MDMs in the\ncontext of simplex diffusion and propose natural extensions to popular domains\nsuch as image and text generation.\n","authors":["Jaesung Tae"],"pdf_url":"https://arxiv.org/pdf/2308.06342v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14396v4","updated":"2023-08-18T03:18:51Z","published":"2022-10-26T00:23:36Z","title":"FeDXL: Provable Federated Learning for Deep X-Risk Optimization","summary":" In this paper, we tackle a novel federated learning (FL) problem for\noptimizing a family of X-risks, to which no existing FL algorithms are\napplicable. In particular, the objective has the form of $\\mathbb E_{z\\sim S_1}\nf(\\mathbb E_{z'\\sim S_2} \\ell(w; z, z'))$, where two sets of data $S_1, S_2$\nare distributed over multiple machines, $\\ell(\\cdot)$ is a pairwise loss that\nonly depends on the prediction outputs of the input data pairs $(z, z')$, and\n$f(\\cdot)$ is possibly a non-linear non-convex function. This problem has\nimportant applications in machine learning, e.g., AUROC maximization with a\npairwise loss, and partial AUROC maximization with a compositional loss. The\nchallenges for designing an FL algorithm for X-risks lie in the\nnon-decomposability of the objective over multiple machines and the\ninterdependency between different machines. To this end, we propose an\nactive-passive decomposition framework that decouples the gradient's components\nwith two types, namely active parts and passive parts, where the active parts\ndepend on local data that are computed with the local model and the passive\nparts depend on other machines that are communicated/computed based on\nhistorical models and samples. Under this framework, we develop two provable FL\nalgorithms (FeDXL) for handling linear and nonlinear $f$, respectively, based\non federated averaging and merging. 
We develop a novel theoretical analysis to\ncombat the latency of the passive parts and the interdependency between the\nlocal model parameters and the involved data for computing local gradient\nestimators. We establish both iteration and communication complexities and show\nthat using the historical samples and models for computing the passive parts do\nnot degrade the complexities. We conduct empirical studies of FeDXL for deep\nAUROC and partial AUROC maximization, and demonstrate their performance\ncompared with several baselines.\n","authors":["Zhishuai Guo","Rong Jin","Jiebo Luo","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2210.14396v4.pdf","comment":"International Conference on Machine Learning, 2023"},{"id":"http://arxiv.org/abs/2306.01683v2","updated":"2023-08-18T03:11:43Z","published":"2023-06-02T16:58:15Z","title":"Balancing Exploration and Exploitation: Disentangled $β$-CVAE in De\n Novo Drug Design","summary":" Deep generative models have recently emerged as a promising de novo drug\ndesign method. In this respect, deep generative conditional variational\nautoencoder (CVAE) models are a powerful approach for generating novel\nmolecules with desired drug-like properties. However, molecular graph-based\nmodels with disentanglement and multivariate explicit latent conditioning have\nnot been fully elucidated. To address this, we proposed a molecular-graph\n$\\beta$-CVAE model for de novo drug design. Here, we empirically tuned the\nvalue of disentanglement and assessed its ability to generate molecules with\noptimised univariate- or-multivariate properties. In particular, we optimised\nthe octanol-water partition coefficient (ClogP), molar refractivity (CMR),\nquantitative estimate of drug-likeness (QED), and synthetic accessibility score\n(SAS). Results suggest that a lower $\\beta$ value increases the uniqueness of\ngenerated molecules (exploration). Univariate optimisation results showed our\nmodel generated molecular property averages of ClogP = 41.07% $\\pm$ 0.01% and\nCMR 66.76% $\\pm$ 0.01% by the Ghose filter. Multivariate property optimisation\nresults showed that our model generated an average of 30.07% $\\pm$ 0.01%\nmolecules for both desired properties. Furthermore, our model improved the QED\nand SAS (exploitation) of molecules generated. Together, these results suggest\nthat the $\\beta$-CVAE could balance exploration and exploitation through\ndisentanglement and is a promising model for de novo drug design, thus\nproviding a basis for future studies.\n","authors":["Guang Jun Nicholas Ang","De Tao Irwin Chin","Bingquan Shen"],"pdf_url":"https://arxiv.org/pdf/2306.01683v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01168v3","updated":"2023-08-18T02:38:06Z","published":"2023-04-03T17:37:00Z","title":"DeepAccident: A Motion and Accident Prediction Benchmark for V2X\n Autonomous Driving","summary":" Safety is the primary priority of autonomous driving. Nevertheless, no\npublished dataset currently supports the direct and explainable safety\nevaluation for autonomous driving. In this work, we propose DeepAccident, a\nlarge-scale dataset generated via a realistic simulator containing diverse\naccident scenarios that frequently occur in real-world driving. The proposed\nDeepAccident dataset includes 57K annotated frames and 285K annotated samples,\napproximately 7 times more than the large-scale nuScenes dataset with 40k\nannotated samples. 
In addition, we propose a new task, end-to-end motion and\naccident prediction, which can be used to directly evaluate the accident\nprediction ability for different autonomous driving algorithms. Furthermore,\nfor each scenario, we set four vehicles along with one infrastructure to record\ndata, thus providing diverse viewpoints for accident scenarios and enabling V2X\n(vehicle-to-everything) research on perception and prediction tasks. Finally,\nwe present a baseline V2X model named V2XFormer that demonstrates superior\nperformance for motion and accident prediction and 3D object detection compared\nto the single-vehicle model.\n","authors":["Tianqi Wang","Sukmin Kim","Wenxuan Ji","Enze Xie","Chongjian Ge","Junsong Chen","Zhenguo Li","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2304.01168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09262v1","updated":"2023-08-18T02:36:21Z","published":"2023-08-18T02:36:21Z","title":"Multi-Task Pseudo-Label Learning for Non-Intrusive Speech Quality\n Assessment Model","summary":" This study introduces multi-task pseudo-label (MPL) learning for a\nnon-intrusive speech quality assessment model. MPL consists of two stages which\nare obtaining pseudo-label scores from a pretrained model and performing\nmulti-task learning. The 3QUEST metrics, namely Speech-MOS (S-MOS), Noise-MOS\n(N-MOS), and General-MOS (G-MOS) are selected as the primary ground-truth\nlabels. Additionally, the pretrained MOSA-Net model is utilized to estimate\nthree pseudo-labels: perceptual evaluation of speech quality (PESQ), short-time\nobjective intelligibility (STOI), and speech distortion index (SDI). Multi-task\nlearning stage of MPL is then employed to train the MTQ-Net model (multi-target\nspeech quality assessment network). The model is optimized by incorporating\nLoss supervision (derived from the difference between the estimated score and\nthe real ground-truth labels) and Loss semi-supervision (derived from the\ndifference between the estimated score and pseudo-labels), where Huber loss is\nemployed to calculate the loss function. Experimental results first demonstrate\nthe advantages of MPL compared to training the model from scratch and using\nknowledge transfer mechanisms. Secondly, the benefits of Huber Loss in\nimproving the prediction model of MTQ-Net are verified. Finally, the MTQ-Net\nwith the MPL approach exhibits higher overall prediction capabilities when\ncompared to other SSL-based speech assessment models.\n","authors":["Ryandhimas E. Zezario","Bo-Ren Brian Bai","Chiou-Shann Fuh","Hsin-Min Wang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2308.09262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09259v1","updated":"2023-08-18T02:34:37Z","published":"2023-08-18T02:34:37Z","title":"Distribution shift mitigation at test time with performance guarantees","summary":" Due to inappropriate sample selection and limited training data, a\ndistribution shift often exists between the training and test sets. This shift\ncan adversely affect the test performance of Graph Neural Networks (GNNs).\nExisting approaches mitigate this issue by either enhancing the robustness of\nGNNs to distribution shift or reducing the shift itself. However, both\napproaches necessitate retraining the model, which becomes unfeasible when the\nmodel structure and parameters are inaccessible. 
To address this challenge, we\npropose FR-GNN, a general framework for GNNs to conduct feature reconstruction.\nFRGNN constructs a mapping relationship between the output and input of a\nwell-trained GNN to obtain class representative embeddings and then uses these\nembeddings to reconstruct the features of labeled nodes. These reconstructed\nfeatures are then incorporated into the message passing mechanism of GNNs to\ninfluence the predictions of unlabeled nodes at test time. Notably, the\nreconstructed node features can be directly utilized for testing the\nwell-trained model, effectively reducing the distribution shift and leading to\nimproved test performance. This remarkable achievement is attained without any\nmodifications to the model structure or parameters. We provide theoretical\nguarantees for the effectiveness of our framework. Furthermore, we conduct\ncomprehensive experiments on various public datasets. The experimental results\ndemonstrate the superior performance of FRGNN in comparison to mainstream\nmethods.\n","authors":["Rui Ding","Jielong Yang","Feng Ji","Xionghu Zhong","Linbo Xie"],"pdf_url":"https://arxiv.org/pdf/2308.09259v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09250v1","updated":"2023-08-18T02:24:32Z","published":"2023-08-18T02:24:32Z","title":"Capacity Bounds for Hyperbolic Neural Network Representations of Latent\n Tree Structures","summary":" We study the representation capacity of deep hyperbolic neural networks\n(HNNs) with a ReLU activation function. We establish the first proof that HNNs\ncan $\\varepsilon$-isometrically embed any finite weighted tree into a\nhyperbolic space of dimension $d$ at least equal to $2$ with prescribed\nsectional curvature $\\kappa<0$, for any $\\varepsilon> 1$ (where $\\varepsilon=1$\nbeing optimal). We establish rigorous upper bounds for the network complexity\non an HNN implementing the embedding. We find that the network complexity of\nHNN implementing the graph representation is independent of the representation\nfidelity/distortion. We contrast this result against our lower bounds on\ndistortion which any ReLU multi-layer perceptron (MLP) must exert when\nembedding a tree with $L>2^d$ leaves into a $d$-dimensional Euclidean space,\nwhich we show at least $\\Omega(L^{1/d})$; independently of the depth, width,\nand (possibly discontinuous) activation function defining the MLP.\n","authors":["Anastasis Kratsios","Ruiyang Hong","Haitz Sáez de Ocáriz Borde"],"pdf_url":"https://arxiv.org/pdf/2308.09250v1.pdf","comment":"22 Pages + References, 1 Table, 4 Figures"},{"id":"http://arxiv.org/abs/2308.09248v1","updated":"2023-08-18T02:23:48Z","published":"2023-08-18T02:23:48Z","title":"Active and Passive Causal Inference Learning","summary":" This paper serves as a starting point for machine learning researchers,\nengineers and students who are interested in but not yet familiar with causal\ninference. We start by laying out an important set of assumptions that are\ncollectively needed for causal identification, such as exchangeability,\npositivity, consistency and the absence of interference. From these\nassumptions, we build out a set of important causal inference techniques, which\nwe do so by categorizing them into two buckets; active and passive approaches.\nWe describe and discuss randomized controlled trials and bandit-based\napproaches from the active category. 
We then describe classical approaches,\nsuch as matching and inverse probability weighting, in the passive category,\nfollowed by more recent deep learning based algorithms. By finishing the paper\nwith some of the missing aspects of causal inference from this paper, such as\ncollider biases, we expect this paper to provide readers with a diverse set of\nstarting points for further reading and research in causal inference and\ndiscovery.\n","authors":["Daniel Jiwoong Im","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2308.09248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09228v1","updated":"2023-08-18T01:20:25Z","published":"2023-08-18T01:20:25Z","title":"Generalized Sum Pooling for Metric Learning","summary":" A common architectural choice for deep metric learning is a convolutional\nneural network followed by global average pooling (GAP). Albeit simple, GAP is\na highly effective way to aggregate information. One possible explanation for\nthe effectiveness of GAP is considering each feature vector as representing a\ndifferent semantic entity and GAP as a convex combination of them. Following\nthis perspective, we generalize GAP and propose a learnable generalized sum\npooling method (GSP). GSP improves GAP with two distinct abilities: i) the\nability to choose a subset of semantic entities, effectively learning to ignore\nnuisance information, and ii) learning the weights corresponding to the\nimportance of each entity. Formally, we propose an entropy-smoothed optimal\ntransport problem and show that it is a strict generalization of GAP, i.e., a\nspecific realization of the problem gives back GAP. We show that this\noptimization problem enjoys analytical gradients enabling us to use it as a\ndirect learnable replacement for GAP. We further propose a zero-shot loss to\nease the learning of GSP. We show the effectiveness of our method with\nextensive evaluations on 4 popular metric learning benchmarks. Code is\navailable at: GSP-DML Framework\n","authors":["Yeti Z. Gurbuz","Ozan Sener","A. Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2308.09228v1.pdf","comment":"Accepted as a conference paper at International Conference on\n Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2202.00636v2","updated":"2023-08-18T01:18:04Z","published":"2022-01-31T18:59:19Z","title":"Differentially Private Community Detection for Stochastic Block Models","summary":" The goal of community detection over graphs is to recover underlying\nlabels/attributes of users (e.g., political affiliation) given the connectivity\nbetween users (represented by adjacency matrix of a graph). There has been\nsignificant recent progress on understanding the fundamental limits of\ncommunity detection when the graph is generated from a stochastic block model\n(SBM). Specifically, sharp information theoretic limits and efficient\nalgorithms have been obtained for SBMs as a function of $p$ and $q$, which\nrepresent the intra-community and inter-community connection probabilities. In\nthis paper, we study the community detection problem while preserving the\nprivacy of the individual connections (edges) between the vertices. 
Focusing on\nthe notion of $(\\epsilon, \\delta)$-edge differential privacy (DP), we seek to\nunderstand the fundamental tradeoffs between $(p, q)$, DP budget $(\\epsilon,\n\\delta)$, and computational efficiency for exact recovery of the community\nlabels.\n To this end, we present and analyze the associated information-theoretic\ntradeoffs for three broad classes of differentially private community recovery\nmechanisms: a) stability based mechanism; b) sampling based mechanisms; and c)\ngraph perturbation mechanisms. Our main findings are that stability and\nsampling based mechanisms lead to a superior tradeoff between $(p,q)$ and the\nprivacy budget $(\\epsilon, \\delta)$; however this comes at the expense of\nhigher computational complexity. On the other hand, albeit low complexity,\ngraph perturbation mechanisms require the privacy budget $\\epsilon$ to scale as\n$\\Omega(\\log(n))$ for exact recovery. To the best of our knowledge, this is the\nfirst work to study the impact of privacy constraints on the fundamental limits\nfor community detection.\n","authors":["Mohamed Seif","Dung Nguyen","Anil Vullikanti","Ravi Tandon"],"pdf_url":"https://arxiv.org/pdf/2202.00636v2.pdf","comment":"ICML 2022. https://proceedings.mlr.press/v162/mohamed22a.html"},{"id":"http://arxiv.org/abs/2308.09223v1","updated":"2023-08-18T00:48:30Z","published":"2023-08-18T00:48:30Z","title":"DMCVR: Morphology-Guided Diffusion Model for 3D Cardiac Volume\n Reconstruction","summary":" Accurate 3D cardiac reconstruction from cine magnetic resonance imaging\n(cMRI) is crucial for improved cardiovascular disease diagnosis and\nunderstanding of the heart's motion. However, current cardiac MRI-based\nreconstruction technology used in clinical settings is 2D with limited\nthrough-plane resolution, resulting in low-quality reconstructed cardiac\nvolumes. To better reconstruct 3D cardiac volumes from sparse 2D image stacks,\nwe propose a morphology-guided diffusion model for 3D cardiac volume\nreconstruction, DMCVR, that synthesizes high-resolution 2D images and\ncorresponding 3D reconstructed volumes. Our method outperforms previous\napproaches by conditioning the cardiac morphology on the generative model,\neliminating the time-consuming iterative optimization process of the latent\ncode, and improving generation quality. The learned latent spaces provide\nglobal semantics, local cardiac morphology and details of each 2D cMRI slice\nwith highly interpretable value to reconstruct 3D cardiac shape. Our\nexperiments show that DMCVR is highly effective in several aspects, such as 2D\ngeneration and 3D reconstruction performance. With DMCVR, we can produce\nhigh-resolution 3D cardiac MRI reconstructions, surpassing current techniques.\nOur proposed framework has great potential for improving the accuracy of\ncardiac disease diagnosis and treatment planning. Code can be accessed at\nhttps://github.com/hexiaoxiao-cs/DMCVR.\n","authors":["Xiaoxiao He","Chaowei Tan","Ligong Han","Bo Liu","Leon Axel","Kang Li","Dimitris N. 
Metaxas"],"pdf_url":"https://arxiv.org/pdf/2308.09223v1.pdf","comment":"Accepted in MICCAI 2023"},{"id":"http://arxiv.org/abs/2307.00117v2","updated":"2023-08-18T00:47:00Z","published":"2023-06-30T20:09:39Z","title":"Goal Representations for Instruction Following: A Semi-Supervised\n Language Interface to Control","summary":" Our goal is for robots to follow natural language instructions like \"put the\ntowel next to the microwave.\" But getting large amounts of labeled data, i.e.\ndata that contains demonstrations of tasks labeled with the language\ninstruction, is prohibitive. In contrast, obtaining policies that respond to\nimage goals is much easier, because any autonomous trial or demonstration can\nbe labeled in hindsight with its final state as the goal. In this work, we\ncontribute a method that taps into joint image- and goal- conditioned policies\nwith language using only a small amount of language data. Prior work has made\nprogress on this using vision-language models or by jointly training\nlanguage-goal-conditioned policies, but so far neither method has scaled\neffectively to real-world robot tasks without significant human annotation. Our\nmethod achieves robust performance in the real world by learning an embedding\nfrom the labeled data that aligns language not to the goal image, but rather to\nthe desired change between the start and goal images that the instruction\ncorresponds to. We then train a policy on this embedding: the policy benefits\nfrom all the unlabeled data, but the aligned embedding provides an interface\nfor language to steer the policy. We show instruction following across a\nvariety of manipulation tasks in different scenes, with generalization to\nlanguage instructions outside of the labeled data. Videos and code for our\napproach can be found on our website: https://rail-berkeley.github.io/grif/ .\n","authors":["Vivek Myers","Andre He","Kuan Fang","Homer Walke","Philippe Hansen-Estruch","Ching-An Cheng","Mihai Jalobeanu","Andrey Kolobov","Anca Dragan","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2307.00117v2.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2211.07643v2","updated":"2023-08-18T00:32:21Z","published":"2022-11-13T13:57:14Z","title":"Secure and Privacy-Preserving Automated Machine Learning Operations into\n End-to-End Integrated IoT-Edge-Artificial Intelligence-Blockchain Monitoring\n System for Diabetes Mellitus Prediction","summary":" Diabetes Mellitus, one of the leading causes of death worldwide, has no cure\nto date and can lead to severe health complications, such as retinopathy, limb\namputation, cardiovascular diseases, and neuronal disease, if left untreated.\nConsequently, it becomes crucial to take precautionary measures to\navoid/predict the occurrence of diabetes. Machine learning approaches have been\nproposed and evaluated in the literature for diabetes prediction. This paper\nproposes an IoT-edge-Artificial Intelligence (AI)-blockchain system for\ndiabetes prediction based on risk factors. The proposed system is underpinned\nby the blockchain to obtain a cohesive view of the risk factors data from\npatients across different hospitals and to ensure security and privacy of the\nuser's data. Furthermore, we provide a comparative analysis of different\nmedical sensors, devices, and methods to measure and collect the risk factors\nvalues in the system. 
Numerical experiments and comparative analysis were\ncarried out between our proposed system, using the most accurate random forest\n(RF) model, and the two most used state-of-the-art machine learning approaches,\nLogistic Regression (LR) and Support Vector Machine (SVM), using three\nreal-life diabetes datasets. The results show that the proposed system using RF\npredicts diabetes with 4.57% more accuracy on average compared to LR and SVM,\nwith 2.87 times more execution time. Data balancing without feature selection\ndoes not show significant improvement. The performance is improved by 1.14% and\n0.02% after feature selection for PIMA Indian and Sylhet datasets respectively,\nwhile it reduces by 0.89% for MIMIC III.\n","authors":["Alain Hennebelle","Leila Ismail","Huned Materwala","Juma Al Kaabi","Priya Ranjan","Rajiv Janardhanan"],"pdf_url":"https://arxiv.org/pdf/2211.07643v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.09685v1","updated":"2023-08-18T17:13:45Z","published":"2023-08-18T17:13:45Z","title":"Audiovisual Moments in Time: A Large-Scale Annotated Dataset of\n Audiovisual Actions","summary":" We present Audiovisual Moments in Time (AVMIT), a large-scale dataset of\naudiovisual action events. In an extensive annotation task 11 participants\nlabelled a subset of 3-second audiovisual videos from the Moments in Time\ndataset (MIT). For each trial, participants assessed whether the labelled\naudiovisual action event was present and whether it was the most prominent\nfeature of the video. The dataset includes the annotation of 57,177 audiovisual\nvideos, each independently evaluated by 3 of 11 trained participants. From this\ninitial collection, we created a curated test set of 16 distinct action\nclasses, with 60 videos each (960 videos). We also offer 2 sets of pre-computed\naudiovisual feature embeddings, using VGGish/YamNet for audio data and\nVGG16/EfficientNetB0 for visual data, thereby lowering the barrier to entry for\naudiovisual DNN research. We explored the advantages of AVMIT annotations and\nfeature embeddings to improve performance on audiovisual event recognition. A\nseries of 6 Recurrent Neural Networks (RNNs) were trained on either\nAVMIT-filtered audiovisual events or modality-agnostic events from MIT, and\nthen tested on our audiovisual test set. In all RNNs, top 1 accuracy was\nincreased by 2.71-5.94\\% by training exclusively on audiovisual events, even\noutweighing a three-fold increase in training data. We anticipate that the\nnewly annotated AVMIT dataset will serve as a valuable resource for research\nand comparative experiments involving computational models and human\nparticipants, specifically when addressing research questions where audiovisual\ncorrespondence is of critical importance.\n","authors":["Michael Joannou","Pia Rotshtein","Uta Noppeney"],"pdf_url":"https://arxiv.org/pdf/2308.09685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09678v1","updated":"2023-08-18T16:57:25Z","published":"2023-08-18T16:57:25Z","title":"PoSynDA: Multi-Hypothesis Pose Synthesis Domain Adaptation for Robust 3D\n Human Pose Estimation","summary":" The current 3D human pose estimators face challenges in adapting to new\ndatasets due to the scarcity of 2D-3D pose pairs in target domain training\nsets. We present the \\textit{Multi-Hypothesis \\textbf{P}ose \\textbf{Syn}thesis\n\\textbf{D}omain \\textbf{A}daptation} (\\textbf{PoSynDA}) framework to overcome\nthis issue without extensive target domain annotation. 
Utilizing a\ndiffusion-centric structure, PoSynDA simulates the 3D pose distribution in the\ntarget domain, filling the data diversity gap. By incorporating a\nmulti-hypothesis network, it creates diverse pose hypotheses and aligns them\nwith the target domain. Target-specific source augmentation obtains the target\ndomain distribution data from the source domain by decoupling the scale and\nposition parameters. The teacher-student paradigm and low-rank adaptation\nfurther refine the process. PoSynDA demonstrates competitive performance on\nbenchmarks, such as Human3.6M, MPI-INF-3DHP, and 3DPW, even comparable with the\ntarget-trained MixSTE model~\\cite{zhang2022mixste}. This work paves the way for\nthe practical application of 3D human pose estimation. The code is available at\nhttps://github.com/hbing-l/PoSynDA.\n","authors":["Hanbing Liu","Jun-Yan He","Zhi-Qi Cheng","Wangmeng Xiang","Qize Yang","Wenhao Chai","Gaoang Wang","Xu Bao","Bin Luo","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2308.09678v1.pdf","comment":"Accepted to ACM Multimedia 2023; 10 pages, 4 figures, 8 tables; the\n code is at https://github.com/hbing-l/PoSynDA"},{"id":"http://arxiv.org/abs/2305.16437v2","updated":"2023-08-18T16:54:02Z","published":"2023-05-25T19:30:21Z","title":"KeyPosS: Plug-and-Play Facial Landmark Detection through GPS-Inspired\n True-Range Multilateration","summary":" In the realm of facial analysis, accurate landmark detection is crucial for\nvarious applications, ranging from face recognition and expression analysis to\nanimation. Conventional heatmap or coordinate regression-based techniques,\nhowever, often face challenges in terms of computational burden and\nquantization errors. To address these issues, we present the KeyPoint\nPositioning System (KeyPosS) - a groundbreaking facial landmark detection\nframework that stands out from existing methods. The framework utilizes a fully\nconvolutional network to predict a distance map, which computes the distance\nbetween a Point of Interest (POI) and multiple anchor points. These anchor\npoints are ingeniously harnessed to triangulate the POI's position through the\nTrue-range Multilateration algorithm. Notably, the plug-and-play nature of\nKeyPosS enables seamless integration into any decoding stage, ensuring a\nversatile and adaptable solution. We conducted a thorough evaluation of\nKeyPosS's performance by benchmarking it against state-of-the-art models on\nfour different datasets. The results show that KeyPosS substantially\noutperforms leading methods in low-resolution settings while requiring a\nminimal time overhead. The code is available at\nhttps://github.com/zhiqic/KeyPosS.\n","authors":["Xu Bao","Zhi-Qi Cheng","Jun-Yan He","Chenyang Li","Wangmeng Xiang","Jingdong Sun","Hanbing Liu","Wei Liu","Bin Luo","Yifeng Geng","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2305.16437v2.pdf","comment":"Accepted to ACM Multimedia 2023; 10 pages, 7 figures, 6 tables; the\n code is at https://github.com/zhiqic/KeyPosS"},{"id":"http://arxiv.org/abs/2308.09599v1","updated":"2023-08-18T14:54:13Z","published":"2023-08-18T14:54:13Z","title":"Language-Guided Diffusion Model for Visual Grounding","summary":" Visual grounding (VG) tasks involve explicit cross-modal alignment, as\nsemantically corresponding image regions are to be located for the language\nphrases provided. Existing approaches complete such visual-text reasoning in a\nsingle-step manner. 
Their performance causes high demands on large-scale\nanchors and over-designed multi-modal fusion modules based on human priors,\nleading to complicated frameworks that may be difficult to train and overfit to\nspecific scenarios. Even worse, such once-for-all reasoning mechanisms are\nincapable of refining boxes continuously to enhance query-region matching. In\ncontrast, in this paper, we formulate an iterative reasoning process by\ndenoising diffusion modeling. Specifically, we propose a language-guided\ndiffusion framework for visual grounding, LG-DVG, which trains the model to\nprogressively reason queried object boxes by denoising a set of noisy boxes\nwith the language guide. To achieve this, LG-DVG gradually perturbs\nquery-aligned ground truth boxes to noisy ones and reverses this process step\nby step, conditional on query semantics. Extensive experiments for our proposed\nframework on five widely used datasets validate the superior performance of\nsolving visual grounding, a cross-modal alignment task, in a generative way.\nThe source codes are available at\n\\url{https://github.com/iQua/vgbase/tree/DiffusionVG}.\n","authors":["Sijia Chen","Baochun Li"],"pdf_url":"https://arxiv.org/pdf/2308.09599v1.pdf","comment":"20 pages, 16 figures"},{"id":"http://arxiv.org/abs/2306.10054v2","updated":"2023-08-18T09:47:18Z","published":"2023-06-13T13:54:49Z","title":"A Shift In Artistic Practices through Artificial Intelligence","summary":" The explosion of content generated by Artificial Intelligence models has\ninitiated a cultural shift in arts, music, and media, where roles are changing,\nvalues are shifting, and conventions are challenged. The readily available,\nvast dataset of the internet has created an environment for AI models to be\ntrained on any content on the web. With AI models shared openly, and used by\nmany, globally, how does this new paradigm shift challenge the status quo in\nartistic practices? What kind of changes will AI technology bring into music,\narts, and new media?\n","authors":["Kıvanç Tatar","Petter Ericson","Kelsey Cotton","Paola Torres Núñez del Prado","Roser Batlle-Roca","Beatriz Cabrero-Daniel","Sara Ljungblad","Georgios Diapoulis","Jabbar Hussain"],"pdf_url":"https://arxiv.org/pdf/2306.10054v2.pdf","comment":"Submitted to Leonardo Journal"},{"id":"http://arxiv.org/abs/2303.14637v3","updated":"2023-08-18T07:52:08Z","published":"2023-03-26T06:09:53Z","title":"Improved Nonlinear Transform Source-Channel Coding to Catalyze Semantic\n Communications","summary":" Recent deep learning methods have led to increased interest in solving\nhigh-efficiency end-to-end transmission problems. These methods, we call\nnonlinear transform source-channel coding (NTSCC), extract the semantic latent\nfeatures of source signal, and learn entropy model to guide the joint\nsource-channel coding with variable rate to transmit latent features over\nwireless channels. In this paper, we propose a comprehensive framework for\nimproving NTSCC, thereby higher system coding gain, better model versatility,\nand more flexible adaptation strategy aligned with semantic guidance are all\nachieved. This new sophisticated NTSCC model is now ready to support large-size\ndata interaction in emerging XR, which catalyzes the application of semantic\ncommunications. 
Specifically, we propose three useful improvement approaches.\nFirst, we introduce a contextual entropy model to better capture the spatial\ncorrelations among the semantic latent features, thereby more accurate rate\nallocation and contextual joint source-channel coding are developed accordingly\nto enable higher coding gain. On that basis, we further propose response\nnetwork architectures to formulate versatile NTSCC, i.e., once-trained model\nsupports various rates and channel states that benefits the practical\ndeployment. Following this, we propose an online latent feature editing method\nto enable more flexible coding rate control aligned with some specific semantic\nguidance. By comprehensively applying the above three improvement methods for\nNTSCC, a deployment-friendly semantic coded transmission system stands out\nfinally. Our improved NTSCC system has been experimentally verified to achieve\nconsiderable bandwidth saving versus the state-of-the-art engineered VTM + 5G\nLDPC coded transmission system with lower processing latency.\n","authors":["Sixian Wang","Jincheng Dai","Xiaoqi Qin","Zhongwei Si","Kai Niu","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.14637v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09357v1","updated":"2023-08-18T07:38:30Z","published":"2023-08-18T07:38:30Z","title":"Multi-scale Target-Aware Framework for Constrained Image Splicing\n Detection and Localization","summary":" Constrained image splicing detection and localization (CISDL) is a\nfundamental task of multimedia forensics, which detects splicing operation\nbetween two suspected images and localizes the spliced region on both images.\nRecent works regard it as a deep matching problem and have made significant\nprogress. However, existing frameworks typically perform feature extraction and\ncorrelation matching as separate processes, which may hinder the model's\nability to learn discriminative features for matching and can be susceptible to\ninterference from ambiguous background pixels. In this work, we propose a\nmulti-scale target-aware framework to couple feature extraction and correlation\nmatching in a unified pipeline. In contrast to previous methods, we design a\ntarget-aware attention mechanism that jointly learns features and performs\ncorrelation matching between the probe and donor images. Our approach can\neffectively promote the collaborative learning of related patches, and perform\nmutual promotion of feature learning and correlation matching. Additionally, in\norder to handle scale transformations, we introduce a multi-scale projection\nmethod, which can be readily integrated into our target-aware framework that\nenables the attention process to be conducted between tokens containing\ninformation of varying scales. Our experiments demonstrate that our model,\nwhich uses a unified pipeline, outperforms state-of-the-art methods on several\nbenchmark datasets and is robust against scale transformations.\n","authors":["Yuxuan Tan","Yuanman Li","Limin Zeng","Jiaxiong Ye","Wei wang","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2308.09357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09351v1","updated":"2023-08-18T07:17:09Z","published":"2023-08-18T07:17:09Z","title":"RLIPv2: Fast Scaling of Relational Language-Image Pre-training","summary":" Relational Language-Image Pre-training (RLIP) aims to align vision\nrepresentations with relational texts, thereby advancing the capability of\nrelational reasoning in computer vision tasks. 
However, hindered by the slow\nconvergence of RLIPv1 architecture and the limited availability of existing\nscene graph data, scaling RLIPv1 is challenging. In this paper, we propose\nRLIPv2, a fast converging model that enables the scaling of relational\npre-training to large-scale pseudo-labelled scene graph data. To enable fast\nscaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism\nthat facilitates earlier and deeper gated cross-modal fusion with sparsified\nlanguage encoding layers. ALIF leads to comparable or better performance than\nRLIPv1 in a fraction of the time for pre-training and fine-tuning. To obtain\nscene graph data at scale, we extend object detection datasets with free-form\nrelation labels by introducing a captioner (e.g., BLIP) and a designed Relation\nTagger. The Relation Tagger assigns BLIP-generated relation texts to region\npairs, thus enabling larger-scale relational pre-training. Through extensive\nexperiments conducted on Human-Object Interaction Detection and Scene Graph\nGeneration, RLIPv2 shows state-of-the-art performance on three benchmarks under\nfully-finetuning, few-shot and zero-shot settings. Notably, the largest RLIPv2\nachieves 23.29mAP on HICO-DET without any fine-tuning, yields 32.22mAP with\njust 1% data and yields 45.09mAP with 100% data. Code and models are publicly\navailable at https://github.com/JacobYuan7/RLIPv2.\n","authors":["Hangjie Yuan","Shiwei Zhang","Xiang Wang","Samuel Albanie","Yining Pan","Tao Feng","Jianwen Jiang","Dong Ni","Yingya Zhang","Deli Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.09351v1.pdf","comment":"Accepted to ICCV 2023. Code and models:\n https://github.com/JacobYuan7/RLIPv2"},{"id":"http://arxiv.org/abs/2308.09332v1","updated":"2023-08-18T06:27:35Z","published":"2023-08-18T06:27:35Z","title":"LSCD: A Large-Scale Screen Content Dataset for Video Compression","summary":" Multimedia compression allows us to watch videos, see pictures and hear\nsounds within a limited bandwidth, which helps the flourish of the internet.\nDuring the past decades, multimedia compression has achieved great success\nusing hand-craft features and systems. With the development of artificial\nintelligence and video compression, there emerges a lot of research work\nrelated to using the neural network on the video compression task to get rid of\nthe complicated system. Not only producing the advanced algorithms, but\nresearchers also spread the compression to different content, such as User\nGenerated Content(UGC). With the rapid development of mobile devices, screen\ncontent videos become an important part of multimedia data. In contrast, we\nfind community lacks a large-scale dataset for screen content video\ncompression, which impedes the fast development of the corresponding\nlearning-based algorithms. In order to fulfill this blank and accelerate the\nresearch of this special type of videos, we propose the Large-scale Screen\nContent Dataset(LSCD), which contains 714 source sequences. Meanwhile, we\nprovide the analysis of the proposed dataset to show some features of screen\ncontent videos, which will help researchers have a better understanding of how\nto explore new algorithms. 
Besides collecting and post-processing the data to\norganize the dataset, we also provide a benchmark containing the performance of\nboth traditional codec and learning-based methods.\n","authors":["Yuhao Cheng","Siru Zhang","Yiqiang Yan","Rong Chen","Yun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09322v1","updated":"2023-08-18T05:46:20Z","published":"2023-08-18T05:46:20Z","title":"Audio-Visual Glance Network for Efficient Video Recognition","summary":" Deep learning has made significant strides in video understanding tasks, but\nthe computation required to classify lengthy and massive videos using\nclip-level video classifiers remains impractical and prohibitively expensive.\nTo address this issue, we propose Audio-Visual Glance Network (AVGN), which\nleverages the commonly available audio and visual modalities to efficiently\nprocess the spatio-temporally important parts of a video. AVGN firstly divides\nthe video into snippets of image-audio clip pair and employs lightweight\nunimodal encoders to extract global visual features and audio features. To\nidentify the important temporal segments, we use an Audio-Visual Temporal\nSaliency Transformer (AV-TeST) that estimates the saliency scores of each\nframe. To further increase efficiency in the spatial dimension, AVGN processes\nonly the important patches instead of the whole images. We use an\nAudio-Enhanced Spatial Patch Attention (AESPA) module to produce a set of\nenhanced coarse visual features, which are fed to a policy network that\nproduces the coordinates of the important patches. This approach enables us to\nfocus only on the most important spatio-temporally parts of the video, leading\nto more efficient video recognition. Moreover, we incorporate various training\ntechniques and multi-modal feature fusion to enhance the robustness and\neffectiveness of our AVGN. By combining these strategies, our AVGN sets new\nstate-of-the-art performance in multiple video recognition benchmarks while\nachieving faster processing speed.\n","authors":["Muhammad Adi Nugroho","Sangmin Woo","Sumin Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09322v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09302v1","updated":"2023-08-18T04:51:15Z","published":"2023-08-18T04:51:15Z","title":"Robust Audio Anti-Spoofing with Fusion-Reconstruction Learning on\n Multi-Order Spectrograms","summary":" Robust audio anti-spoofing has been increasingly challenging due to the\nrecent advancements on deepfake techniques. While spectrograms have\ndemonstrated their capability for anti-spoofing, complementary information\npresented in multi-order spectral patterns have not been well explored, which\nlimits their effectiveness for varying spoofing attacks. Therefore, we propose\na novel deep learning method with a spectral fusion-reconstruction strategy,\nnamely S2pecNet, to utilise multi-order spectral patterns for robust audio\nanti-spoofing representations. Specifically, spectral patterns up to\nsecond-order are fused in a coarse-to-fine manner and two branches are designed\nfor the fine-level fusion from the spectral and temporal contexts. A\nreconstruction from the fused representation to the input spectrograms further\nreduces the potential fused information loss. 
Our method achieved the\nstate-of-the-art performance with an EER of 0.77% on a widely used dataset:\nASVspoof2019 LA Challenge.\n","authors":["Penghui Wen","Kun Hu","Wenxi Yue","Sen Zhang","Wanlei Zhou","Zhiyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09300v1","updated":"2023-08-18T04:49:38Z","published":"2023-08-18T04:49:38Z","title":"V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by\n Connecting Foundation Models","summary":" Building artificial intelligence (AI) systems on top of a set of foundation\nmodels (FMs) is becoming a new paradigm in AI research. Their representative\nand generative abilities learnt from vast amounts of data can be easily adapted\nand transferred to a wide range of downstream tasks without extra training from\nscratch. However, leveraging FMs in cross-modal generation remains\nunder-researched when audio modality is involved. On the other hand,\nautomatically generating semantically-relevant sound from visual input is an\nimportant problem in cross-modal generation studies. To solve this\nvision-to-audio (V2A) generation problem, existing methods tend to design and\nbuild complex systems from scratch using modestly sized datasets. In this\npaper, we propose a lightweight solution to this problem by leveraging\nfoundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate\nthe domain gap between the latent space of the visual CLIP and the auditory\nCLAP models. Then we propose a simple yet effective mapper mechanism\n(V2A-Mapper) to bridge the domain gap by translating the visual input between\nCLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained\naudio generative FM AudioLDM is adopted to produce high-fidelity and\nvisually-aligned sound. Compared to previous approaches, our method only\nrequires a quick training of the V2A-Mapper. We further analyze and conduct\nextensive experiments on the choice of the V2A-Mapper and show that a\ngenerative mapper is better at fidelity and variability (FD) while a regression\nmapper is slightly better at relevance (CS). Both objective and subjective\nevaluation on two V2A datasets demonstrate the superiority of our proposed\nmethod compared to current state-of-the-art approaches - trained with 86% fewer\nparameters but achieving 53% and 19% improvement in FD and CS, respectively.\n","authors":["Heng Wang","Jianbo Ma","Santiago Pascual","Richard Cartwright","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2308.09300v1.pdf","comment":"13 pages, 10 figures. Code, demo, and samples:\n https://v2a-mapper.github.io/"},{"id":"http://arxiv.org/abs/2308.09289v1","updated":"2023-08-18T04:19:36Z","published":"2023-08-18T04:19:36Z","title":"Preference-conditioned Pixel-based AI Agent For Game Testing","summary":" The game industry is challenged to cope with increasing growth in demand and\ngame complexity while maintaining acceptable quality standards for released\ngames. Classic approaches solely depending on human efforts for quality\nassurance and game testing do not scale effectively in terms of time and cost.\nGame-testing AI agents that learn by interaction with the environment have the\npotential to mitigate these challenges with good scalability properties on time\nand costs. However, most recent work in this direction depends on game state\ninformation for the agent's state representation, which limits generalization\nacross different game scenarios. 
Moreover, game test engineers usually prefer\nexploring a game in a specific style, such as exploring the golden path.\nHowever, current game testing AI agents do not provide an explicit way to\nsatisfy such a preference. This paper addresses these limitations by proposing\nan agent design that mainly depends on pixel-based state observations while\nexploring the environment conditioned on a user's preference specified by\ndemonstration trajectories. In addition, we propose an imitation learning\nmethod that couples self-supervised and supervised learning objectives to\nenhance the quality of imitation behaviors. Our agent significantly outperforms\nstate-of-the-art pixel-based game testing agents over exploration coverage and\ntest execution quality when evaluated on a complex open-world environment\nresembling many aspects of real AAA games.\n","authors":["Sherif Abdelfattah","Adrian Brown","Pushi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06087v2","updated":"2023-08-18T03:19:52Z","published":"2023-08-11T11:57:58Z","title":"Audio-Visual Spatial Integration and Recursive Attention for Robust\n Sound Source Localization","summary":" The objective of the sound source localization task is to enable machines to\ndetect the location of sound-making objects within a visual scene. While the\naudio modality provides spatial cues to locate the sound source, existing\napproaches only use audio as an auxiliary role to compare spatial regions of\nthe visual modality. Humans, on the other hand, utilize both audio and visual\nmodalities as spatial cues to locate sound sources. In this paper, we propose\nan audio-visual spatial integration network that integrates spatial cues from\nboth modalities to mimic human behavior when detecting sound-making objects.\nAdditionally, we introduce a recursive attention network to mimic human\nbehavior of iterative focusing on objects, resulting in more accurate attention\nregions. To effectively encode spatial information from both modalities, we\npropose audio-visual pair matching loss and spatial region alignment loss. By\nutilizing the spatial cues of audio-visual modalities and recursively focusing\nobjects, our method can perform more robust sound source localization.\nComprehensive experimental results on the Flickr SoundNet and VGG-Sound Source\ndatasets demonstrate the superiority of our proposed method over existing\napproaches. Our code is available at: https://github.com/VisualAIKHU/SIRA-SSL\n","authors":["Sung Jin Um","Dongjin Kim","Jung Uk Kim"],"pdf_url":"https://arxiv.org/pdf/2308.06087v2.pdf","comment":"Camera-Ready, ACM MM 2023"},{"id":"http://arxiv.org/abs/2304.07567v2","updated":"2023-08-18T02:32:17Z","published":"2023-04-15T14:08:47Z","title":"CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure\n for Vision-Language Retrieval","summary":" Current vision-language retrieval aims to perform cross-modal instance\nsearch, in which the core idea is to learn the consistent visionlanguage\nrepresentations. Although the performance of cross-modal retrieval has greatly\nimproved with the development of deep models, we unfortunately find that\ntraditional hard consistency may destroy the original relationships among\nsingle-modal instances, leading the performance degradation for single-modal\nretrieval. 
To address this challenge, in this paper, we experimentally observe\nthat the vision-language divergence may cause the existence of strong and weak\nmodalities, and the hard cross-modal consistency cannot guarantee that strong\nmodal instances' relationships are not affected by weak modality, resulting in\nthe strong modal instances' relationships perturbed despite learned consistent\nrepresentations.To this end, we propose a novel and directly Coordinated\nVisionLanguage Retrieval method (dubbed CoVLR), which aims to study and\nalleviate the desynchrony problem between the cross-modal alignment and\nsingle-modal cluster-preserving tasks. CoVLR addresses this challenge by\ndeveloping an effective meta-optimization based strategy, in which the\ncross-modal consistency objective and the intra-modal relation preserving\nobjective are acted as the meta-train and meta-test tasks, thereby CoVLR\nencourages both tasks to be optimized in a coordinated way. Consequently, we\ncan simultaneously insure cross-modal consistency and intra-modal structure.\nExperiments on different datasets validate CoVLR can improve single-modal\nretrieval accuracy whilst preserving crossmodal retrieval capacity compared\nwith the baselines.\n","authors":["Yang Yang","Zhongtian Fu","Xiangyu Wu","Wenjie Li"],"pdf_url":"https://arxiv.org/pdf/2304.07567v2.pdf","comment":"I apologize for my operational mistake, which has resulted in the\n absence of a revised version of the manuscript. Furthermore, I am concerned\n that the submission process of this paper may potentially lead to conflicts.\n Therefore, I kindly request the withdrawal of the manuscript"}]},"2023-08-17T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.01284v2","updated":"2023-08-17T22:34:38Z","published":"2023-08-02T17:11:37Z","title":"Fighting Fire with Fire: Can ChatGPT Detect AI-generated Text?","summary":" Large language models (LLMs) such as ChatGPT are increasingly being used for\nvarious use cases, including text content generation at scale. Although\ndetection methods for such AI-generated text exist already, we investigate\nChatGPT's performance as a detector on such AI-generated text, inspired by\nworks that use ChatGPT as a data labeler or annotator. We evaluate the\nzero-shot performance of ChatGPT in the task of human-written vs. AI-generated\ntext detection, and perform experiments on publicly available datasets. We\nempirically investigate if ChatGPT is symmetrically effective in detecting\nAI-generated or human-written text. Our findings provide insight on how ChatGPT\nand similar LLMs may be leveraged in automated detection pipelines by simply\nfocusing on solving a specific aspect of the problem and deriving the rest from\nthat solution. All code and data is available at\nhttps://github.com/AmritaBh/ChatGPT-as-Detector.\n","authors":["Amrita Bhattacharjee","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.01284v2.pdf","comment":"to appear in SIGKDD Explorations (December 2023)"},{"id":"http://arxiv.org/abs/2211.09699v4","updated":"2023-08-17T21:43:24Z","published":"2022-11-15T19:07:53Z","title":"PromptCap: Prompt-Guided Task-Aware Image Captioning","summary":" Knowledge-based visual question answering (VQA) involves questions that\nrequire world knowledge beyond the image to yield the correct answer. Large\nlanguage models (LMs) like GPT-3 are particularly helpful for this task because\nof their strong knowledge retrieval and reasoning capabilities. 
To enable LM to\nunderstand images, prior work uses a captioning model to convert images into\ntext. However, when summarizing an image in a single caption sentence, which\nvisual entities to describe are often underspecified. Generic image captions\noften miss visual details essential for the LM to answer visual questions\ncorrectly. To address this challenge, we propose PromptCap (Prompt-guided image\nCaptioning), a captioning model designed to serve as a better connector between\nimages and black-box LMs. Different from generic captions, PromptCap takes a\nnatural-language prompt to control the visual entities to describe in the\ngenerated caption. The prompt contains a question that the caption should aid\nin answering. To avoid extra annotation, PromptCap is trained by examples\nsynthesized with GPT-3 and existing datasets. We demonstrate PromptCap's\neffectiveness on an existing pipeline in which GPT-3 is prompted with image\ncaptions to carry out VQA. PromptCap outperforms generic captions by a large\nmargin and achieves state-of-the-art accuracy on knowledge-based VQA tasks\n(60.4% on OK-VQA and 59.6% on A-OKVQA). Zero-shot results on WebQA show that\nPromptCap generalizes well to unseen domains.\n","authors":["Yushi Hu","Hang Hua","Zhengyuan Yang","Weijia Shi","Noah A Smith","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2211.09699v4.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09193v1","updated":"2023-08-17T21:36:56Z","published":"2023-08-17T21:36:56Z","title":"A Comparative Study of Text Embedding Models for Semantic Text\n Similarity in Bug Reports","summary":" Bug reports are an essential aspect of software development, and it is\ncrucial to identify and resolve them quickly to ensure the consistent\nfunctioning of software systems. Retrieving similar bug reports from an\nexisting database can help reduce the time and effort required to resolve bugs.\nIn this paper, we compared the effectiveness of semantic textual similarity\nmethods for retrieving similar bug reports based on a similarity score. We\nexplored several embedding models such as TF-IDF (Baseline), FastText, Gensim,\nBERT, and ADA. We used the Software Defects Data containing bug reports for\nvarious software projects to evaluate the performance of these models. Our\nexperimental results showed that BERT generally outperformed the rest of the\nmodels regarding recall, followed by ADA, Gensim, FastText, and TFIDF. Our\nstudy provides insights into the effectiveness of different embedding methods\nfor retrieving similar bug reports and highlights the impact of selecting the\nappropriate one for this task. Our code is available on GitHub.\n","authors":["Avinash Patil","Kihwan Han","Sabyasachi Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2308.09193v1.pdf","comment":"7 Pages"},{"id":"http://arxiv.org/abs/2308.09186v1","updated":"2023-08-17T21:10:04Z","published":"2023-08-17T21:10:04Z","title":"Is Argument Structure of Learner Chinese Understandable: A Corpus-Based\n Analysis","summary":" This paper presents a corpus-based analysis of argument structure errors in\nlearner Chinese. The data for analysis includes sentences produced by language\nlearners as well as their corrections by native speakers. We couple the data\nwith semantic role labeling annotations that are manually created by two senior\nstudents whose majors are both Applied Linguistics. The annotation procedure is\nguided by the Chinese PropBank specification, which is originally developed to\ncover first language phenomena. 
Nevertheless, we find that it is quite\ncomprehensive for handling second language phenomena. The inter-annotator\nagreement is rather high, suggesting the understandability of learner texts to\nnative speakers. Based on our annotations, we present a preliminary analysis of\ncompetence errors related to argument structure. In particular, speech errors\nrelated to word order, word selection, lack of proposition, and\nargument-adjunct confounding are discussed.\n","authors":["Yuguang Duan","Zi Lin","Weiwei Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09186v1.pdf","comment":"Proceedings of the 2018 International Conference on Bilingual\n Learning and Teaching (ICBLT-2018)"},{"id":"http://arxiv.org/abs/2308.06975v2","updated":"2023-08-17T20:51:40Z","published":"2023-08-14T07:20:49Z","title":"Can Knowledge Graphs Simplify Text?","summary":" Knowledge Graph (KG)-to-Text Generation has seen recent improvements in\ngenerating fluent and informative sentences which describe a given KG. As KGs\nare widespread across multiple domains and contain important entity-relation\ninformation, and as text simplification aims to reduce the complexity of a text\nwhile preserving the meaning of the original text, we propose KGSimple, a novel\napproach to unsupervised text simplification which infuses KG-established\ntechniques in order to construct a simplified KG path and generate a concise\ntext which preserves the original input's meaning. Through an iterative and\nsampling KG-first approach, our model is capable of simplifying text when\nstarting from a KG by learning to keep important information while harnessing\nKG-to-text generation to output fluent and descriptive sentences. We evaluate\nvarious settings of the KGSimple model on currently-available KG-to-text\ndatasets, demonstrating its effectiveness compared to unsupervised text\nsimplification models which start with a given complex text. Our code is\navailable on GitHub.\n","authors":["Anthony Colas","Haodi Ma","Xuanli He","Yang Bai","Daisy Zhe Wang"],"pdf_url":"https://arxiv.org/pdf/2308.06975v2.pdf","comment":"Accepted as a Main Conference Long Paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2307.13923v2","updated":"2023-08-17T19:58:42Z","published":"2023-07-26T02:45:38Z","title":"GrammarGPT: Exploring Open-Source LLMs for Native Chinese Grammatical\n Error Correction with Supervised Fine-Tuning","summary":" Grammatical error correction aims to correct ungrammatical sentences\nautomatically. Recently, some work has demonstrated the excellent capabilities\nof closed-source Large Language Models (LLMs, e.g., ChatGPT) in grammatical\nerror correction. However, the potential of open-source LLMs remains\nunexplored. In this paper, we introduced GrammarGPT, an open-source LLM, to\npreliminary explore its potential for native Chinese grammatical error\ncorrection. The core recipe of GrammarGPT is to leverage the hybrid dataset of\nChatGPT-generated and human-annotated. For grammatical errors with clues, we\nproposed a heuristic method to guide ChatGPT to generate ungrammatical\nsentences by providing those clues. For grammatical errors without clues, we\ncollected ungrammatical sentences from publicly available websites and manually\ncorrected them. In addition, we employed an error-invariant augmentation method\nto enhance the ability of the model to correct native Chinese grammatical\nerrors. 
We ultimately constructed about 1k parallel data and utilized these\ndata to fine-tune open-source LLMs (e.g., Phoenix, released by The Chinese\nUniversity of Hong Kong, Shenzhen) with instruction tuning. The experimental\nresults show that GrammarGPT outperforms the existing SOTA system\nsignificantly. Although model parameters are 20x larger than the SOTA baseline,\nthe required amount of data for instruction tuning is 1200x smaller,\nillustrating the potential of open-source LLMs on native CGEC. Our GrammarGPT\nranks $3^{rd}$ on NLPCC2023 SharedTask1, demonstrating our approach's\neffectiveness. The code and data are available at\n\\url{https://github.com/FreedomIntelligence/GrammarGPT}.\n","authors":["Yaxin Fan","Feng Jiang","Peifeng Li","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2307.13923v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13512v2","updated":"2023-08-17T19:23:20Z","published":"2023-05-22T21:59:26Z","title":"Can ChatGPT Detect Intent? Evaluating Large Language Models for Spoken\n Language Understanding","summary":" Recently, large pretrained language models have demonstrated strong language\nunderstanding capabilities. This is particularly reflected in their zero-shot\nand in-context learning abilities on downstream tasks through prompting. To\nassess their impact on spoken language understanding (SLU), we evaluate several\nsuch models like ChatGPT and OPT of different sizes on multiple benchmarks. We\nverify the emergent ability unique to the largest models as they can reach\nintent classification accuracy close to that of supervised models with zero or\nfew shots on various languages given oracle transcripts. By contrast, the\nresults for smaller models fitting a single GPU fall far behind. We note that\nthe error cases often arise from the annotation scheme of the dataset;\nresponses from ChatGPT are still reasonable. We show, however, that the model\nis worse at slot filling, and its performance is sensitive to ASR errors,\nsuggesting serious challenges for the application of those textual models on\nSLU.\n","authors":["Mutian He","Philip N. Garner"],"pdf_url":"https://arxiv.org/pdf/2305.13512v2.pdf","comment":"6 pages, 2 figures; Accepted by Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.09158v1","updated":"2023-08-17T19:12:13Z","published":"2023-08-17T19:12:13Z","title":"ZhiJian: A Unifying and Rapidly Deployable Toolbox for Pre-trained Model\n Reuse","summary":" The rapid expansion of foundation pre-trained models and their fine-tuned\ncounterparts has significantly contributed to the advancement of machine\nlearning. Leveraging pre-trained models to extract knowledge and expedite\nlearning in real-world tasks, known as \"Model Reuse\", has become crucial in\nvarious applications. Previous research focuses on reusing models within a\ncertain aspect, including reusing model weights, structures, and hypothesis\nspaces. This paper introduces ZhiJian, a comprehensive and user-friendly\ntoolbox for model reuse, utilizing the PyTorch backend. ZhiJian presents a\nnovel paradigm that unifies diverse perspectives on model reuse, encompassing\ntarget architecture construction with PTM, tuning target model with PTM, and\nPTM-based inference. This empowers deep learning practitioners to explore\ndownstream tasks and identify the complementary advantages among different\nmethods. 
ZhiJian is readily accessible at\nhttps://github.com/zhangyikaii/lamda-zhijian facilitating seamless utilization\nof pre-trained models and streamlining the model reuse process for researchers\nand developers.\n","authors":["Yi-Kai Zhang","Lu Ren","Chao Yi","Qi-Wei Wang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2308.09158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09156v1","updated":"2023-08-17T19:08:42Z","published":"2023-08-17T19:08:42Z","title":"Characterizing Information Seeking Events in Health-Related Social\n Discourse","summary":" Social media sites have become a popular platform for individuals to seek and\nshare health information. Despite the progress in natural language processing\nfor social media mining, a gap remains in analyzing health-related texts on\nsocial discourse in the context of events. Event-driven analysis can offer\ninsights into different facets of healthcare at an individual and collective\nlevel, including treatment options, misconceptions, knowledge gaps, etc. This\npaper presents a paradigm to characterize health-related information-seeking in\nsocial discourse through the lens of events. Events here are board categories\ndefined with domain experts that capture the trajectory of the\ntreatment/medication. To illustrate the value of this approach, we analyze\nReddit posts regarding medications for Opioid Use Disorder (OUD), a critical\nglobal health concern. To the best of our knowledge, this is the first attempt\nto define event categories for characterizing information-seeking in OUD social\ndiscourse. Guided by domain experts, we develop TREAT-ISE, a novel multilabel\ntreatment information-seeking event dataset to analyze online discourse on an\nevent-based framework. This dataset contains Reddit posts on\ninformation-seeking events related to recovery from OUD, where each post is\nannotated based on the type of events. We also establish a strong performance\nbenchmark (77.4% F1 score) for the task by employing several machine learning\nand deep learning classifiers. Finally, we thoroughly investigate the\nperformance and errors of ChatGPT on this task, providing valuable insights\ninto the LLM's capabilities and ongoing characterization efforts.\n","authors":["Omar Sharif","Madhusudan Basak","Tanzia Parvin","Ava Scharfstein","Alphonso Bradham","Jacob T. Borodovsky","Sarah E. Lord","Sarah Masud Preum"],"pdf_url":"https://arxiv.org/pdf/2308.09156v1.pdf","comment":"Under review AAAI-2024. 10 pages, 6 tables, 2 figues"},{"id":"http://arxiv.org/abs/2308.07633v2","updated":"2023-08-17T18:16:24Z","published":"2023-08-15T08:31:05Z","title":"A Survey on Model Compression for Large Language Models","summary":" Large Language Models (LLMs) have revolutionized natural language processing\ntasks with remarkable success. However, their formidable size and computational\ndemands present significant challenges for practical deployment, especially in\nresource-constrained environments. As these challenges become increasingly\npertinent, the field of model compression has emerged as a pivotal research\narea to alleviate these limitations. This paper presents a comprehensive survey\nthat navigates the landscape of model compression techniques tailored\nspecifically for LLMs. Addressing the imperative need for efficient deployment,\nwe delve into various methodologies, encompassing quantization, pruning,\nknowledge distillation, and more. 
Within each of these techniques, we highlight\nrecent advancements and innovative approaches that contribute to the evolving\nlandscape of LLM research. Furthermore, we explore benchmarking strategies and\nevaluation metrics that are essential for assessing the effectiveness of\ncompressed LLMs. By providing insights into the latest developments and\npractical implications, this survey serves as an invaluable resource for both\nresearchers and practitioners. As LLMs continue to evolve, this survey aims to\nfacilitate enhanced efficiency and real-world applicability, establishing a\nfoundation for future advancements in the field.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2308.07633v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09138v1","updated":"2023-08-17T18:11:33Z","published":"2023-08-17T18:11:33Z","title":"Semantic Consistency for Assuring Reliability of Large Language Models","summary":" Large Language Models (LLMs) exhibit remarkable fluency and competence across\nvarious natural language tasks. However, recent research has highlighted their\nsensitivity to variations in input prompts. To deploy LLMs in a safe and\nreliable manner, it is crucial for their outputs to be consistent when prompted\nwith expressions that carry the same meaning or intent. While some existing\nwork has explored how state-of-the-art LLMs address this issue, their\nevaluations have been confined to assessing lexical equality of single- or\nmulti-word answers, overlooking the consistency of generative text sequences.\nFor a more comprehensive understanding of the consistency of LLMs in open-ended\ntext generation scenarios, we introduce a general measure of semantic\nconsistency, and formulate multiple versions of this metric to evaluate the\nperformance of various LLMs. Our proposal demonstrates significantly higher\nconsistency and stronger correlation with human evaluations of output\nconsistency than traditional metrics based on lexical consistency. Finally, we\npropose a novel prompting strategy, called Ask-to-Choose (A2C), to enhance\nsemantic consistency. When evaluated for closed-book question answering based\non answer variations from the TruthfulQA benchmark, A2C increases accuracy\nmetrics for pretrained and finetuned LLMs by up to 47%, and semantic\nconsistency metrics for instruction-tuned models by up to 7-fold.\n","authors":["Harsh Raj","Vipul Gupta","Domenic Rosati","Subhabrata Majumdar"],"pdf_url":"https://arxiv.org/pdf/2308.09138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09126v1","updated":"2023-08-17T17:59:59Z","published":"2023-08-17T17:59:59Z","title":"EgoSchema: A Diagnostic Benchmark for Very Long-form Video Language\n Understanding","summary":" We introduce EgoSchema, a very long-form video question-answering dataset,\nand benchmark to evaluate long video understanding capabilities of modern\nvision and language systems. Derived from Ego4D, EgoSchema consists of over\n5000 human curated multiple choice question answer pairs, spanning over 250\nhours of real video data, covering a very broad range of natural human activity\nand behavior. For each question, EgoSchema requires the correct answer to be\nselected between five given options based on a three-minute-long video clip.\nWhile some prior works have proposed video datasets with long clip lengths, we\nposit that merely the length of the video clip does not truly capture the\ntemporal difficulty of the video task that is being considered. 
To remedy this,\nwe introduce temporal certificate sets, a general notion for capturing the\nintrinsic temporal understanding length associated with a broad range of video\nunderstanding tasks & datasets. Based on this metric, we find EgoSchema to have\nintrinsic temporal lengths over 5.7x longer than the second closest dataset and\n10x to 100x longer than any other video understanding dataset. Further, our\nevaluation of several current state-of-the-art video and language models shows\nthem to be severely lacking in long-term video understanding capabilities. Even\nmodels with several billions of parameters achieve QA accuracy less than 33%\n(random is 20%) on the EgoSchema multi-choice question answering task, while\nhumans achieve about 76% accuracy. We posit that \\name{}{}, with its long\nintrinsic temporal structures and diverse complexity, would serve as a valuable\nevaluation probe for developing effective long-term video understanding systems\nin the future. Data and Zero-shot model evaluation code are open-sourced for\nboth public and commercial use under the Ego4D license at\nhttp://egoschema.github.io\n","authors":["Karttikeya Mangalam","Raiymbek Akshulakov","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2308.09126v1.pdf","comment":"https://egoschema.github.io/"},{"id":"http://arxiv.org/abs/2308.09124v1","updated":"2023-08-17T17:59:19Z","published":"2023-08-17T17:59:19Z","title":"Linearity of Relation Decoding in Transformer Language Models","summary":" Much of the knowledge encoded in transformer language models (LMs) may be\nexpressed in terms of relations: relations between words and their synonyms,\nentities and their attributes, etc. We show that, for a subset of relations,\nthis computation is well-approximated by a single linear transformation on the\nsubject representation. Linear relation representations may be obtained by\nconstructing a first-order approximation to the LM from a single prompt, and\nthey exist for a variety of factual, commonsense, and linguistic relations.\nHowever, we also identify many cases in which LM predictions capture relational\nknowledge accurately, but this knowledge is not linearly encoded in their\nrepresentations. Our results thus reveal a simple, interpretable, but\nheterogeneously deployed knowledge representation strategy in transformer LMs.\n","authors":["Evan Hernandez","Arnab Sen Sharma","Tal Haklay","Kevin Meng","Martin Wattenberg","Jacob Andreas","Yonatan Belinkov","David Bau"],"pdf_url":"https://arxiv.org/pdf/2308.09124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09115v1","updated":"2023-08-17T17:51:05Z","published":"2023-08-17T17:51:05Z","title":"MaScQA: A Question Answering Dataset for Investigating Materials Science\n Knowledge of Large Language Models","summary":" Information extraction and textual comprehension from materials literature\nare vital for developing an exhaustive knowledge base that enables accelerated\nmaterials discovery. Language models have demonstrated their capability to\nanswer domain-specific questions and retrieve information from knowledge bases.\nHowever, there are no benchmark datasets in the materials domain that can\nevaluate the understanding of the key concepts by these language models. In\nthis work, we curate a dataset of 650 challenging questions from the materials\ndomain that require the knowledge and skills of a materials student who has\ncleared their undergraduate degree. 
We classify these questions based on their\nstructure and the materials science domain-based subcategories. Further, we\nevaluate the performance of GPT-3.5 and GPT-4 models on solving these questions\nvia zero-shot and chain of thought prompting. It is observed that GPT-4 gives\nthe best performance (~62% accuracy) as compared to GPT-3.5. Interestingly, in\ncontrast to the general observation, no significant improvement in accuracy is\nobserved with the chain of thought prompting. To evaluate the limitations, we\nperformed an error analysis, which revealed conceptual errors (~64%) as the\nmajor contributor compared to computational errors (~36%) towards the reduced\nperformance of LLMs. We hope that the dataset and analysis performed in this\nwork will promote further research in developing better materials science\ndomain-specific LLMs and strategies for information extraction.\n","authors":["Mohd Zaki"," Jayadeva"," Mausam","N. M. Anoop Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.09115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01752v3","updated":"2023-08-17T17:22:41Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaptation\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v3.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2305.12031v2","updated":"2023-08-17T17:19:02Z","published":"2023-05-19T23:07:09Z","title":"Clinical Camel: An Open Expert-Level Medical Language Model with\n Dialogue-Based Knowledge Encoding","summary":" We present Clinical Camel, an open large language model (LLM) explicitly\ntailored for clinical research. Fine-tuned from LLaMA-2 using QLoRA, Clinical\nCamel achieves state-of-the-art performance across medical benchmarks among\nopenly available medical LLMs. 
Leveraging efficient single-GPU training,\nClinical Camel surpasses GPT-3.5 in five-shot evaluations on all assessed\nbenchmarks, including 64.3% on the USMLE Sample Exam (compared to 58.5% for\nGPT-3.5), 77.9% on PubMedQA (compared to 60.2%), 60.7% on MedQA (compared to\n53.6%), and 54.2% on MedMCQA (compared to 51.0%). In addition to these\nbenchmarks, Clinical Camel demonstrates its broader capabilities, such as\nsynthesizing plausible clinical notes. This work introduces dialogue-based\nknowledge encoding, a novel method to synthesize conversational data from dense\nmedical texts. While benchmark results are encouraging, extensive and rigorous\nhuman evaluation across diverse clinical scenarios is imperative to ascertain\nsafety before implementation. By openly sharing Clinical Camel, we hope to\nfoster transparent and collaborative research, working towards the safe\nintegration of LLMs within the healthcare domain. Significant challenges\nconcerning reliability, bias, and the potential for outdated knowledge persist.\nNonetheless, the transparency provided by an open approach reinforces the\nscientific rigor essential for future clinical applications.\n","authors":["Augustin Toma","Patrick R. Lawler","Jimmy Ba","Rahul G. Krishnan","Barry B. Rubin","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2305.12031v2.pdf","comment":"for model weights, see https://huggingface.co/wanglab/"},{"id":"http://arxiv.org/abs/2308.09073v1","updated":"2023-08-17T16:02:29Z","published":"2023-08-17T16:02:29Z","title":"mCL-NER: Cross-Lingual Named Entity Recognition via Multi-view\n Contrastive Learning","summary":" Cross-lingual named entity recognition (CrossNER) faces challenges stemming\nfrom uneven performance due to the scarcity of multilingual corpora, especially\nfor non-English data. While prior efforts mainly focus on data-driven transfer\nmethods, a significant aspect that has not been fully explored is aligning both\nsemantic and token-level representations across diverse languages. In this\npaper, we propose Multi-view Contrastive Learning for Cross-lingual Named\nEntity Recognition (mCL-NER). Specifically, we reframe the CrossNER task into a\nproblem of recognizing relationships between pairs of tokens. This approach\ntaps into the inherent contextual nuances of token-to-token connections within\nentities, allowing us to align representations across different languages. A\nmulti-view contrastive learning framework is introduced to encompass semantic\ncontrasts between source, codeswitched, and target sentences, as well as\ncontrasts among token-to-token relations. By enforcing agreement within both\nsemantic and relational spaces, we minimize the gap between source sentences\nand their counterparts of both codeswitched and target sentences. This\nalignment extends to the relationships between diverse tokens, enhancing the\nprojection of entities across languages. We further augment CrossNER by\ncombining self-training with labeled source data and unlabeled target data. Our\nexperiments on the XTREME benchmark, spanning 40 languages, demonstrate the\nsuperiority of mCL-NER over prior data-driven and model-based approaches. 
It\nachieves a substantial increase of nearly +2.0 $F_1$ scores across a broad\nspectrum and establishes itself as the new state-of-the-art performer.\n","authors":["Ying Mo","Jian Yang","Jiahao Liu","Qifan Wang","Ruoyu Chen","Jingang Wang","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2308.09073v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.09070v1","updated":"2023-08-17T15:57:12Z","published":"2023-08-17T15:57:12Z","title":"Enhancing API Documentation through BERTopic Modeling and Summarization","summary":" As the amount of textual data in various fields, including software\ndevelopment, continues to grow, there is a pressing demand for efficient and\neffective extraction and presentation of meaningful insights. This paper\npresents a unique approach to address this need, focusing on the complexities\nof interpreting Application Programming Interface (API) documentation. While\nofficial API documentation serves as a primary source of information for\ndevelopers, it can often be extensive and lacks user-friendliness. In light of\nthis, developers frequently resort to unofficial sources like Stack Overflow\nand GitHub. Our novel approach employs the strengths of BERTopic for topic\nmodeling and Natural Language Processing (NLP) to automatically generate\nsummaries of API documentation, thereby creating a more efficient method for\ndevelopers to extract the information they need. The produced summaries and\ntopics are evaluated based on their performance, coherence, and\ninteroperability.\n The findings of this research contribute to the field of API documentation\nanalysis by providing insights into recurring topics, identifying common\nissues, and generating potential solutions. By improving the accessibility and\nefficiency of API documentation comprehension, our work aims to enhance the\nsoftware development process and empower developers with practical tools for\nnavigating complex APIs.\n","authors":["AmirHossein Naghshzan","Sylvie Ratte"],"pdf_url":"https://arxiv.org/pdf/2308.09070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09067v1","updated":"2023-08-17T15:54:38Z","published":"2023-08-17T15:54:38Z","title":"Contrasting Linguistic Patterns in Human and LLM-Generated Text","summary":" We conduct a quantitative analysis contrasting human-written English news\ntext with comparable large language model (LLM) output from 4 LLMs from the\nLLaMa family. Our analysis spans several measurable linguistic dimensions,\nincluding morphological, syntactic, psychometric and sociolinguistic aspects.\nThe results reveal various measurable differences between human and\nAI-generated texts. Among others, human texts exhibit more scattered sentence\nlength distributions, a distinct use of dependency and constituent types,\nshorter constituents, and more aggressive emotions (fear, disgust) than\nLLM-generated texts. LLM outputs use more numbers, symbols and auxiliaries\n(suggesting objective language) than human texts, as well as more pronouns. 
The\nsexist bias prevalent in human text is also expressed by LLMs.\n","authors":["Alberto Muñoz-Ortiz","Carlos Gómez-Rodríguez","David Vilares"],"pdf_url":"https://arxiv.org/pdf/2308.09067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09055v1","updated":"2023-08-17T15:41:08Z","published":"2023-08-17T15:41:08Z","title":"Don't lose the message while paraphrasing: A study on content preserving\n style transfer","summary":" Text style transfer techniques are gaining popularity in natural language\nprocessing allowing paraphrasing text in the required form: from toxic to\nneural, from formal to informal, from old to the modern English language, etc.\nSolving the task is not sufficient to generate some neural/informal/modern\ntext, but it is important to preserve the original content unchanged. This\nrequirement becomes even more critical in some applications such as style\ntransfer of goal-oriented dialogues where the factual information shall be kept\nto preserve the original message, e.g. ordering a certain type of pizza to a\ncertain address at a certain time. The aspect of content preservation is\ncritical for real-world applications of style transfer studies, but it has\nreceived little attention. To bridge this gap we perform a comparison of\nvarious style transfer models on the example of the formality transfer domain.\nTo perform a study of the content preservation abilities of various style\ntransfer methods we create a parallel dataset of formal vs. informal\ntask-oriented dialogues. The key difference between our dataset and the\nexisting ones like GYAFC [17] is the presence of goal-oriented dialogues with\npredefined semantic slots essential to be kept during paraphrasing, e.g. named\nentities. This additional annotation allowed us to conduct a precise\ncomparative study of several state-of-the-art techniques for style transfer.\nAnother result of our study is a modification of the unsupervised method LEWIS\n[19] which yields a substantial improvement over the original method and all\nevaluated baselines on the proposed task.\n","authors":["Nikolay Babakov","David Dale","Ilya Gusev","Irina Krotova","Alexander Panchenko"],"pdf_url":"https://arxiv.org/pdf/2308.09055v1.pdf","comment":"Published at the NLDB 2023 conference"},{"id":"http://arxiv.org/abs/2304.12940v2","updated":"2023-08-17T14:51:25Z","published":"2023-04-24T11:12:21Z","title":"Topological properties and organizing principles of semantic networks","summary":" Interpreting natural language is an increasingly important task in computer\nalgorithms due to the growing availability of unstructured textual data.\nNatural Language Processing (NLP) applications rely on semantic networks for\nstructured knowledge representation. The fundamental properties of semantic\nnetworks must be taken into account when designing NLP algorithms, yet they\nremain to be structurally investigated. We study the properties of semantic\nnetworks from ConceptNet, defined by 7 semantic relations from 11 different\nlanguages. We find that semantic networks have universal basic properties: they\nare sparse, highly clustered, and many exhibit power-law degree distributions.\nOur findings show that the majority of the considered networks are scale-free.\nSome networks exhibit language-specific properties determined by grammatical\nrules, for example networks from highly inflected languages, such as e.g.\nLatin, German, French and Spanish, show peaks in the degree distribution that\ndeviate from a power law. 
We find that depending on the semantic relation type\nand the language, the link formation in semantic networks is guided by\ndifferent principles. In some networks the connections are similarity-based,\nwhile in others the connections are more complementarity-based. Finally, we\ndemonstrate how knowledge of similarity and complementarity in semantic\nnetworks can improve NLP algorithms in missing link inference.\n","authors":["Gabriel Budel","Ying Jin","Piet Van Mieghem","Maksim Kitsak"],"pdf_url":"https://arxiv.org/pdf/2304.12940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08998v1","updated":"2023-08-17T14:12:48Z","published":"2023-08-17T14:12:48Z","title":"Reinforced Self-Training (ReST) for Language Modeling","summary":" Reinforcement learning from human feedback (RLHF) can improve the quality of\nlarge language model's (LLM) outputs by aligning them with human preferences.\nWe propose a simple algorithm for aligning LLMs with human preferences inspired\nby growing batch reinforcement learning (RL), which we call Reinforced\nSelf-Training (ReST). Given an initial LLM policy, ReST produces a dataset by\ngenerating samples from the policy, which are then used to improve the LLM\npolicy using offline RL algorithms. ReST is more efficient than typical online\nRLHF methods because the training dataset is produced offline, which allows\ndata reuse. While ReST is a general approach applicable to all generative\nlearning settings, we focus on its application to machine translation. Our\nresults show that ReST can substantially improve translation quality, as\nmeasured by automated metrics and human evaluation on machine translation\nbenchmarks in a compute and sample-efficient manner.\n","authors":["Caglar Gulcehre","Tom Le Paine","Srivatsan Srinivasan","Ksenia Konyushkova","Lotte Weerts","Abhishek Sharma","Aditya Siddhant","Alex Ahern","Miaosen Wang","Chenjie Gu","Wolfgang Macherey","Arnaud Doucet","Orhan Firat","Nando de Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.08998v1.pdf","comment":"23 pages, 16 figures"},{"id":"http://arxiv.org/abs/2210.15445v3","updated":"2023-08-17T13:49:08Z","published":"2022-10-26T17:34:30Z","title":"Efficient Utilization of Large Pre-Trained Models for Low Resource ASR","summary":" Unsupervised representation learning has recently helped automatic speech\nrecognition (ASR) to tackle tasks with limited labeled data. Following this,\nhardware limitations and applications give rise to the question how to take\nadvantage of large pre-trained models efficiently and reduce their complexity.\nIn this work, we study a challenging low resource conversational telephony\nspeech corpus from the medical domain in Vietnamese and German. We show the\nbenefits of using unsupervised techniques beyond simple fine-tuning of large\npre-trained models, discuss how to adapt them to a practical telephony task\nincluding bandwidth transfer and investigate different data conditions for\npre-training and fine-tuning. We outperform the project baselines by 22%\nrelative using pretraining techniques. 
Further gains of 29% can be achieved by\nrefinements of architecture and training and 6% by adding 0.8 h of in-domain\nadaptation data.\n","authors":["Peter Vieting","Christoph Lüscher","Julian Dierkes","Ralf Schlüter","Hermann Ney"],"pdf_url":"https://arxiv.org/pdf/2210.15445v3.pdf","comment":"Accepted at ICASSP SASB 2023"},{"id":"http://arxiv.org/abs/2308.08982v1","updated":"2023-08-17T13:45:35Z","published":"2023-08-17T13:45:35Z","title":"Evaluation of really good grammatical error correction","summary":" Although rarely stated, in practice, Grammatical Error Correction (GEC)\nencompasses various models with distinct objectives, ranging from grammatical\nerror detection to improving fluency. Traditional evaluation methods fail to\nfully capture the full range of system capabilities and objectives.\nReference-based evaluations suffer from limitations in capturing the wide\nvariety of possible correction and the biases introduced during reference\ncreation and is prone to favor fixing local errors over overall text\nimprovement. The emergence of large language models (LLMs) has further\nhighlighted the shortcomings of these evaluation strategies, emphasizing the\nneed for a paradigm shift in evaluation methodology. In the current study, we\nperform a comprehensive evaluation of various GEC systems using a recently\npublished dataset of Swedish learner texts. The evaluation is performed using\nestablished evaluation metrics as well as human judges. We find that GPT-3 in a\nfew-shot setting by far outperforms previous grammatical error correction\nsystems for Swedish, a language comprising only 0.11% of its training data. We\nalso found that current evaluation methods contain undesirable biases that a\nhuman evaluation is able to reveal. We suggest using human post-editing of GEC\nsystem outputs to analyze the amount of change required to reach native-level\nhuman performance on the task, and provide a dataset annotated with human\npost-edits and assessments of grammaticality, fluency and meaning preservation\nof GEC system outputs.\n","authors":["Robert Östling","Katarina Gillholm","Murathan Kurfalı","Marie Mattson","Mats Wirén"],"pdf_url":"https://arxiv.org/pdf/2308.08982v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08973v1","updated":"2023-08-17T13:24:14Z","published":"2023-08-17T13:24:14Z","title":"Beam Retrieval: General End-to-End Retrieval for Multi-Hop Question\n Answering","summary":" Multi-hop QA involves finding multiple relevant passages and step-by-step\nreasoning to answer complex questions. While previous approaches have developed\nretrieval modules for selecting relevant passages, they face challenges in\nscenarios beyond two hops, owing to the limited performance of one-step methods\nand the failure of two-step methods when selecting irrelevant passages in\nearlier stages. In this work, we introduce Beam Retrieval, a general end-to-end\nretrieval framework for multi-hop QA. This approach maintains multiple partial\nhypotheses of relevant passages at each step, expanding the search space and\nreducing the risk of missing relevant passages. Moreover, Beam Retrieval\njointly optimizes an encoder and two classification heads by minimizing the\ncombined loss across all hops. To establish a complete QA system, we\nincorporate a supervised reader or a zero-shot GPT-3.5. 
Experimental results\ndemonstrate that Beam Retrieval achieves a nearly 50% improvement compared with\nbaselines on challenging MuSiQue-Ans, and it also surpasses all previous\nretrievers on HotpotQA and 2WikiMultiHopQA. Providing high-quality context,\nBeam Retrieval helps our supervised reader achieve new state-of-the-art\nperformance and substantially improves (up to 28.8 points) the QA performance\nof zero-shot GPT-3.5.\n","authors":["Jiahao Zhang","Haiyang Zhang","Dongmei Zhang","Yong Liu","Shen Huang"],"pdf_url":"https://arxiv.org/pdf/2308.08973v1.pdf","comment":"Code is available at https://github.com/canghongjian/beam_retriever"},{"id":"http://arxiv.org/abs/2308.00121v3","updated":"2023-08-17T12:26:40Z","published":"2023-07-24T19:59:22Z","title":"Getting pwn'd by AI: Penetration Testing with Large Language Models","summary":" The field of software security testing, more specifically penetration\ntesting, is an activity that requires high levels of expertise and involves\nmany manual testing and analysis steps. This paper explores the potential usage\nof large-language models, such as GPT3.5, to augment penetration testers with\nAI sparring partners. We explore the feasibility of supplementing penetration\ntesters with AI models for two distinct use cases: high-level task planning for\nsecurity testing assignments and low-level vulnerability hunting within a\nvulnerable virtual machine. For the latter, we implemented a closed-feedback\nloop between LLM-generated low-level actions with a vulnerable virtual machine\n(connected through SSH) and allowed the LLM to analyze the machine state for\nvulnerabilities and suggest concrete attack vectors which were automatically\nexecuted within the virtual machine. We discuss promising initial results,\ndetail avenues for improvement, and close deliberating on the ethics of\nproviding AI-based sparring partners.\n","authors":["Andreas Happe","Jürgen Cito"],"pdf_url":"https://arxiv.org/pdf/2308.00121v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04087v4","updated":"2023-08-17T12:20:27Z","published":"2023-05-06T16:12:19Z","title":"Self-Edit: Fault-Aware Code Editor for Code Generation","summary":" Large language models (LLMs) have demonstrated an impressive ability to\ngenerate codes on competitive programming tasks. However, with limited sample\nnumbers, LLMs still suffer from poor accuracy. Inspired by the process of human\nprogramming, we propose a generate-and-edit approach named Self-Edit that\nutilizes execution results of the generated code from LLMs to improve the code\nquality on the competitive programming task. We execute the generated code on\nthe example test case provided in the question and wrap execution results into\na supplementary comment. Utilizing this comment as guidance, our fault-aware\ncode editor is employed to correct errors in the generated code. We perform\nextensive evaluations across two competitive programming datasets with nine\ndifferent LLMs. Compared to directly generating from LLMs, our approach can\nimprove the average of pass@1 by 89\\% on APPS-dev, 31\\% on APPS-test, and 48\\%\non HumanEval over nine popular code generation LLMs with parameter sizes\nranging from 110M to 175B. 
Compared to other post-processing methods, our\nmethod demonstrates superior accuracy and efficiency.\n","authors":["Kechi Zhang","Zhuo Li","Jia Allen Li","Ge Li","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2305.04087v4.pdf","comment":"Accepted by ACL2023"},{"id":"http://arxiv.org/abs/2306.08702v3","updated":"2023-08-17T08:54:42Z","published":"2023-06-14T19:00:12Z","title":"Does mBERT understand Romansh? Evaluating word embeddings using word\n alignment","summary":" We test similarity-based word alignment models (SimAlign and awesome-align)\nin combination with word embeddings from mBERT and XLM-R on parallel sentences\nin German and Romansh. Since Romansh is an unseen language, we are dealing with\na zero-shot setting. Using embeddings from mBERT, both models reach an\nalignment error rate of 0.22, which outperforms fast_align, a statistical\nmodel, and is on par with similarity-based word alignment for seen languages.\nWe interpret these results as evidence that mBERT contains information that can\nbe meaningful and applicable to Romansh.\n To evaluate performance, we also present a new trilingual corpus, which we\ncall the DERMIT (DE-RM-IT) corpus, containing press releases made by the Canton\nof Grisons in German, Romansh and Italian in the past 25 years. The corpus\ncontains 4 547 parallel documents and approximately 100 000 sentence pairs in\neach language combination. We additionally present a gold standard for\nGerman-Romansh word alignment. The data is available at\nhttps://github.com/eyldlv/DERMIT-Corpus.\n","authors":["Eyal Liron Dolev"],"pdf_url":"https://arxiv.org/pdf/2306.08702v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08833v1","updated":"2023-08-17T07:51:23Z","published":"2023-08-17T07:51:23Z","title":"CMB: A Comprehensive Medical Benchmark in Chinese","summary":" Large Language Models (LLMs) provide a possibility to make a great\nbreakthrough in medicine. The establishment of a standardized medical benchmark\nbecomes a fundamental cornerstone to measure progression. However, medical\nenvironments in different regions have their local characteristics, e.g., the\nubiquity and significance of traditional Chinese medicine within China.\nTherefore, merely translating English-based medical evaluation may result in\n\\textit{contextual incongruities} to a local region. To solve the issue, we\npropose a localized medical benchmark called CMB, a Comprehensive Medical\nBenchmark in Chinese, designed and rooted entirely within the native Chinese\nlinguistic and cultural framework. While traditional Chinese medicine is\nintegral to this evaluation, it does not constitute its entirety. Using this\nbenchmark, we have evaluated several prominent large-scale LLMs, including\nChatGPT, GPT-4, dedicated Chinese LLMs, and LLMs specialized in the medical\ndomain. It is worth noting that our benchmark is not devised as a leaderboard\ncompetition but as an instrument for self-assessment of model advancements. We\nhope this benchmark could facilitate the widespread adoption and enhancement of\nmedical LLMs within China. 
Check details in\n\\url{https://cmedbenchmark.llmzoo.com/}.\n","authors":["Xidong Wang","Guiming Hardy Chen","Dingjie Song","Zhiyi Zhang","Zhihong Chen","Qingying Xiao","Feng Jiang","Jianquan Li","Xiang Wan","Benyou Wang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2308.08833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08827v1","updated":"2023-08-17T07:24:06Z","published":"2023-08-17T07:24:06Z","title":"Factuality Detection using Machine Translation -- a Use Case for German\n Clinical Text","summary":" Factuality can play an important role when automatically processing clinical\ntext, as it makes a difference if particular symptoms are explicitly not\npresent, possibly present, not mentioned, or affirmed. In most cases, a\nsufficient number of examples is necessary to handle such phenomena in a\nsupervised machine learning setting. However, as clinical text might contain\nsensitive information, data cannot be easily shared. In the context of\nfactuality detection, this work presents a simple solution using machine\ntranslation to translate English data to German to train a transformer-based\nfactuality detection model.\n","authors":["Mohammed Bin Sumait","Aleksandra Gabryszak","Leonhard Hennig","Roland Roller"],"pdf_url":"https://arxiv.org/pdf/2308.08827v1.pdf","comment":"Accepted at KONVENS 2023"},{"id":"http://arxiv.org/abs/2308.08807v1","updated":"2023-08-17T06:33:33Z","published":"2023-08-17T06:33:33Z","title":"Linguistically-Informed Neural Architectures for Lexical, Syntactic and\n Semantic Tasks in Sanskrit","summary":" The primary focus of this thesis is to make Sanskrit manuscripts more\naccessible to the end-users through natural language technologies. The\nmorphological richness, compounding, free word orderliness, and low-resource\nnature of Sanskrit pose significant challenges for developing deep learning\nsolutions. We identify four fundamental tasks, which are crucial for developing\na robust NLP technology for Sanskrit: word segmentation, dependency parsing,\ncompound type identification, and poetry analysis. The first task, Sanskrit\nWord Segmentation (SWS), is a fundamental text processing task for any other\ndownstream applications. However, it is challenging due to the sandhi\nphenomenon that modifies characters at word boundaries. Similarly, the existing\ndependency parsing approaches struggle with morphologically rich and\nlow-resource languages like Sanskrit. Compound type identification is also\nchallenging for Sanskrit due to the context-sensitive semantic relation between\ncomponents. All these challenges result in sub-optimal performance in NLP\napplications like question answering and machine translation. Finally, Sanskrit\npoetry has not been extensively studied in computational linguistics.\n While addressing these challenges, this thesis makes various contributions:\n(1) The thesis proposes linguistically-informed neural architectures for these\ntasks. (2) We showcase the interpretability and multilingual extension of the\nproposed systems. (3) Our proposed systems report state-of-the-art performance.\n(4) Finally, we present a neural toolkit named SanskritShala, a web-based\napplication that provides real-time analysis of input for various NLP tasks.\nOverall, this thesis contributes to making Sanskrit manuscripts more accessible\nby developing robust NLP technology and releasing various resources, datasets,\nand web-based toolkit.\n","authors":["Jivnesh Sandhan"],"pdf_url":"https://arxiv.org/pdf/2308.08807v1.pdf","comment":"Ph.D. 
dissertation"},{"id":"http://arxiv.org/abs/2308.07645v2","updated":"2023-08-17T06:08:39Z","published":"2023-08-15T08:49:14Z","title":"Steering Language Generation: Harnessing Contrastive Expert Guidance and\n Negative Prompting for Coherent and Diverse Synthetic Data Generation","summary":" Large Language Models (LLMs) hold immense potential to generate synthetic\ndata of high quality and utility, which has numerous applications from\ndownstream model training to practical data utilisation. However, contemporary\nmodels, despite their impressive capacities, consistently struggle to produce\nboth coherent and diverse data. To address the coherency issue, we introduce\ncontrastive expert guidance, where the difference between the logit\ndistributions of fine-tuned and base language models is emphasised to ensure\ndomain adherence. In order to ensure diversity, we utilise existing real and\nsynthetic examples as negative prompts to the model. We deem this dual-pronged\napproach to logit reshaping as STEER: Semantic Text Enhancement via Embedding\nRepositioning. STEER operates at inference-time and systematically guides the\nLLMs to strike a balance between adherence to the data distribution (ensuring\nsemantic fidelity) and deviation from prior synthetic examples or existing real\ndatasets (ensuring diversity and authenticity). This delicate balancing act is\nachieved by dynamically moving towards or away from chosen representations in\nthe latent space. STEER demonstrates improved performance over previous\nsynthetic data generation techniques, exhibiting better balance between data\ndiversity and coherency across three distinct tasks: hypothesis generation,\ntoxic and non-toxic comment generation, and commonsense reasoning task\ngeneration. We demonstrate how STEER allows for fine-tuned control over the\ndiversity-coherency trade-off via its hyperparameters, highlighting its\nversatility.\n","authors":["Charles O'Neill","Yuan-Sen Ting","Ioana Ciuca","Jack Miller","Thang Bui"],"pdf_url":"https://arxiv.org/pdf/2308.07645v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08796v1","updated":"2023-08-17T06:04:28Z","published":"2023-08-17T06:04:28Z","title":"Chinese Spelling Correction as Rephrasing Language Model","summary":" This paper studies Chinese Spelling Correction (CSC), which aims to detect\nand correct potential spelling errors in a given sentence. Current\nstate-of-the-art methods regard CSC as a sequence tagging task and fine-tune\nBERT-based models on sentence pairs. However, we note a critical flaw in the\nprocess of tagging one character to another, that the correction is excessively\nconditioned on the error. This is opposite from human mindset, where\nindividuals rephrase the complete sentence based on its semantics, rather than\nsolely on the error patterns memorized before. Such a counter-intuitive\nlearning process results in the bottleneck of generalizability and\ntransferability of machine spelling correction. To address this, we propose\n$Rephrasing Language Modeling$ (ReLM), where the model is trained to rephrase\nthe entire sentence by infilling additional slots, instead of\ncharacter-to-character tagging. This novel training paradigm achieves the new\nstate-of-the-art results across fine-tuned and zero-shot CSC benchmarks,\noutperforming previous counterparts by a large margin. 
Our method also learns\ntransferable language representation when CSC is jointly trained with other\ntasks.\n","authors":["Linfeng Liu","Hongqiu Wu","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.08796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07758v2","updated":"2023-08-17T05:55:44Z","published":"2023-08-15T13:19:59Z","title":"Forward-Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Though (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \\citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. We mask a token in the question by ${\\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \\textit{a\nsimple template}, i.e., ``\\textit{\\textbf{If we know the answer of the above\nquestion is \\{a candidate answer\\}, what is the value of unknown variable ${\\bf\nx}$?}}'' Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.08793v1","updated":"2023-08-17T05:36:56Z","published":"2023-08-17T05:36:56Z","title":"Task Relation Distillation and Prototypical Pseudo Label for Incremental\n Named Entity Recognition","summary":" Incremental Named Entity Recognition (INER) involves the sequential learning\nof new entity types without accessing the training data of previously learned\ntypes. However, INER faces the challenge of catastrophic forgetting specific\nfor incremental learning, further aggravated by background shift (i.e., old and\nfuture entity types are labeled as the non-entity type in the current task). To\naddress these challenges, we propose a method called task Relation Distillation\nand Prototypical pseudo label (RDP) for INER. Specifically, to tackle\ncatastrophic forgetting, we introduce a task relation distillation scheme that\nserves two purposes: 1) ensuring inter-task semantic consistency across\ndifferent incremental learning tasks by minimizing inter-task relation\ndistillation loss, and 2) enhancing the model's prediction confidence by\nminimizing intra-task self-entropy loss. Simultaneously, to mitigate background\nshift, we develop a prototypical pseudo label strategy that distinguishes old\nentity types from the current non-entity type using the old model. This\nstrategy generates high-quality pseudo labels by measuring the distances\nbetween token embeddings and type-wise prototypes. We conducted extensive\nexperiments on ten INER settings of three benchmark datasets (i.e., CoNLL2003,\nI2B2, and OntoNotes5). 
The results demonstrate that our method achieves\nsignificant improvements over the previous state-of-the-art methods, with an\naverage increase of 6.08% in Micro F1 score and 7.71% in Macro F1 score.\n","authors":["Duzhen Zhang","Hongliu Li","Wei Cong","Rongtao Xu","Jiahua Dong","Xiuyi Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08793v1.pdf","comment":"Accepted by CIKM2023 as a long paper with an oral presentation"},{"id":"http://arxiv.org/abs/2308.05342v3","updated":"2023-08-17T04:53:15Z","published":"2023-08-10T05:10:17Z","title":"Metacognitive Prompting Improves Understanding in Large Language Models","summary":" In Large Language Models (LLMs), there have been consistent advancements in\ntask-specific performance, largely influenced by effective prompt design. While\nrecent research on prompting has enhanced the reasoning capabilities of LLMs, a\ngap remains in further improving their understanding abilities. In this study,\nwe introduce Metacognitive Prompting (MP), a strategy inspired by human\nintrospective reasoning processes. Using MP, LLMs undergo a systematic series\nof structured, self-aware evaluations, drawing on both their vast inherent\nknowledge and new insights. Our experiments involve five prevalent LLMs:\nLlama2, Vicuna, PaLM, GPT-3.5, and GPT-4, all of which span various general\nnatural language understanding (NLU) tasks from the GLUE and SuperGLUE\nbenchmarks. Results indicate that, although GPT-4 consistently excels in most\ntasks, PaLM, when equipped with MP, approaches its performance level.\nFurthermore, across models and datasets, MP consistently outperforms existing\nprompting methods, including standard and chain-of-thought prompting. This\nstudy underscores the potential to amplify the understanding abilities of LLMs\nand highlights the benefits of mirroring human introspective reasoning in NLU\ntasks.\n","authors":["Yuqing Wang","Yun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.05342v3.pdf","comment":"9 pages, in submission"},{"id":"http://arxiv.org/abs/2308.08780v1","updated":"2023-08-17T04:45:19Z","published":"2023-08-17T04:45:19Z","title":"Exploring Demonstration Ensembling for In-context Learning","summary":" In-context learning (ICL) operates by showing language models (LMs) examples\nof input-output pairs for a given task, i.e., demonstrations. The standard\napproach for ICL is to prompt the LM with concatenated demonstrations followed\nby the test input. This approach suffers from some issues. First, concatenation\noffers almost no control over the contribution of each demo to the model\nprediction. This can be sub-optimal when some demonstrations are irrelevant to\nthe test example. Second, due to the input length limit of some transformer\nmodels, it might be infeasible to fit many examples into the context,\nespecially when dealing with long-input tasks. In this work, we explore\nDemonstration Ensembling (DENSE) as an alternative to simple concatenation.\n\\model predicts outputs using subsets (i.e., buckets) of the demonstrations and\nthen combines the output probabilities resulting from each subset to produce\nthe final prediction. We study different ensembling methods using GPT-j and\nexperiment on 12 language tasks. Our experiments show weighted max ensembling\nto outperform vanilla concatenation by as large as 2.4 average points. 
Code\navailable at \\url{https://github.com/mukhal/icl-ensembling}.\n","authors":["Muhammad Khalifa","Lajanugen Logeswaran","Moontae Lee","Honglak Lee","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08780v1.pdf","comment":"Published at ME-FoMo workshop at ICLR 2023. Arxiv version includes\n evaluation on 5 more tasks"},{"id":"http://arxiv.org/abs/2308.08774v1","updated":"2023-08-17T04:13:26Z","published":"2023-08-17T04:13:26Z","title":"Differential Privacy, Linguistic Fairness, and Training Data Influence:\n Impossibility and Possibility Theorems for Multilingual Language Models","summary":" Language models such as mBERT, XLM-R, and BLOOM aim to achieve multilingual\ngeneralization or compression to facilitate transfer to a large number of\n(potentially unseen) languages. However, these models should ideally also be\nprivate, linguistically fair, and transparent, by relating their predictions to\ntraining data. Can these requirements be simultaneously satisfied? We show that\nmultilingual compression and linguistic fairness are compatible with\ndifferential privacy, but that differential privacy is at odds with training\ndata influence sparsity, an objective for transparency. We further present a\nseries of experiments on two common NLP tasks and evaluate multilingual\ncompression and training data influence sparsity under different privacy\nguarantees, exploring these trade-offs in more detail. Our results suggest that\nwe need to develop ways to jointly optimize for these objectives in order to\nfind practical trade-offs.\n","authors":["Phillip Rust","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2308.08774v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.08758v1","updated":"2023-08-17T03:10:17Z","published":"2023-08-17T03:10:17Z","title":"Discrete Prompt Compression with Reinforcement Learning","summary":" Instruction-tuned Language Models (LMs) are widely used by users to address\nvarious problems with task-specific prompts. Constraints associated with the\ncontext window length and computational costs encourage the development of\ncompressed prompts. Existing methods rely heavily on training embeddings, which\nare designed to accommodate multiple token meanings. This presents challenges\nin terms of interpretability, a fixed number of embedding tokens, reusability\nacross different LMs, and inapplicability when interacting with black-box APIs.\nThis study proposes prompt compression with reinforcement learning (PCRL), a\nnovel discrete prompt compression method that addresses these issues. PCRL\nemploys a computationally efficient policy network that directly edits prompts.\nThe PCRL training approach can be flexibly applied to various types of LMs, as\nwell as decoder-only and encoder-decoder architecture, and can be trained\nwithout gradient access to LMs or labeled data. PCRL achieves an average\nreduction of 24.6% in token count across various instruction prompts while\npreserving performance. 
Further, we demonstrate that the learned policy can be\ntransferred to larger LMs, and through various analyses, we aid the\nunderstanding of token importance within prompts.\n","authors":["Hoyoun Jung","Kyung-Joong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.08758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08747v1","updated":"2023-08-17T02:53:23Z","published":"2023-08-17T02:53:23Z","title":"An Empirical Study of Catastrophic Forgetting in Large Language Models\n During Continual Fine-tuning","summary":" Catastrophic forgetting (CF) is a phenomenon that occurs in machine learning\nwhen a model forgets previously learned information as it learns new\ninformation. As large language models (LLMs) have shown excellent performance,\nit is interesting to uncover whether CF exists in the continual fine-tuning of\nLLMs. In this study, we empirically evaluate the forgetting phenomenon in LLMs'\nknowledge, from the perspectives of domain knowledge, reasoning, and reading\ncomprehension. The experiments demonstrate that catastrophic forgetting is\ngenerally observed in LLMs ranging from 1b to 7b. Furthermore, as the scale\nincreases, the severity of forgetting also intensifies. Comparing the\ndecoder-only model BLOOMZ with the encoder-decoder model mT0, BLOOMZ suffers\nless forgetting and maintains more knowledge. We also observe that LLMs can\nmitigate language bias (e.g. gender bias) during continual fine-tuning.\nMoreover, we find that ALPACA can maintain more knowledge and capacity compared\nwith LLAMA during the continual fine-tuning, which implies that general\ninstruction tuning can help mitigate the forgetting phenomenon of LLMs in the\nfurther fine-tuning process.\n","authors":["Yun Luo","Zhen Yang","Fandong Meng","Yafu Li","Jie Zhou","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08747v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08742v1","updated":"2023-08-17T02:33:43Z","published":"2023-08-17T02:33:43Z","title":"PMET: Precise Model Editing in a Transformer","summary":" Model editing techniques modify a minor proportion of knowledge in Large\nLanguage Models (LLMs) at a relatively low cost, which have demonstrated\nnotable success. Existing methods assume Transformer Layer (TL) hidden states\nare values of key-value memories of the Feed-Forward Network (FFN). They\nusually optimize the TL hidden states to memorize target knowledge and use it\nto update the weights of the FFN in LLMs. However, the information flow of TL\nhidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN,\nand residual connections. Existing methods neglect the fact that the TL hidden\nstates contains information not specifically required for FFN. Consequently,\nthe performance of model editing decreases. To achieve more precise model\nediting, we analyze hidden states of MHSA and FFN, finding that MHSA encodes\ncertain general knowledge extraction patterns. This implies that MHSA weights\ndo not require updating when new knowledge is introduced. Based on above\nfindings, we introduce PMET, which simultaneously optimizes Transformer\nComponent (TC, namely MHSA and FFN) hidden states, while only using the\noptimized TC hidden states of FFN to precisely update FFN weights. Our\nexperiments demonstrate that PMET exhibits state-of-the-art performance on both\nthe \\textsc{counterfact} and zsRE datasets. 
Our ablation experiments\nsubstantiate the effectiveness of our enhancements, further reinforcing the\nfinding that the MHSA encodes certain general knowledge extraction patterns and\nindicating its storage of a small amount of factual knowledge. Our code is\navailable at \\url{https://github.com/xpq-tech/PMET.git}.\n","authors":["Xiaopeng Li","Shasha Li","Shezheng Song","Jing Yang","Jun Ma","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08742v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2308.08739v1","updated":"2023-08-17T02:26:30Z","published":"2023-08-17T02:26:30Z","title":"Enhancing Phrase Representation by Information Bottleneck Guided Text\n Diffusion Process for Keyphrase Extraction","summary":" Keyphrase extraction (KPE) is an important task in Natural Language\nProcessing for many scenarios, which aims to extract keyphrases that are\npresent in a given document. Many existing supervised methods treat KPE as\nsequential labeling, span-level classification, or generative tasks. However,\nthese methods lack the ability to utilize keyphrase information, which may\nresult in biased results. In this study, we propose Diff-KPE, which leverages\nthe supervised Variational Information Bottleneck (VIB) to guide the text\ndiffusion process for generating enhanced keyphrase representations. Diff-KPE\nfirst generates the desired keyphrase embeddings conditioned on the entire\ndocument and then injects the generated keyphrase embeddings into each phrase\nrepresentation. A ranking network and VIB are then optimized together with rank\nloss and classification loss, respectively. This design of Diff-KPE allows us\nto rank each candidate phrase by utilizing both the information of keyphrases\nand the document. Experiments show that Diff-KPE outperforms existing KPE\nmethods on a large open domain keyphrase extraction benchmark, OpenKP, and a\nscientific domain dataset, KP20K.\n","authors":["Yuanzhen Luo","Qingyu Zhou","Feng Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.08739v1.pdf","comment":"10 pages, 2 figures"},{"id":"http://arxiv.org/abs/2308.08728v1","updated":"2023-08-17T01:58:04Z","published":"2023-08-17T01:58:04Z","title":"LLM-FuncMapper: Function Identification for Interpreting Complex Clauses\n in Building Codes via LLM","summary":" As a vital stage of automated rule checking (ARC), rule interpretation of\nregulatory texts requires considerable effort. However, interpreting regulatory\nclauses with implicit properties or complex computational logic is still\nchallenging due to the lack of domain knowledge and limited expressibility of\nconventional logic representations. Thus, LLM-FuncMapper, an approach to\nidentifying predefined functions needed to interpret various regulatory clauses\nbased on the large language model (LLM), is proposed. First, by systematically\nanalysis of building codes, a series of atomic functions are defined to capture\nshared computational logics of implicit properties and complex constraints,\ncreating a database of common blocks for interpreting regulatory clauses. Then,\na prompt template with the chain of thought is developed and further enhanced\nwith a classification-based tuning strategy, to enable common LLMs for\neffective function identification. Finally, the proposed approach is validated\nwith statistical analysis, experiments, and proof of concept. 
Statistical\nanalysis reveals a long-tail distribution and high expressibility of the\ndeveloped function database, with which almost 100% of computer-processible\nclauses can be interpreted and represented as computer-executable codes.\nExperiments show that LLM-FuncMapper achieve promising results in identifying\nrelevant predefined functions for rule interpretation. Further proof of concept\nin automated rule interpretation also demonstrates the possibility of\nLLM-FuncMapper in interpreting complex regulatory clauses. To the best of our\nknowledge, this study is the first attempt to introduce LLM for understanding\nand interpreting complex regulatory clauses, which may shed light on further\nadoption of LLM in the construction domain.\n","authors":["Zhe Zheng","Ke-Yin Chen","Xin-Yu Cao","Xin-Zheng Lu","Jia-Rui Lin"],"pdf_url":"https://arxiv.org/pdf/2308.08728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04415v3","updated":"2023-08-17T01:51:48Z","published":"2023-02-09T03:04:11Z","title":"Few-Shot Table-to-Text Generation with Prompt Planning and Knowledge\n Memorization","summary":" Pre-trained language models (PLM) have achieved remarkable advancement in\ntable-to-text generation tasks. However, the lack of labeled domain-specific\nknowledge and the topology gap between tabular data and text make it difficult\nfor PLMs to yield faithful text. Low-resource generation likewise faces unique\nchallenges in this domain. Inspired by how humans descript tabular data with\nprior knowledge, we suggest a new framework: PromptMize, which targets\ntable-to-text generation under few-shot settings. The design of our framework\nconsists of two aspects: a prompt planner and a knowledge adapter. The prompt\nplanner aims to generate a prompt signal that provides instance guidance for\nPLMs to bridge the topology gap between tabular data and text. Moreover, the\nknowledge adapter memorizes domain-specific knowledge from the unlabelled\ncorpus to supply essential information during generation. Extensive experiments\nand analyses are investigated on three open domain few-shot NLG datasets:\nhuman, song, and book. Compared with previous state-of-the-art approaches, our\nmodel achieves remarkable performance in generating quality as judged by human\nand automatic evaluations.\n","authors":["Zhixin Guo","Minyxuan Yan","Jiexing Qi","Jianping Zhou","Ziwei He","Zhouhan Lin","Guanjie Zheng","Xinbing Wang"],"pdf_url":"https://arxiv.org/pdf/2302.04415v3.pdf","comment":"Accidental duplicate. Please see arXiv:2302.12468"},{"id":"http://arxiv.org/abs/2308.08713v1","updated":"2023-08-17T00:30:56Z","published":"2023-08-17T00:30:56Z","title":"Decoding Emotions: A comprehensive Multilingual Study of Speech Models\n for Speech Emotion Recognition","summary":" Recent advancements in transformer-based speech representation models have\ngreatly transformed speech processing. However, there has been limited research\nconducted on evaluating these models for speech emotion recognition (SER)\nacross multiple languages and examining their internal representations. This\narticle addresses these gaps by presenting a comprehensive benchmark for SER\nwith eight speech representation models and six different languages. We\nconducted probing experiments to gain insights into inner workings of these\nmodels for SER. 
We find that using features from a single optimal layer of a\nspeech model reduces the error rate by 32\\% on average across seven datasets\nwhen compared to systems where features from all layers of speech models are\nused. We also achieve state-of-the-art results for German and Persian\nlanguages. Our probing results indicate that the middle layers of speech models\ncapture the most important emotional information for speech emotion\nrecognition.\n","authors":["Anant Singh","Akshat Gupta"],"pdf_url":"https://arxiv.org/pdf/2308.08713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09731v1","updated":"2023-08-17T20:50:46Z","published":"2023-08-17T20:50:46Z","title":"ChatGPT-HealthPrompt. Harnessing the Power of XAI in Prompt-Based\n Healthcare Decision Support using ChatGPT","summary":" This study presents an innovative approach to the application of large\nlanguage models (LLMs) in clinical decision-making, focusing on OpenAI's\nChatGPT. Our approach introduces the use of contextual prompts-strategically\ndesigned to include task description, feature description, and crucially,\nintegration of domain knowledge-for high-quality binary classification tasks\neven in data-scarce scenarios. The novelty of our work lies in the utilization\nof domain knowledge, obtained from high-performing interpretable ML models, and\nits seamless incorporation into prompt design. By viewing these ML models as\nmedical experts, we extract key insights on feature importance to aid in\ndecision-making processes. This interplay of domain knowledge and AI holds\nsignificant promise in creating a more insightful diagnostic tool.\n Additionally, our research explores the dynamics of zero-shot and few-shot\nprompt learning based on LLMs. By comparing the performance of OpenAI's ChatGPT\nwith traditional supervised ML models in different data conditions, we aim to\nprovide insights into the effectiveness of prompt engineering strategies under\nvaried data availability. In essence, this paper bridges the gap between AI and\nhealthcare, proposing a novel methodology for LLMs application in clinical\ndecision support systems. It highlights the transformative potential of\neffective prompt design, domain knowledge integration, and flexible learning\napproaches in enhancing automated decision-making.\n","authors":["Fatemeh Nazary","Yashar Deldjoo","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2308.09731v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.09209v1","updated":"2023-08-17T23:28:39Z","published":"2023-08-17T23:28:39Z","title":"GPU Accelerated Color Correction and Frame Warping for Real-time Video\n Stitching","summary":" Traditional image stitching focuses on a single panorama frame without\nconsidering the spatial-temporal consistency in videos. The straightforward\nimage stitching approach will cause temporal flicking and color inconstancy\nwhen it is applied to the video stitching task. Besides, inaccurate camera\nparameters will cause artifacts in the image warping. In this paper, we propose\na real-time system to stitch multiple video sequences into a panoramic video,\nwhich is based on GPU accelerated color correction and frame warping without\naccurate camera parameters. We extend the traditional 2D-Matrix (2D-M) color\ncorrection approach and a present spatio-temporal 3D-Matrix (3D-M) color\ncorrection method for the overlap local regions with online color balancing\nusing a piecewise function on global frames. 
Furthermore, we use pairwise\nhomography matrices given by coarse camera calibration for global warping\nfollowed by accurate local warping based on the optical flow. Experimental\nresults show that our system can generate highquality panorama videos in real\ntime.\n","authors":["Lu Yang","Zhenglun Kong","Ting Li","Xinyi Bai","Zhiye Lin","Hong Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.09209v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.09202v1","updated":"2023-08-17T22:40:59Z","published":"2023-08-17T22:40:59Z","title":"A Model-Agnostic Framework for Recommendation via Interest-aware Item\n Embeddings","summary":" Item representation holds significant importance in recommendation systems,\nwhich encompasses domains such as news, retail, and videos. Retrieval and\nranking models utilise item representation to capture the user-item\nrelationship based on user behaviours. While existing representation learning\nmethods primarily focus on optimising item-based mechanisms, such as attention\nand sequential modelling. However, these methods lack a modelling mechanism to\ndirectly reflect user interests within the learned item representations.\nConsequently, these methods may be less effective in capturing user interests\nindirectly. To address this challenge, we propose a novel Interest-aware\nCapsule network (IaCN) recommendation model, a model-agnostic framework that\ndirectly learns interest-oriented item representations. IaCN serves as an\nauxiliary task, enabling the joint learning of both item-based and\ninterest-based representations. This framework adopts existing recommendation\nmodels without requiring substantial redesign. We evaluate the proposed\napproach on benchmark datasets, exploring various scenarios involving different\ndeep neural networks, behaviour sequence lengths, and joint learning ratios of\ninterest-oriented item representations. Experimental results demonstrate\nsignificant performance enhancements across diverse recommendation models,\nvalidating the effectiveness of our approach.\n","authors":["Amit Kumar Jaiswal","Yu Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.09202v1.pdf","comment":"Accepted Paper under LBR track in the Seventeenth ACM Conference on\n Recommender Systems (RecSys) 2023"},{"id":"http://arxiv.org/abs/2208.06746v3","updated":"2023-08-17T17:49:35Z","published":"2022-08-13T23:18:25Z","title":"Contrastive Counterfactual Learning for Causality-aware Interpretable\n Recommender Systems","summary":" The field of generating recommendations within the framework of causal\ninference has seen a recent surge, with recommendations being likened to\ntreatments. This approach enhances insights into the influence of\nrecommendations on user behavior and helps in identifying the underlying\nfactors. Existing research has often leveraged propensity scores to mitigate\nbias, albeit at the risk of introducing additional variance. Others have\nexplored the use of unbiased data from randomized controlled trials, although\nthis comes with assumptions that may prove challenging in practice. In this\npaper, we first present the causality-aware interpretation of recommendations\nand reveal how the underlying exposure mechanism can bias the maximum\nlikelihood estimation (MLE) of observational feedback. Recognizing that\nconfounders may be elusive, we propose a contrastive self-supervised learning\nto minimize exposure bias, employing inverse propensity scores and expanding\nthe positive sample set. 
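The contrastive counterfactual learning abstract above mentions inverse propensity scores for mitigating exposure bias. A toy sketch of that general idea, with synthetic numbers rather than the paper's CCL method: items are observed with unequal exposure probabilities, and re-weighting by 1/propensity recovers an unbiased estimate that a naive average misses.

# Toy IPS sketch: relevance, exposure probabilities, and sample size are made up.
import numpy as np

rng = np.random.default_rng(1)
n_items = 10_000
relevance = rng.binomial(1, 0.3, size=n_items)              # true relevance
propensity = np.where(relevance == 1, 0.8, 0.2)              # biased exposure
observed = rng.binomial(1, propensity).astype(bool)          # exposure mask

naive = relevance[observed].mean()                           # biased estimate
ips = (relevance[observed] / propensity[observed]).sum() / n_items  # IPS estimate
print(f"true={relevance.mean():.3f}  naive={naive:.3f}  ips={ips:.3f}")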
Building on this foundation, we present a novel\ncontrastive counterfactual learning method (CCL) that incorporates three unique\npositive sampling strategies grounded in estimated exposure probability or\nrandom counterfactual samples. Through extensive experiments on two real-world\ndatasets, we demonstrate that our CCL outperforms the state-of-the-art methods.\n","authors":["Guanglin Zhou","Chengkai Huang","Xiaocong Chen","Xiwei Xu","Chen Wang","Liming Zhu","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2208.06746v3.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.09096v1","updated":"2023-08-17T16:48:41Z","published":"2023-08-17T16:48:41Z","title":"Identity-Aware Semi-Supervised Learning for Comic Character\n Re-Identification","summary":" Character re-identification, recognizing characters consistently across\ndifferent panels in comics, presents significant challenges due to limited\nannotated data and complex variations in character appearances. To tackle this\nissue, we introduce a robust semi-supervised framework that combines metric\nlearning with a novel 'Identity-Aware' self-supervision method by contrastive\nlearning of face and body pairs of characters. Our approach involves processing\nboth facial and bodily features within a unified network architecture,\nfacilitating the extraction of identity-aligned character embeddings that\ncapture individual identities while preserving the effectiveness of face and\nbody features. This integrated character representation enhances feature\nextraction and improves character re-identification compared to\nre-identification by face or body independently, offering a parameter-efficient\nsolution. By extensively validating our method using in-series and inter-series\nevaluation metrics, we demonstrate its effectiveness in consistently\nre-identifying comic characters. Compared to existing methods, our approach not\nonly addresses the challenge of character re-identification but also serves as\na foundation for downstream tasks since it can produce character embeddings\nwithout restrictions of face and body availability, enriching the comprehension\nof comic books. In our experiments, we leverage two newly curated datasets: the\n'Comic Character Instances Dataset', comprising over a million character\ninstances and the 'Comic Sequence Identity Dataset', containing annotations of\nidentities within more than 3000 sets of four consecutive comic panels that we\ncollected.\n","authors":["Gürkan Soykan","Deniz Yuret","Tevfik Metin Sezgin"],"pdf_url":"https://arxiv.org/pdf/2308.09096v1.pdf","comment":"18 pages, 9 Figures"},{"id":"http://arxiv.org/abs/2308.09089v1","updated":"2023-08-17T16:38:30Z","published":"2023-08-17T16:38:30Z","title":"Bridging High-Quality Audio and Video via Language for Sound Effects\n Retrieval from Visual Queries","summary":" Finding the right sound effects (SFX) to match moments in a video is a\ndifficult and time-consuming task, and relies heavily on the quality and\ncompleteness of text metadata. Retrieving high-quality (HQ) SFX using a video\nframe directly as the query is an attractive alternative, removing the reliance\non text metadata and providing a low barrier to entry for non-experts. Due to\nthe lack of HQ audio-visual training data, previous work on audio-visual\nretrieval relies on YouTube (in-the-wild) videos of varied quality for\ntraining, where the audio is often noisy and the video of amateur quality. 
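The sound-effects retrieval abstract above frames the task as querying an audio bank with a video frame. A hedged, minimal sketch of embedding-based retrieval under that framing, using random vectors in place of trained audio/visual encoders:

# Sketch only: sfx_bank and query are hypothetical embeddings, not model outputs.
import numpy as np

rng = np.random.default_rng(2)
sfx_bank = rng.normal(size=(1000, 128))      # sound-effect embeddings
query = rng.normal(size=128)                 # video-frame embedding

def l2_normalize(x, axis=-1):
    return x / np.linalg.norm(x, axis=axis, keepdims=True)

scores = l2_normalize(sfx_bank) @ l2_normalize(query)   # cosine similarity
top5 = np.argsort(-scores)[:5]
print("top-5 SFX indices:", top5, "scores:", np.round(scores[top5], 3))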
As\nsuch it is unclear whether these systems would generalize to the task of\nmatching HQ audio to production-quality video. To address this, we propose a\nmultimodal framework for recommending HQ SFX given a video frame by (1)\nleveraging large language models and foundational vision-language models to\nbridge HQ audio and video to create audio-visual pairs, resulting in a highly\nscalable automatic audio-visual data curation pipeline; and (2) using\npre-trained audio and visual encoders to train a contrastive learning-based\nretrieval system. We show that our system, trained using our automatic data\ncuration pipeline, significantly outperforms baselines trained on in-the-wild\ndata on the task of HQ SFX retrieval for video. Furthermore, while the\nbaselines fail to generalize to this task, our system generalizes well from\nclean to in-the-wild data, outperforming the baselines on a dataset of YouTube\nvideos despite only being trained on the HQ audio-visual pairs. A user study\nconfirms that people prefer SFX retrieved by our system over the baseline 67%\nof the time both for HQ and in-the-wild data. Finally, we present ablations to\ndetermine the impact of model and data pipeline design choices on downstream\nretrieval performance. Please visit our project website to listen to and view\nour SFX retrieval results.\n","authors":["Julia Wilkins","Justin Salamon","Magdalena Fuentes","Juan Pablo Bello","Oriol Nieto"],"pdf_url":"https://arxiv.org/pdf/2308.09089v1.pdf","comment":"WASPAA 2023. Project page:\n https://juliawilkins.github.io/sound-effects-retrieval-from-video/. 4 pages,\n 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.09066v1","updated":"2023-08-17T15:54:21Z","published":"2023-08-17T15:54:21Z","title":"Uplift Modeling: from Causal Inference to Personalization","summary":" Uplift modeling is a collection of machine learning techniques for estimating\ncausal effects of a treatment at the individual or subgroup levels. Over the\nlast years, causality and uplift modeling have become key trends in\npersonalization at online e-commerce platforms, enabling the selection of the\nbest treatment for each user in order to maximize the target business metric.\nUplift modeling can be particularly useful for personalized promotional\ncampaigns, where the potential benefit caused by a promotion needs to be\nweighed against the potential costs. In this tutorial we will cover basic\nconcepts of causality and introduce the audience to state-of-the-art techniques\nin uplift modeling. We will discuss the advantages and the limitations of\ndifferent approaches and dive into the unique setup of constrained uplift\nmodeling. Finally, we will present real-life applications and discuss\nchallenges in implementing these models in production.\n","authors":["Felipe Moraes","Hugo Manuel Proença","Anastasiia Kornilova","Javier Albert","Dmitri Goldenberg"],"pdf_url":"https://arxiv.org/pdf/2308.09066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08911v1","updated":"2023-08-17T10:54:47Z","published":"2023-08-17T10:54:47Z","title":"Towards Filling the Gap in Conversational Search: From Passage Retrieval\n to Conversational Response Generation","summary":" Research on conversational search has so far mostly focused on query\nrewriting and multi-stage passage retrieval. However, synthesizing the top\nretrieved passages into a complete, relevant, and concise response is still an\nopen challenge. 
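The uplift modeling tutorial above estimates individual treatment effects to pick the best treatment per user. A common baseline for this, sketched here with synthetic data (this is the generic two-model "T-learner", not a method claimed by the tutorial): fit separate outcome models for treated and control users and score uplift as the difference of their predictions.

# Synthetic example; features, effect size, and noise are placeholders.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(3)
n = 5000
x = rng.normal(size=(n, 5))
treated = rng.binomial(1, 0.5, size=n).astype(bool)
true_uplift = 0.5 * (x[:, 0] > 0)            # only some users respond to treatment
outcome = 0.3 * x[:, 1] + treated * true_uplift + rng.normal(scale=0.1, size=n)

model_t = GradientBoostingRegressor().fit(x[treated], outcome[treated])
model_c = GradientBoostingRegressor().fit(x[~treated], outcome[~treated])
uplift_score = model_t.predict(x) - model_c.predict(x)
print("corr(estimated uplift, true uplift):",
      round(float(np.corrcoef(uplift_score, true_uplift)[0, 1]), 3))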
Having snippet-level annotations of relevant passages would\nenable both (1) the training of response generation models that are able to\nground answers in actual statements and (2) the automatic evaluation of the\ngenerated responses in terms of completeness. In this paper, we address the\nproblem of collecting high-quality snippet-level answer annotations for two of\nthe TREC Conversational Assistance track datasets. To ensure quality, we first\nperform a preliminary annotation study, employing different task designs,\ncrowdsourcing platforms, and workers with different qualifications. Based on\nthe outcomes of this study, we refine our annotation protocol before proceeding\nwith the full-scale data collection. Overall, we gather annotations for 1.8k\nquestion-paragraph pairs, each annotated by three independent crowd workers.\nThe process of collecting data at this magnitude also led to multiple insights\nabout the problem that can inform the design of future response-generation\nmethods. This is an extended version of the article published with the same\ntitle in the Proceedings of CIKM'23.\n","authors":["Weronika Łajewska","Krisztian Balog"],"pdf_url":"https://arxiv.org/pdf/2308.08911v1.pdf","comment":"Extended version of the paper that appeared in the Proceedings of the\n 32nd ACM International Conference on Information and Knowledge Management\n (CIKM '23)"},{"id":"http://arxiv.org/abs/2306.04487v2","updated":"2023-08-17T08:37:48Z","published":"2023-06-07T14:57:21Z","title":"Adaptive Vague Preference Policy Learning for Multi-round Conversational\n Recommendation","summary":" Conversational recommendation systems (CRS) effectively address information\nasymmetry by dynamically eliciting user preferences through multi-turn\ninteractions. Existing CRS widely assumes that users have clear preferences.\nUnder this assumption, the agent will completely trust the user feedback and\ntreat the accepted or rejected signals as strong indicators to filter items and\nreduce the candidate space, which may lead to the problem of over-filtering.\nHowever, in reality, users' preferences are often vague and volatile, with\nuncertainty about their desires and changing decisions during interactions.\n To address this issue, we introduce a novel scenario called Vague Preference\nMulti-round Conversational Recommendation (VPMCR), which considers users' vague\nand volatile preferences in CRS.VPMCR employs a soft estimation mechanism to\nassign a non-zero confidence score for all candidate items to be displayed,\nnaturally avoiding the over-filtering problem. In the VPMCR setting, we\nintroduce an solution called Adaptive Vague Preference Policy Learning (AVPPL),\nwhich consists of two main components: Uncertainty-aware Soft Estimation (USE)\nand Uncertainty-aware Policy Learning (UPL). 
USE estimates the uncertainty of\nusers' vague feedback and captures their dynamic preferences using a\nchoice-based preferences extraction module and a time-aware decaying strategy.\nUPL leverages the preference distribution estimated by USE to guide the\nconversation and adapt to changes in users' preferences to make recommendations\nor ask for attributes.\n Our extensive experiments demonstrate the effectiveness of our method in the\nVPMCR scenario, highlighting its potential for practical applications and\nimproving the overall performance and applicability of CRS in real-world\nsettings, particularly for users with vague or dynamic preferences.\n","authors":["Gangyi Zhang","Chongming Gao","Wenqiang Lei","Xiaojie Guo","Shijun Li","Lingfei Wu","Hongshen Chen","Zhuozhi Ding","Sulong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2306.04487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08799v1","updated":"2023-08-17T06:20:03Z","published":"2023-08-17T06:20:03Z","title":"Capturing Popularity Trends: A Simplistic Non-Personalized Approach for\n Enhanced Item Recommendation","summary":" Recommender systems have been gaining increasing research attention over the\nyears. Most existing recommendation methods focus on capturing users'\npersonalized preferences through historical user-item interactions, which may\npotentially violate user privacy. Additionally, these approaches often overlook\nthe significance of the temporal fluctuation in item popularity that can sway\nusers' decision-making. To bridge this gap, we propose Popularity-Aware\nRecommender (PARE), which makes non-personalized recommendations by predicting\nthe items that will attain the highest popularity. PARE consists of four\nmodules, each focusing on a different aspect: popularity history, temporal\nimpact, periodic impact, and side information. Finally, an attention layer is\nleveraged to fuse the outputs of four modules. To our knowledge, this is the\nfirst work to explicitly model item popularity in recommendation systems.\nExtensive experiments show that PARE performs on par or even better than\nsophisticated state-of-the-art recommendation methods. Since PARE prioritizes\nitem popularity over personalized user preferences, it can enhance existing\nrecommendation methods as a complementary component. Our experiments\ndemonstrate that integrating PARE with existing recommendation methods\nsignificantly surpasses the performance of standalone models, highlighting\nPARE's potential as a complement to existing recommendation methods.\nFurthermore, the simplicity of PARE makes it immensely practical for industrial\napplications and a valuable baseline for future research.\n","authors":["Jiazheng Jing","Yinan Zhang","Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2308.08799v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.08461v2","updated":"2023-08-17T05:30:03Z","published":"2023-08-13T08:10:56Z","title":"CDR: Conservative Doubly Robust Learning for Debiased Recommendation","summary":" In recommendation systems (RS), user behavior data is observational rather\nthan experimental, resulting in widespread bias in the data. Consequently,\ntackling bias has emerged as a major challenge in the field of recommendation\nsystems. Recently, Doubly Robust Learning (DR) has gained significant attention\ndue to its remarkable performance and robust properties. 
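The PARE abstract above ranks items by predicted future popularity using popularity history, temporal, and periodic signals. As a loose, hedged stand-in (an exponential moving average blended with a same-weekday mean, not PARE's attention-based modules), one could score items like this on synthetic daily interaction counts:

# Crude non-personalized popularity score; all data and weights are invented.
import numpy as np

rng = np.random.default_rng(4)
n_items, n_days = 50, 56
daily_counts = rng.poisson(lam=rng.uniform(1, 20, size=(n_items, 1)), size=(n_items, n_days))

def popularity_score(counts, alpha=0.3, period=7, w_periodic=0.3):
    ema = counts[:, 0].astype(float)
    for t in range(1, counts.shape[1]):
        ema = alpha * counts[:, t] + (1 - alpha) * ema       # recent trend
    next_weekday = counts.shape[1] % period
    periodic = counts[:, next_weekday::period].mean(axis=1)  # same-weekday mean
    return (1 - w_periodic) * ema + w_periodic * periodic

top10 = np.argsort(-popularity_score(daily_counts))[:10]
print("items predicted to be most popular next day:", top10)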
However, our\nexperimental findings indicate that existing DR methods are severely impacted\nby the presence of so-called Poisonous Imputation, where the imputation\nsignificantly deviates from the truth and becomes counterproductive.\n To address this issue, this work proposes Conservative Doubly Robust strategy\n(CDR) which filters imputations by scrutinizing their mean and variance.\nTheoretical analyses show that CDR offers reduced variance and improved tail\nbounds.In addition, our experimental investigations illustrate that CDR\nsignificantly enhances performance and can indeed reduce the frequency of\npoisonous imputation.\n","authors":["ZiJie Song","JiaWei Chen","Sheng Zhou","QiHao Shi","Yan Feng","Chun Chen","Can Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08756v1","updated":"2023-08-17T03:09:07Z","published":"2023-08-17T03:09:07Z","title":"Real-Time Construction Algorithm of Co-Occurrence Network Based on\n Inverted Index","summary":" Co-occurrence networks are an important method in the field of natural\nlanguage processing and text mining for discovering semantic relationships\nwithin texts. However, the traditional traversal algorithm for constructing\nco-occurrence networks has high time complexity and space complexity when\ndealing with large-scale text data. In this paper, we propose an optimized\nalgorithm based on inverted indexing and breadth-first search to improve the\nefficiency of co-occurrence network construction and reduce memory consumption.\nFirstly, the traditional traversal algorithm is analyzed, and its performance\nissues in constructing co-occurrence networks are identified. Then, the\ndetailed implementation process of the optimized algorithm is presented.\nSubsequently, the CSL large-scale Chinese scientific literature dataset is used\nfor experimental validation, comparing the performance of the traditional\ntraversal algorithm and the optimized algorithm in terms of running time and\nmemory usage. Finally, using non-parametric test methods, the optimized\nalgorithm is proven to have significantly better performance than the\ntraditional traversal algorithm. The research in this paper provides an\neffective method for the rapid construction of co-occurrence networks,\ncontributing to the further development of the Information Organization fields.\n","authors":["Jiahao Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.08756v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.11527v1","updated":"2023-08-17T08:25:54Z","published":"2023-08-17T08:25:54Z","title":"BERT4CTR: An Efficient Framework to Combine Pre-trained Language Model\n with Non-textual Features for CTR Prediction","summary":" Although deep pre-trained language models have shown promising benefit in a\nlarge set of industrial scenarios, including Click-Through-Rate (CTR)\nprediction, how to integrate pre-trained language models that handle only\ntextual signals into a prediction pipeline with non-textual features is\nchallenging.\n Up to now two directions have been explored to integrate multi-modal inputs\nin fine-tuning of pre-trained language models. One consists of fusing the\noutcome of language models and non-textual features through an aggregation\nlayer, resulting into ensemble framework, where the cross-information between\ntextual and non-textual inputs are only learned in the aggregation layer. 
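The co-occurrence network abstract above replaces pairwise document scans with an inverted index. A simplified sketch of that construction: map each term to the set of documents containing it, then derive co-occurrence edge weights from intersections of posting lists.

# Toy corpus; real pipelines would add tokenization, filtering, and pruning.
from collections import defaultdict
from itertools import combinations

docs = [
    "graph neural network training",
    "language model training data",
    "graph network message passing",
]

inverted = defaultdict(set)                  # term -> set of doc ids
for doc_id, text in enumerate(docs):
    for term in set(text.split()):
        inverted[term].add(doc_id)

# Co-occurrence edge weight = size of the intersection of two posting lists.
edges = {}
for a, b in combinations(sorted(inverted), 2):
    weight = len(inverted[a] & inverted[b])
    if weight:
        edges[(a, b)] = weight

print(edges[("graph", "network")])   # -> 2 (docs 0 and 2 contain both terms)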
The\nsecond one consists of splitting non-textual features into fine-grained\nfragments and transforming the fragments to new tokens combined with textual\nones, so that they can be fed directly to transformer layers in language\nmodels. However, this approach increases the complexity of the learning and\ninference because of the numerous additional tokens.\n To address these limitations, we propose in this work a novel framework\nBERT4CTR, with the Uni-Attention mechanism that can benefit from the\ninteractions between non-textual and textual features while maintaining low\ntime-costs in training and inference through a dimensionality reduction.\nComprehensive experiments on both public and commercial data demonstrate that\nBERT4CTR can outperform significantly the state-of-the-art frameworks to handle\nmulti-modal inputs and be applicable to CTR prediction.\n","authors":["Dong Wang","Kavé Salamatian","Yunqing Xia","Weiwei Deng","Qi Zhiang"],"pdf_url":"https://arxiv.org/pdf/2308.11527v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2203.04275v6","updated":"2023-08-17T22:45:06Z","published":"2022-03-08T18:49:34Z","title":"Robust Multi-Task Learning and Online Refinement for Spacecraft Pose\n Estimation across Domain Gap","summary":" This work presents Spacecraft Pose Network v2 (SPNv2), a Convolutional Neural\nNetwork (CNN) for pose estimation of noncooperative spacecraft across domain\ngap. SPNv2 is a multi-scale, multi-task CNN which consists of a shared\nmulti-scale feature encoder and multiple prediction heads that perform\ndifferent tasks on a shared feature output. These tasks are all related to\ndetection and pose estimation of a target spacecraft from an image, such as\nprediction of pre-defined satellite keypoints, direct pose regression, and\nbinary segmentation of the satellite foreground. It is shown that by jointly\ntraining on different yet related tasks with extensive data augmentations on\nsynthetic images only, the shared encoder learns features that are common\nacross image domains that have fundamentally different visual characteristics\ncompared to synthetic images. This work also introduces Online Domain\nRefinement (ODR) which refines the parameters of the normalization layers of\nSPNv2 on the target domain images online at deployment. Specifically, ODR\nperforms self-supervised entropy minimization of the predicted satellite\nforeground, thereby improving the CNN's performance on the target domain images\nwithout their pose labels and with minimal computational efforts. The GitHub\nrepository for SPNv2 is available at https://github.com/tpark94/spnv2.\n","authors":["Tae Ha Park","Simone D'Amico"],"pdf_url":"https://arxiv.org/pdf/2203.04275v6.pdf","comment":"Accepted to Advances in Space Research; fixed error on reporting\n translation from heatmaps"},{"id":"http://arxiv.org/abs/2306.02618v2","updated":"2023-08-17T22:43:01Z","published":"2023-06-05T06:36:18Z","title":"Enhance Diffusion to Improve Robust Generalization","summary":" Deep neural networks are susceptible to human imperceptible adversarial\nperturbations. One of the strongest defense mechanisms is \\emph{Adversarial\nTraining} (AT). In this paper, we aim to address two predominant problems in\nAT. First, there is still little consensus on how to set hyperparameters with a\nperformance guarantee for AT research, and customized settings impede a fair\ncomparison between different model designs in AT research. 
Second, the robustly\ntrained neural networks struggle to generalize well and suffer from tremendous\noverfitting. This paper focuses on the primary AT framework - Projected\nGradient Descent Adversarial Training (PGD-AT). We approximate the dynamic of\nPGD-AT by a continuous-time Stochastic Differential Equation (SDE), and show\nthat the diffusion term of this SDE determines the robust generalization. An\nimmediate implication of this theoretical finding is that robust generalization\nis positively correlated with the ratio between learning rate and batch size.\nWe further propose a novel approach, \\emph{Diffusion Enhanced Adversarial\nTraining} (DEAT), to manipulate the diffusion term to improve robust\ngeneralization with virtually no extra computational burden. We theoretically\nshow that DEAT obtains a tighter generalization bound than PGD-AT. Our\nempirical investigation is extensive and firmly attests that DEAT universally\noutperforms PGD-AT by a significant margin.\n","authors":["Jianhui Sun","Sanchit Sinha","Aidong Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.02618v2.pdf","comment":"Accepted at KDD 2023"},{"id":"http://arxiv.org/abs/2308.09202v1","updated":"2023-08-17T22:40:59Z","published":"2023-08-17T22:40:59Z","title":"A Model-Agnostic Framework for Recommendation via Interest-aware Item\n Embeddings","summary":" Item representation holds significant importance in recommendation systems,\nwhich encompasses domains such as news, retail, and videos. Retrieval and\nranking models utilise item representation to capture the user-item\nrelationship based on user behaviours. While existing representation learning\nmethods primarily focus on optimising item-based mechanisms, such as attention\nand sequential modelling. However, these methods lack a modelling mechanism to\ndirectly reflect user interests within the learned item representations.\nConsequently, these methods may be less effective in capturing user interests\nindirectly. To address this challenge, we propose a novel Interest-aware\nCapsule network (IaCN) recommendation model, a model-agnostic framework that\ndirectly learns interest-oriented item representations. IaCN serves as an\nauxiliary task, enabling the joint learning of both item-based and\ninterest-based representations. This framework adopts existing recommendation\nmodels without requiring substantial redesign. We evaluate the proposed\napproach on benchmark datasets, exploring various scenarios involving different\ndeep neural networks, behaviour sequence lengths, and joint learning ratios of\ninterest-oriented item representations. Experimental results demonstrate\nsignificant performance enhancements across diverse recommendation models,\nvalidating the effectiveness of our approach.\n","authors":["Amit Kumar Jaiswal","Yu Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.09202v1.pdf","comment":"Accepted Paper under LBR track in the Seventeenth ACM Conference on\n Recommender Systems (RecSys) 2023"},{"id":"http://arxiv.org/abs/2308.09201v1","updated":"2023-08-17T22:32:32Z","published":"2023-08-17T22:32:32Z","title":"TinyProp -- Adaptive Sparse Backpropagation for Efficient TinyML\n On-device Learning","summary":" Training deep neural networks using backpropagation is very memory and\ncomputationally intensive. This makes it difficult to run on-device learning or\nfine-tune neural networks on tiny, embedded devices such as low-power\nmicro-controller units (MCUs). 
Sparse backpropagation algorithms try to reduce\nthe computational load of on-device learning by training only a subset of the\nweights and biases. Existing approaches use a static number of weights to\ntrain. A poor choice of this so-called backpropagation ratio limits either the\ncomputational gain or can lead to severe accuracy losses. In this paper we\npresent TinyProp, the first sparse backpropagation method that dynamically\nadapts the back-propagation ratio during on-device training for each training\nstep. TinyProp induces a small calculation overhead to sort the elements of the\ngradient, which does not significantly impact the computational gains. TinyProp\nworks particularly well on fine-tuning trained networks on MCUs, which is a\ntypical use case for embedded applications. For typical datasets from three\ndatasets MNIST, DCASE2020 and CIFAR10, we are 5 times faster compared to\nnon-sparse training with an accuracy loss of on average 1%. On average,\nTinyProp is 2.9 times faster than existing, static sparse backpropagation\nalgorithms and the accuracy loss is reduced on average by 6 % compared to a\ntypical static setting of the back-propagation ratio.\n","authors":["Marcus Rüb","Daniel Maier","Daniel Mueller-Gritschneder","Axel Sikora"],"pdf_url":"https://arxiv.org/pdf/2308.09201v1.pdf","comment":"7 Pages, AIPE Conference 2023"},{"id":"http://arxiv.org/abs/2308.09199v1","updated":"2023-08-17T22:26:48Z","published":"2023-08-17T22:26:48Z","title":"Polynomial Bounds for Learning Noisy Optical Physical Unclonable\n Functions and Connections to Learning With Errors","summary":" It is shown that a class of optical physical unclonable functions (PUFs) can\nbe learned to arbitrary precision with arbitrarily high probability, even in\nthe presence of noise, given access to polynomially many challenge-response\npairs and polynomially bounded computational power, under mild assumptions\nabout the distributions of the noise and challenge vectors. This extends the\nresults of Rh\\\"uramir et al. (2013), who showed a subset of this class of PUFs\nto be learnable in polynomial time in the absence of noise, under the\nassumption that the optics of the PUF were either linear or had negligible\nnonlinear effects. We derive polynomial bounds for the required number of\nsamples and the computational complexity of a linear regression algorithm,\nbased on size parameters of the PUF, the distributions of the challenge and\nnoise vectors, and the probability and accuracy of the regression algorithm,\nwith a similar analysis to one done by Bootle et al. (2018), who demonstrated a\nlearning attack on a poorly implemented version of the Learning With Errors\nproblem.\n","authors":["Apollo Albright","Boris Gelfand","Michael Dixon"],"pdf_url":"https://arxiv.org/pdf/2308.09199v1.pdf","comment":"10 pages, 2 figures, submitted to IEEE Transactions on Information\n Forensics and Security"},{"id":"http://arxiv.org/abs/2308.09198v1","updated":"2023-08-17T22:24:15Z","published":"2023-08-17T22:24:15Z","title":"Half-Hop: A graph upsampling approach for slowing down message passing","summary":" Message passing neural networks have shown a lot of success on\ngraph-structured data. However, there are many instances where message passing\ncan lead to over-smoothing or fail when neighboring nodes belong to different\nclasses. In this work, we introduce a simple yet general framework for\nimproving learning in message passing neural networks. 
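The TinyProp abstract above trains only a subset of weights by sorting gradient elements. A generic sketch of that kind of sparse update on a single dense layer (the fixed ratio here is a placeholder; TinyProp adapts it per training step, which this sketch does not reproduce):

# One sparse gradient step on a softmax layer; data and sizes are synthetic.
import numpy as np

rng = np.random.default_rng(5)
w = rng.normal(scale=0.1, size=(32, 10))      # weights of one dense layer
x = rng.normal(size=(8, 32))                  # mini-batch activations
y = rng.integers(0, 10, size=8)               # integer class labels

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

probs = softmax(x @ w)
probs[np.arange(8), y] -= 1.0                 # dL/dlogits for cross-entropy
grad = x.T @ probs / 8                        # full gradient, shape (32, 10)

ratio = 0.25                                  # update only 25% of the weights
k = int(ratio * grad.size)
threshold = np.partition(np.abs(grad).ravel(), -k)[-k]
mask = np.abs(grad) >= threshold              # keep largest-|gradient| entries
w -= 0.1 * grad * mask                        # sparse update
print("updated weights:", int(mask.sum()), "of", grad.size)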
Our approach essentially\nupsamples edges in the original graph by adding \"slow nodes\" at each edge that\ncan mediate communication between a source and a target node. Our method only\nmodifies the input graph, making it plug-and-play and easy to use with existing\nmodels. To understand the benefits of slowing down message passing, we provide\ntheoretical and empirical analyses. We report results on several supervised and\nself-supervised benchmarks, and show improvements across the board, notably in\nheterophilic conditions where adjacent nodes are more likely to have different\nlabels. Finally, we show how our approach can be used to generate augmentations\nfor self-supervised learning, where slow nodes are randomly introduced into\ndifferent edges in the graph to generate multi-scale views with variable path\nlengths.\n","authors":["Mehdi Azabou","Venkataramana Ganesh","Shantanu Thakoor","Chi-Heng Lin","Lakshmi Sathidevi","Ran Liu","Michal Valko","Petar Veličković","Eva L. Dyer"],"pdf_url":"https://arxiv.org/pdf/2308.09198v1.pdf","comment":"Published as a conference paper at ICML 2023"},{"id":"http://arxiv.org/abs/2308.09193v1","updated":"2023-08-17T21:36:56Z","published":"2023-08-17T21:36:56Z","title":"A Comparative Study of Text Embedding Models for Semantic Text\n Similarity in Bug Reports","summary":" Bug reports are an essential aspect of software development, and it is\ncrucial to identify and resolve them quickly to ensure the consistent\nfunctioning of software systems. Retrieving similar bug reports from an\nexisting database can help reduce the time and effort required to resolve bugs.\nIn this paper, we compared the effectiveness of semantic textual similarity\nmethods for retrieving similar bug reports based on a similarity score. We\nexplored several embedding models such as TF-IDF (Baseline), FastText, Gensim,\nBERT, and ADA. We used the Software Defects Data containing bug reports for\nvarious software projects to evaluate the performance of these models. Our\nexperimental results showed that BERT generally outperformed the rest of the\nmodels regarding recall, followed by ADA, Gensim, FastText, and TFIDF. Our\nstudy provides insights into the effectiveness of different embedding methods\nfor retrieving similar bug reports and highlights the impact of selecting the\nappropriate one for this task. Our code is available on GitHub.\n","authors":["Avinash Patil","Kihwan Han","Sabyasachi Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2308.09193v1.pdf","comment":"7 Pages"},{"id":"http://arxiv.org/abs/2308.09189v1","updated":"2023-08-17T21:24:34Z","published":"2023-08-17T21:24:34Z","title":"Regularizing Adversarial Imitation Learning Using Causal Invariance","summary":" Imitation learning methods are used to infer a policy in a Markov decision\nprocess from a dataset of expert demonstrations by minimizing a divergence\nmeasure between the empirical state occupancy measures of the expert and the\npolicy. The guiding signal to the policy is provided by the discriminator used\nas part of an versarial optimization procedure. We observe that this model is\nprone to absorbing spurious correlations present in the expert data. To\nalleviate this issue, we propose to use causal invariance as a regularization\nprinciple for adversarial training of these models. The regularization\nobjective is applicable in a straightforward manner to existing adversarial\nimitation frameworks. 
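The Half-Hop abstract above upsamples a graph by inserting a "slow node" on each edge while leaving the model untouched. A minimal edge-list sketch of that transform (function name and representation are my own, not the paper's code):

def add_slow_nodes(edges, num_nodes):
    # Replace each directed edge (u, v) with u -> s and s -> v,
    # where s is a freshly created intermediate ("slow") node.
    new_edges, next_id = [], num_nodes
    for u, v in edges:
        s = next_id
        next_id += 1
        new_edges.extend([(u, s), (s, v)])
    return new_edges, next_id            # upsampled edge list, new node count

edges = [(0, 1), (1, 2), (2, 0)]
new_edges, n = add_slow_nodes(edges, num_nodes=3)
print(new_edges)   # [(0, 3), (3, 1), (1, 4), (4, 2), (2, 5), (5, 0)]
print(n)           # 6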
We demonstrate the efficacy of the regularized\nformulation in an illustrative two-dimensional setting as well as a number of\nhigh-dimensional robot locomotion benchmark tasks.\n","authors":["Ivan Ovinnikov","Joachim M. Buhmann"],"pdf_url":"https://arxiv.org/pdf/2308.09189v1.pdf","comment":"Published at the ICML 2023 Workshop on Spurious Correlations,\n Invariance, and Stability"},{"id":"http://arxiv.org/abs/2308.09187v1","updated":"2023-08-17T21:15:04Z","published":"2023-08-17T21:15:04Z","title":"Distributed Extra-gradient with Optimal Complexity and Communication\n Guarantees","summary":" We consider monotone variational inequality (VI) problems in multi-GPU\nsettings where multiple processors/workers/clients have access to local\nstochastic dual vectors. This setting includes a broad range of important\nproblems from distributed convex minimization to min-max and games.\nExtra-gradient, which is a de facto algorithm for monotone VI problems, has not\nbeen designed to be communication-efficient. To this end, we propose a\nquantized generalized extra-gradient (Q-GenX), which is an unbiased and\nadaptive compression method tailored to solve VIs. We provide an adaptive\nstep-size rule, which adapts to the respective noise profiles at hand and\nachieve a fast rate of ${\\mathcal O}(1/T)$ under relative noise, and an\norder-optimal ${\\mathcal O}(1/\\sqrt{T})$ under absolute noise and show\ndistributed training accelerates convergence. Finally, we validate our\ntheoretical results by providing real-world experiments and training generative\nadversarial networks on multiple GPUs.\n","authors":["Ali Ramezani-Kebrya","Kimon Antonakopoulos","Igor Krawczuk","Justin Deschenaux","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2308.09187v1.pdf","comment":"International Conference on Learning Representations (ICLR 2023)"},{"id":"http://arxiv.org/abs/2306.05501v2","updated":"2023-08-17T20:58:29Z","published":"2023-06-08T18:49:23Z","title":"Robust Framework for Explanation Evaluation in Time Series\n Classification","summary":" Time series classification is a task which deals with a prevalent data type\nin domains such as human activity recognition, sports analytics and general\nhealthcare. This paper provides a framework to quantitatively evaluate and rank\nexplanation methods for time series classification. The recent interest in\nexplanation methods for time series has provided a great variety of explanation\ntechniques. Nevertheless, when the explanations disagree on a specific problem,\nit remains unclear which of them to use. Comparing multiple explanations to\nfind the right answer is non-trivial. Two key challenges remain: how to\nquantitatively and robustly evaluate the informativeness of a given explanation\nmethod (i.e., relevance for the classification task), and how to compare\nexplanation methods side-by-side. We propose AMEE, a robust Model-Agnostic\nExplanation Evaluation framework for quantifying and comparing multiple\nsaliency-based explanations for time series classification. Data perturbation\nis added to the input time series guided by the saliency maps. The impact of\nperturbation on classification accuracy is measured and used for explanation\nevaluation. The results show that perturbing discriminative parts of the time\nseries leads to significant changes in classification accuracy. To be robust to\ndifferent types of perturbations and different types of classifiers, we\naggregate the accuracy loss across perturbations and classifiers. 
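The AMEE abstract above evaluates explanations by perturbing the time steps they mark as salient and measuring the classifier's accuracy loss. A rough, hedged sketch of that evaluation loop on synthetic data (zero-masking as the perturbation, a linear classifier, and two toy "explanations" are all my own simplifications):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(6)
n, length = 600, 50
y = rng.integers(0, 2, size=n)
x = rng.normal(size=(n, length))
x[:, 20:25] += y[:, None] * 2.0                   # class signal lives at t=20..24

clf = LogisticRegression(max_iter=1000).fit(x, y)
base_acc = clf.score(x, y)

def accuracy_after_masking(saliency, top_k=5):
    idx = np.argsort(-saliency)[:top_k]           # most salient time steps
    x_pert = x.copy()
    x_pert[:, idx] = 0.0                          # perturb the salient region
    return clf.score(x_pert, y)

informative = np.zeros(length); informative[20:25] = 1.0   # points at the signal
uninformative = rng.random(length)                          # random saliency
print("baseline:", round(base_acc, 3),
      "mask informative:", round(accuracy_after_masking(informative), 3),
      "mask random:", round(accuracy_after_masking(uninformative), 3))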
This allows\nus to objectively quantify and rank different explanation methods. We provide a\nquantitative and qualitative analysis for synthetic datasets, a variety of\ntime-series datasets, as well as a real-world dataset with known expert ground\ntruth.\n","authors":["Thu Trang Nguyen","Thach Le Nguyen","Georgiana Ifrim"],"pdf_url":"https://arxiv.org/pdf/2306.05501v2.pdf","comment":"Pre-print"},{"id":"http://arxiv.org/abs/2308.09183v1","updated":"2023-08-17T20:54:39Z","published":"2023-08-17T20:54:39Z","title":"RatGPT: Turning online LLMs into Proxies for Malware Attacks","summary":" The evolution of Generative AI and the capabilities of the newly released\nLarge Language Models (LLMs) open new opportunities in software engineering.\nHowever, they also lead to new challenges in cybersecurity. Recently,\nresearchers have shown the possibilities of using LLMs such as ChatGPT to\ngenerate malicious content that can directly be exploited or guide\ninexperienced hackers to weaponize tools and code. Those studies covered\nscenarios that still require the attacker in the middle of the loop. In this\nstudy, we leverage openly available plugins and use an LLM as proxy between the\nattacker and the victim. We deliver a proof-of-concept where ChatGPT is used\nfor the dissemination of malicious software while evading detection, alongside\nestablishing the communication to a command and control (C2) server to receive\ncommands to interact with a victim's system. Finally, we present the general\napproach as well as essential elements in order to stay undetected and make the\nattack a success. This proof-of-concept highlights significant cybersecurity\nissues with openly available plugins and LLMs, which require the development of\nsecurity guidelines, controls, and mitigation strategies.\n","authors":["Mika Beckerich","Laura Plein","Sergio Coronado"],"pdf_url":"https://arxiv.org/pdf/2308.09183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.09702v7","updated":"2023-08-17T20:28:36Z","published":"2022-05-19T17:11:45Z","title":"Parallel and Distributed Graph Neural Networks: An In-Depth Concurrency\n Analysis","summary":" Graph neural networks (GNNs) are among the most powerful tools in deep\nlearning. They routinely solve complex problems on unstructured networks, such\nas node classification, graph classification, or link prediction, with high\naccuracy. However, both inference and training of GNNs are complex, and they\nuniquely combine the features of irregular graph processing with dense and\nregular computations. This complexity makes it very challenging to execute GNNs\nefficiently on modern massively parallel architectures. To alleviate this, we\nfirst design a taxonomy of parallelism in GNNs, considering data and model\nparallelism, and different forms of pipelining. Then, we use this taxonomy to\ninvestigate the amount of parallelism in numerous GNN models, GNN-driven\nmachine learning tasks, software frameworks, or hardware accelerators. We use\nthe work-depth model, and we also assess communication volume and\nsynchronization. We specifically focus on the sparsity/density of the\nassociated tensors, in order to understand how to effectively apply techniques\nsuch as vectorization. We also formally analyze GNN pipelining, and we\ngeneralize the established Message-Passing class of GNN models to cover\narbitrary pipeline depths, facilitating future optimizations. 
Finally, we\ninvestigate different forms of asynchronicity, navigating the path for future\nasynchronous parallel GNN pipelines. The outcomes of our analysis are\nsynthesized in a set of insights that help to maximize GNN performance, and a\ncomprehensive list of challenges and opportunities for further research into\nefficient GNN computations. Our work will help to advance the design of future\nGNNs.\n","authors":["Maciej Besta","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2205.09702v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09175v1","updated":"2023-08-17T20:27:33Z","published":"2023-08-17T20:27:33Z","title":"Diversifying AI: Towards Creative Chess with AlphaZero","summary":" In recent years, Artificial Intelligence (AI) systems have surpassed human\nintelligence in a variety of computational tasks. However, AI systems, like\nhumans, make mistakes, have blind spots, hallucinate, and struggle to\ngeneralize to new situations. This work explores whether AI can benefit from\ncreative decision-making mechanisms when pushed to the limits of its\ncomputational rationality. In particular, we investigate whether a team of\ndiverse AI systems can outperform a single AI in challenging tasks by\ngenerating more ideas as a group and then selecting the best ones. We study\nthis question in the game of chess, the so-called drosophila of AI. We build on\nAlphaZero (AZ) and extend it to represent a league of agents via a\nlatent-conditioned architecture, which we call AZ_db. We train AZ_db to\ngenerate a wider range of ideas using behavioral diversity techniques and\nselect the most promising ones with sub-additive planning. Our experiments\nsuggest that AZ_db plays chess in diverse ways, solves more puzzles as a group\nand outperforms a more homogeneous team. Notably, AZ_db solves twice as many\nchallenging puzzles as AZ, including the challenging Penrose positions. When\nplaying chess from different openings, we notice that players in AZ_db\nspecialize in different openings, and that selecting a player for each opening\nusing sub-additive planning results in a 50 Elo improvement over AZ. Our\nfindings suggest that diversity bonuses emerge in teams of AI agents, just as\nthey do in teams of humans and that diversity is a valuable asset in solving\ncomputationally hard problems.\n","authors":["Tom Zahavy","Vivek Veeriah","Shaobo Hou","Kevin Waugh","Matthew Lai","Edouard Leurent","Nenad Tomasev","Lisa Schut","Demis Hassabis","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2308.09175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.03211v8","updated":"2023-08-17T20:27:29Z","published":"2022-08-05T14:54:08Z","title":"Why do networks have inhibitory/negative connections?","summary":" Why do brains have inhibitory connections? Why do deep networks have negative\nweights? We propose an answer from the perspective of representation capacity.\nWe believe representing functions is the primary role of both (i) the brain in\nnatural intelligence, and (ii) deep networks in artificial intelligence. Our\nanswer to why there are inhibitory/negative weights is: to learn more\nfunctions. We prove that, in the absence of negative weights, neural networks\nwith non-decreasing activation functions are not universal approximators. 
While\nthis may be an intuitive result to some, to the best of our knowledge, there is\nno formal theory, in either machine learning or neuroscience, that demonstrates\nwhy negative weights are crucial in the context of representation capacity.\nFurther, we provide insights on the geometric properties of the representation\nspace that non-negative deep networks cannot represent. We expect these\ninsights will yield a deeper understanding of more sophisticated inductive\npriors imposed on the distribution of weights that lead to more efficient\nbiological and machine learning.\n","authors":["Qingyang Wang","Michael A. Powell","Ali Geisa","Eric Bridgeford","Carey E. Priebe","Joshua T. Vogelstein"],"pdf_url":"https://arxiv.org/pdf/2208.03211v8.pdf","comment":"ICCV2023 camera-ready"},{"id":"http://arxiv.org/abs/2305.11742v2","updated":"2023-08-17T20:10:54Z","published":"2023-05-19T15:28:02Z","title":"MedLens: Improve Mortality Prediction Via Medical Signs Selecting and\n Regression","summary":" Monitoring the health status of patients and predicting mortality in advance\nis vital for providing patients with timely care and treatment. Massive medical\nsigns in electronic health records (EHR) are fitted into advanced machine\nlearning models to make predictions. However, the data-quality problem of\noriginal clinical signs is less discussed in the literature. Based on an\nin-depth measurement of the missing rate and correlation score across various\nmedical signs and a large amount of patient hospital admission records, we\ndiscovered the comprehensive missing rate is extremely high, and a large number\nof useless signs could hurt the performance of prediction models. Then we\nconcluded that only improving data-quality could improve the baseline accuracy\nof different prediction algorithms. We designed MEDLENS, with an automatic\nvital medical signs selection approach via statistics and a flexible\ninterpolation approach for high missing rate time series. After augmenting the\ndata-quality of original medical signs, MEDLENS applies ensemble classifiers to\nboost the accuracy and reduce the computation overhead at the same time. It\nachieves a very high accuracy performance of 0.96 AUC-ROC and 0.81 AUC-PR,\nwhich exceeds the previous benchmark.\n","authors":["Xuesong Ye","Jun Wu","Chengjie Mou","Weinan Dai"],"pdf_url":"https://arxiv.org/pdf/2305.11742v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09171v1","updated":"2023-08-17T20:09:33Z","published":"2023-08-17T20:09:33Z","title":"Forensic Data Analytics for Anomaly Detection in Evolving Networks","summary":" In the prevailing convergence of traditional infrastructure-based deployment\n(i.e., Telco and industry operational networks) towards evolving deployments\nenabled by 5G and virtualization, there is a keen interest in elaborating\neffective security controls to protect these deployments in-depth. By\nconsidering key enabling technologies like 5G and virtualization, evolving\nnetworks are democratized, facilitating the establishment of point presences\nintegrating different business models ranging from media, dynamic web content,\ngaming, and a plethora of IoT use cases. Despite the increasing services\nprovided by evolving networks, many cybercrimes and attacks have been launched\nin evolving networks to perform malicious activities. Due to the limitations of\ntraditional security artifacts (e.g., firewalls and intrusion detection\nsystems), the research on digital forensic data analytics has attracted more\nattention. 
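The forensic analytics abstract above includes an unsupervised anomaly detection stage over engineered network features. A hedged sketch of such a stage using an isolation forest on synthetic flow-like features (feature names, contamination rate, and thresholds are placeholders, not the chapter's pipeline):

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(7)
normal = rng.normal(loc=[500, 50, 0.2], scale=[100, 10, 0.05], size=(980, 3))
attacks = rng.normal(loc=[5000, 400, 0.9], scale=[500, 50, 0.05], size=(20, 3))
flows = np.vstack([normal, attacks])              # bytes, packets, error-rate

model = IsolationForest(contamination=0.02, random_state=0).fit(flows)
flags = model.predict(flows)                      # -1 = anomaly, 1 = normal
print("flagged records:", int((flags == -1).sum()), "of", len(flows))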
Digital forensic analytics enables people to derive detailed\ninformation and comprehensive conclusions from different perspectives of\ncybercrimes to assist in convicting criminals and preventing future crimes.\nThis chapter presents a digital analytics framework for network anomaly\ndetection, including multi-perspective feature engineering, unsupervised\nanomaly detection, and comprehensive result correction procedures. Experiments\non real-world evolving network data show the effectiveness of the proposed\nforensic data analytics solution.\n","authors":["Li Yang","Abdallah Moubayed","Abdallah Shami","Amine Boukhtouta","Parisa Heidari","Stere Preda","Richard Brunner","Daniel Migault","Adel Larabi"],"pdf_url":"https://arxiv.org/pdf/2308.09171v1.pdf","comment":"Electronic version of an article published as [Book Series: World\n Scientific Series in Digital Forensics and Cybersecurity, Volume 2,\n Innovations in Digital Forensics, 2023, Pages 99-137]\n [DOI:10.1142/9789811273209_0004] \\c{opyright} copyright World Scientific\n Publishing Company [https://doi.org/10.1142/9789811273209_0004]"},{"id":"http://arxiv.org/abs/2206.04030v4","updated":"2023-08-17T20:05:36Z","published":"2022-06-08T17:42:18Z","title":"High-dimensional limit theorems for SGD: Effective dynamics and critical\n scaling","summary":" We study the scaling limits of stochastic gradient descent (SGD) with\nconstant step-size in the high-dimensional regime. We prove limit theorems for\nthe trajectories of summary statistics (i.e., finite-dimensional functions) of\nSGD as the dimension goes to infinity. Our approach allows one to choose the\nsummary statistics that are tracked, the initialization, and the step-size. It\nyields both ballistic (ODE) and diffusive (SDE) limits, with the limit\ndepending dramatically on the former choices. We show a critical scaling regime\nfor the step-size, below which the effective ballistic dynamics matches\ngradient flow for the population loss, but at which, a new correction term\nappears which changes the phase diagram. About the fixed points of this\neffective dynamics, the corresponding diffusive limits can be quite complex and\neven degenerate. We demonstrate our approach on popular examples including\nestimation for spiked matrix and tensor models and classification via two-layer\nnetworks for binary and XOR-type Gaussian mixture models. These examples\nexhibit surprising phenomena including multimodal timescales to convergence as\nwell as convergence to sub-optimal solutions with probability bounded away from\nzero from random (e.g., Gaussian) initializations. At the same time, we\ndemonstrate the benefit of overparametrization by showing that the latter\nprobability goes to zero as the second layer width grows.\n","authors":["Gerard Ben Arous","Reza Gheissari","Aukosh Jagannath"],"pdf_url":"https://arxiv.org/pdf/2206.04030v4.pdf","comment":"43 pages, 11 figures"},{"id":"http://arxiv.org/abs/2302.02792v2","updated":"2023-08-17T19:58:03Z","published":"2023-02-06T14:10:53Z","title":"Dealing With Non-stationarity in Decentralized Cooperative Multi-Agent\n Deep Reinforcement Learning via Multi-Timescale Learning","summary":" Decentralized cooperative multi-agent deep reinforcement learning (MARL) can\nbe a versatile learning framework, particularly in scenarios where centralized\ntraining is either not possible or not practical. One of the critical\nchallenges in decentralized deep MARL is the non-stationarity of the learning\nenvironment when multiple agents are learning concurrently. 
A commonly used and\nefficient scheme for decentralized MARL is independent learning in which agents\nconcurrently update their policies independently of each other. We first show\nthat independent learning does not always converge, while sequential learning\nwhere agents update their policies one after another in a sequence is\nguaranteed to converge to an agent-by-agent optimal solution. In sequential\nlearning, when one agent updates its policy, all other agent's policies are\nkept fixed, alleviating the challenge of non-stationarity due to simultaneous\nupdates in other agents' policies. However, it can be slow because only one\nagent is learning at any time. Therefore it might also not always be practical.\nIn this work, we propose a decentralized cooperative MARL algorithm based on\nmulti-timescale learning. In multi-timescale learning, all agents learn\nsimultaneously, but at different learning rates. In our proposed method, when\none agent updates its policy, other agents are allowed to update their policies\nas well, but at a slower rate. This speeds up sequential learning, while also\nminimizing non-stationarity caused by other agents updating concurrently.\nMulti-timescale learning outperforms state-of-the-art decentralized learning\nmethods on a set of challenging multi-agent cooperative tasks in the\nepymarl(Papoudakis et al., 2020) benchmark. This can be seen as a first step\ntowards more general decentralized cooperative deep MARL methods based on\nmulti-timescale learning.\n","authors":["Hadi Nekoei","Akilesh Badrinaaraayanan","Amit Sinha","Mohammad Amini","Janarthanan Rajendran","Aditya Mahajan","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2302.02792v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.02232v3","updated":"2023-08-17T19:30:13Z","published":"2022-05-04T13:19:04Z","title":"Experimental Design for Causal Effect Identification","summary":" Pearl's do calculus is a complete axiomatic approach to learn the\nidentifiable causal effects from observational data. When such an effect is not\nidentifiable, it is necessary to perform a collection of often costly\ninterventions in the system to learn the causal effect. In this work, we\nconsider the problem of designing the collection of interventions with the\nminimum cost to identify the desired effect. First, we prove that this problem\nis NP-hard, and subsequently propose an algorithm that can either find the\noptimal solution or a logarithmic-factor approximation of it. This is done by\nestablishing a connection between our problem and the minimum hitting set\nproblem. Additionally, we propose several polynomial-time heuristic algorithms\nto tackle the computational complexity of the problem. Although these\nalgorithms could potentially stumble on sub-optimal solutions, our simulations\nshow that they achieve small regrets on random graphs.\n","authors":["Sina Akbari","Jalal Etesami","Negar Kiyavash"],"pdf_url":"https://arxiv.org/pdf/2205.02232v3.pdf","comment":"53 pages, 13 figures, extending the findings of our ICML2022 paper"},{"id":"http://arxiv.org/abs/2308.09160v1","updated":"2023-08-17T19:22:30Z","published":"2023-08-17T19:22:30Z","title":"FedPerfix: Towards Partial Model Personalization of Vision Transformers\n in Federated Learning","summary":" Personalized Federated Learning (PFL) represents a promising solution for\ndecentralized learning in heterogeneous data environments. 
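The multi-timescale MARL abstract above has all agents learn concurrently but at different learning rates, so slower learners face a nearly stationary environment. A toy illustration of that idea on a 2x2 coordination game with exact policy gradients (a simplified stand-in, not the paper's deep MARL algorithm):

import numpy as np

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

theta = np.array([0.3, -0.2])        # logits of P(action=1) for agents 1 and 2
lrs = np.array([0.5, 0.05])          # fast and slow timescales

for step in range(2000):
    p = sigmoid(theta)
    # Reward is 1 when actions match: J = p1*p2 + (1-p1)*(1-p2).
    grad = np.array([(2 * p[1] - 1) * p[0] * (1 - p[0]),
                     (2 * p[0] - 1) * p[1] * (1 - p[1])])
    theta += lrs * grad              # simultaneous updates, different rates

p = sigmoid(theta)
print("P(action=1):", np.round(p, 3), "expected reward:",
      round(float(p[0] * p[1] + (1 - p[0]) * (1 - p[1])), 3))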
Partial model\npersonalization has been proposed to improve the efficiency of PFL by\nselectively updating local model parameters instead of aggregating all of them.\nHowever, previous work on partial model personalization has mainly focused on\nConvolutional Neural Networks (CNNs), leaving a gap in understanding how it can\nbe applied to other popular models such as Vision Transformers (ViTs). In this\nwork, we investigate where and how to partially personalize a ViT model.\nSpecifically, we empirically evaluate the sensitivity to data distribution of\neach type of layer. Based on the insights that the self-attention layer and the\nclassification head are the most sensitive parts of a ViT, we propose a novel\napproach called FedPerfix, which leverages plugins to transfer information from\nthe aggregated model to the local client as a personalization. Finally, we\nevaluate the proposed approach on CIFAR-100, OrganAMNIST, and Office-Home\ndatasets and demonstrate its effectiveness in improving the model's performance\ncompared to several advanced PFL methods.\n","authors":["Guangyu Sun","Matias Mendieta","Jun Luo","Shandong Wu","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2308.09160v1.pdf","comment":"2023 IEEE/CVF International Conference on Computer Vision (ICCV)"},{"id":"http://arxiv.org/abs/2308.09158v1","updated":"2023-08-17T19:12:13Z","published":"2023-08-17T19:12:13Z","title":"ZhiJian: A Unifying and Rapidly Deployable Toolbox for Pre-trained Model\n Reuse","summary":" The rapid expansion of foundation pre-trained models and their fine-tuned\ncounterparts has significantly contributed to the advancement of machine\nlearning. Leveraging pre-trained models to extract knowledge and expedite\nlearning in real-world tasks, known as \"Model Reuse\", has become crucial in\nvarious applications. Previous research focuses on reusing models within a\ncertain aspect, including reusing model weights, structures, and hypothesis\nspaces. This paper introduces ZhiJian, a comprehensive and user-friendly\ntoolbox for model reuse, utilizing the PyTorch backend. ZhiJian presents a\nnovel paradigm that unifies diverse perspectives on model reuse, encompassing\ntarget architecture construction with PTM, tuning target model with PTM, and\nPTM-based inference. This empowers deep learning practitioners to explore\ndownstream tasks and identify the complementary advantages among different\nmethods. ZhiJian is readily accessible at\nhttps://github.com/zhangyikaii/lamda-zhijian facilitating seamless utilization\nof pre-trained models and streamlining the model reuse process for researchers\nand developers.\n","authors":["Yi-Kai Zhang","Lu Ren","Chao Yi","Qi-Wei Wang","De-Chuan Zhan","Han-Jia Ye"],"pdf_url":"https://arxiv.org/pdf/2308.09158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16633v2","updated":"2023-08-17T18:44:16Z","published":"2023-03-29T12:43:36Z","title":"Targeted Adversarial Attacks on Wind Power Forecasts","summary":" In recent years, researchers proposed a variety of deep learning models for\nwind power forecasting. These models predict the wind power generation of wind\nfarms or entire regions more accurately than traditional machine learning\nalgorithms or physical models. However, latest research has shown that deep\nlearning models can often be manipulated by adversarial attacks. Since wind\npower forecasts are essential for the stability of modern power systems, it is\nimportant to protect them from this threat. 
In this work, we investigate the\nvulnerability of two different forecasting models to targeted, semi-targeted,\nand untargeted adversarial attacks. We consider a Long Short-Term Memory (LSTM)\nnetwork for predicting the power generation of individual wind farms and a\nConvolutional Neural Network (CNN) for forecasting the wind power generation\nthroughout Germany. Moreover, we propose the Total Adversarial Robustness Score\n(TARS), an evaluation metric for quantifying the robustness of regression\nmodels to targeted and semi-targeted adversarial attacks. It assesses the\nimpact of attacks on the model's performance, as well as the extent to which\nthe attacker's goal was achieved, by assigning a score between 0 (very\nvulnerable) and 1 (very robust). In our experiments, the LSTM forecasting model\nwas fairly robust and achieved a TARS value of over 0.78 for all adversarial\nattacks investigated. The CNN forecasting model only achieved TARS values below\n0.10 when trained ordinarily, and was thus very vulnerable. Yet, its robustness\ncould be significantly improved by adversarial training, which always resulted\nin a TARS above 0.46.\n","authors":["René Heinrich","Christoph Scholz","Stephan Vogt","Malte Lehna"],"pdf_url":"https://arxiv.org/pdf/2303.16633v2.pdf","comment":"21 pages, including appendix, 12 figures"},{"id":"http://arxiv.org/abs/2302.13849v3","updated":"2023-08-17T18:35:27Z","published":"2023-02-27T14:50:34Z","title":"Optimal Prediction Using Expert Advice and Randomized Littlestone\n Dimension","summary":" A classical result in online learning characterizes the optimal mistake bound\nachievable by deterministic learners using the Littlestone dimension\n(Littlestone '88). We prove an analogous result for randomized learners: we\nshow that the optimal expected mistake bound in learning a class $\\mathcal{H}$\nequals its randomized Littlestone dimension, which is the largest $d$ for which\nthere exists a tree shattered by $\\mathcal{H}$ whose average depth is $2d$. We\nfurther study optimal mistake bounds in the agnostic case, as a function of the\nnumber of mistakes made by the best function in $\\mathcal{H}$, denoted by $k$.\nWe show that the optimal randomized mistake bound for learning a class with\nLittlestone dimension $d$ is $k + \\Theta (\\sqrt{k d} + d )$. This also implies\nan optimal deterministic mistake bound of $2k + \\Theta(d) + O(\\sqrt{k d})$,\nthus resolving an open question which was studied by Auer and Long ['99].\n As an application of our theory, we revisit the classical problem of\nprediction using expert advice: about 30 years ago Cesa-Bianchi, Freund,\nHaussler, Helmbold, Schapire and Warmuth studied prediction using expert\nadvice, provided that the best among the $n$ experts makes at most $k$\nmistakes, and asked what are the optimal mistake bounds. Cesa-Bianchi, Freund,\nHelmbold, and Warmuth ['93, '96] provided a nearly optimal bound for\ndeterministic learners, and left the randomized case as an open problem. We\nresolve this question by providing an optimal learning rule in the randomized\ncase, and showing that its expected mistake bound equals half of the\ndeterministic bound of Cesa-Bianchi et al. ['93,'96], up to negligible additive\nterms. 
In contrast with previous works by Abernethy, Langford, and Warmuth\n['06], and by Br\\^anzei and Peres ['19], our result applies to all pairs $n,k$.\n","authors":["Yuval Filmus","Steve Hanneke","Idan Mehalel","Shay Moran"],"pdf_url":"https://arxiv.org/pdf/2302.13849v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09142v1","updated":"2023-08-17T18:22:19Z","published":"2023-08-17T18:22:19Z","title":"Accurate machine learning force fields via experimental and simulation\n data fusion","summary":" Machine Learning (ML)-based force fields are attracting ever-increasing\ninterest due to their capacity to span spatiotemporal scales of classical\ninteratomic potentials at quantum-level accuracy. They can be trained based on\nhigh-fidelity simulations or experiments, the former being the common case.\nHowever, both approaches are impaired by scarce and erroneous data resulting in\nmodels that either do not agree with well-known experimental observations or\nare under-constrained and only reproduce some properties. Here we leverage both\nDensity Functional Theory (DFT) calculations and experimentally measured\nmechanical properties and lattice parameters to train an ML potential of\ntitanium. We demonstrate that the fused data learning strategy can concurrently\nsatisfy all target objectives, thus resulting in a molecular model of higher\naccuracy compared to the models trained with a single data source. The\ninaccuracies of DFT functionals at target experimental properties were\ncorrected, while the investigated off-target properties remained largely\nunperturbed. Our approach is applicable to any material and can serve as a\ngeneral strategy to obtain highly accurate ML potentials.\n","authors":["Sebastien Röcken","Julija Zavadlav"],"pdf_url":"https://arxiv.org/pdf/2308.09142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09122v1","updated":"2023-08-17T17:57:59Z","published":"2023-08-17T17:57:59Z","title":"RTB Formulation Using Point Process","summary":" We propose a general stochastic framework for modelling repeated auctions in\nthe Real Time Bidding (RTB) ecosystem using point processes. The flexibility of\nthe framework allows a variety of auction scenarios including configuration of\ninformation provided to player, determination of auction winner and\nquantification of utility gained from each auctions. We propose theoretical\nresults on how this formulation of process can be approximated to a Poisson\npoint process, which enables the analyzer to take advantage of well-established\nproperties. Under this framework, we specify the player's optimal strategy\nunder various scenarios. We also emphasize that it is critical to consider the\njoint distribution of utility and market condition instead of estimating the\nmarginal distributions independently.\n","authors":["Seong Jin Lee","Bumsik Kim"],"pdf_url":"https://arxiv.org/pdf/2308.09122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09113v1","updated":"2023-08-17T17:44:59Z","published":"2023-08-17T17:44:59Z","title":"Multi-fidelity Fourier Neural Operator for Fast Modeling of Large-Scale\n Geological Carbon Storage","summary":" Deep learning-based surrogate models have been widely applied in geological\ncarbon storage (GCS) problems to accelerate the prediction of reservoir\npressure and CO2 plume migration. Large amounts of data from physics-based\nnumerical simulators are required to train a model to accurately predict the\ncomplex physical behaviors associated with this process. 
In practice, the\navailable training data are always limited in large-scale 3D problems due to\nthe high computational cost. Therefore, we propose to use a multi-fidelity\nFourier Neural Operator to solve large-scale GCS problems with more affordable\nmulti-fidelity training datasets. The Fourier Neural Operator has a desirable\ngrid-invariant property, which simplifies the transfer learning procedure\nbetween datasets with different discretization. We first test the model\nefficacy on a GCS reservoir model being discretized into 110k grid cells. The\nmulti-fidelity model can predict with accuracy comparable to a high-fidelity\nmodel trained with the same amount of high-fidelity data with 81% less data\ngeneration costs. We further test the generalizability of the multi-fidelity\nmodel on a same reservoir model with a finer discretization of 1 million grid\ncells. This case was made more challenging by employing high-fidelity and\nlow-fidelity datasets generated by different geostatistical models and\nreservoir simulators. We observe that the multi-fidelity FNO model can predict\npressure fields with reasonable accuracy even when the high-fidelity data are\nextremely limited.\n","authors":["Hewei Tang1","Qingkai Kong1","Joseph P. Morris1"],"pdf_url":"https://arxiv.org/pdf/2308.09113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11727v3","updated":"2023-08-17T17:34:29Z","published":"2022-11-21T18:47:11Z","title":"Parametric Classification for Generalized Category Discovery: A Baseline\n Study","summary":" Generalized Category Discovery (GCD) aims to discover novel categories in\nunlabelled datasets using knowledge learned from labelled samples. Previous\nstudies argued that parametric classifiers are prone to overfitting to seen\ncategories, and endorsed using a non-parametric classifier formed with\nsemi-supervised k-means. However, in this study, we investigate the failure of\nparametric classifiers, verify the effectiveness of previous design choices\nwhen high-quality supervision is available, and identify unreliable\npseudo-labels as a key problem. We demonstrate that two prediction biases\nexist: the classifier tends to predict seen classes more often, and produces an\nimbalanced distribution across seen and novel categories. Based on these\nfindings, we propose a simple yet effective parametric classification method\nthat benefits from entropy regularisation, achieves state-of-the-art\nperformance on multiple GCD benchmarks and shows strong robustness to unknown\nclass numbers. We hope the investigation and proposed simple framework can\nserve as a strong baseline to facilitate future studies in this field. Our code\nis available at: https://github.com/CVMI-Lab/SimGCD.\n","authors":["Xin Wen","Bingchen Zhao","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2211.11727v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2304.13455v3","updated":"2023-08-17T17:23:08Z","published":"2023-04-26T11:27:34Z","title":"From Chaos Comes Order: Ordering Event Representations for Object\n Recognition and Detection","summary":" Today, state-of-the-art deep neural networks that process events first\nconvert them into dense, grid-like input representations before using an\noff-the-shelf network. However, selecting the appropriate representation for\nthe task traditionally requires training a neural network for each\nrepresentation and selecting the best one based on the validation score, which\nis very time-consuming. 
This work eliminates this bottleneck by selecting\nrepresentations based on the Gromov-Wasserstein Discrepancy (GWD) between raw\nevents and their representation. It is about 200 times faster to compute than\ntraining a neural network and preserves the task performance ranking of event\nrepresentations across multiple representations, network backbones, datasets,\nand tasks. Thus finding representations with high task scores is equivalent to\nfinding representations with a low GWD. We use this insight to, for the first\ntime, perform a hyperparameter search on a large family of event\nrepresentations, revealing new and powerful representations that exceed the\nstate-of-the-art. Our optimized representations outperform existing\nrepresentations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the Gen1\ndataset, two established object detection benchmarks, and reach a 3.8% higher\nclassification score on the mini N-ImageNet benchmark. Moreover, we outperform\nstate-of-the-art by 2.1 mAP on Gen1 and state-of-the-art feed-forward methods\nby 6.0 mAP on the 1 Mpx datasets. This work opens a new unexplored field of\nexplicit representation optimization for event-based learning.\n","authors":["Nikola Zubić","Daniel Gehrig","Mathias Gehrig","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2304.13455v3.pdf","comment":"15 pages, 11 figures, 2 tables, ICCV 2023 Camera Ready paper"},{"id":"http://arxiv.org/abs/2304.01752v3","updated":"2023-08-17T17:22:41Z","published":"2023-04-04T12:42:29Z","title":"Black Box Few-Shot Adaptation for Vision-Language models","summary":" Vision-Language (V-L) models trained with contrastive learning to align the\nvisual and language modalities have been shown to be strong few-shot learners.\nSoft prompt learning is the method of choice for few-shot downstream adaptation\naiming to bridge the modality gap caused by the distribution shift induced by\nthe new domain. While parameter-efficient, prompt learning still requires\naccess to the model weights and can be computationally infeasible for large\nmodels with billions of parameters. To address these shortcomings, in this\nwork, we describe a black-box method for V-L few-shot adaptation that (a)\noperates on pre-computed image and text features and hence works without access\nto the model's weights, (b) it is orders of magnitude faster at training time,\n(c) it is amenable to both supervised and unsupervised training, and (d) it can\nbe even used to align image and text features computed from uni-modal models.\nTo achieve this, we propose Linear Feature Alignment (LFA), a simple linear\napproach for V-L re-alignment in the target domain. LFA is initialized from a\nclosed-form solution to a least-squares problem and then it is iteratively\nupdated by minimizing a re-ranking loss. Despite its simplicity, our approach\ncan even surpass soft-prompt learning methods as shown by extensive experiments\non 11 image and 2 video datasets.\n","authors":["Yassine Ouali","Adrian Bulat","Brais Martinez","Georgios Tzimiropoulos"],"pdf_url":"https://arxiv.org/pdf/2304.01752v3.pdf","comment":"Published at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.09105v1","updated":"2023-08-17T17:17:08Z","published":"2023-08-17T17:17:08Z","title":"Learning Lightweight Object Detectors via Multi-Teacher Progressive\n Distillation","summary":" Resource-constrained perception systems such as edge computing and\nvision-for-robotics require vision models to be both accurate and lightweight\nin computation and memory usage. 
While knowledge distillation is a proven\nstrategy to enhance the performance of lightweight classification models, its\napplication to structured outputs like object detection and instance\nsegmentation remains a complicated task, due to the variability in outputs and\ncomplex internal network modules involved in the distillation process. In this\npaper, we propose a simple yet surprisingly effective sequential approach to\nknowledge distillation that progressively transfers the knowledge of a set of\nteacher detectors to a given lightweight student. To distill knowledge from a\nhighly accurate but complex teacher model, we construct a sequence of teachers\nto help the student gradually adapt. Our progressive strategy can be easily\ncombined with existing detection distillation mechanisms to consistently\nmaximize student performance in various settings. To the best of our knowledge,\nwe are the first to successfully distill knowledge from Transformer-based\nteacher detectors to convolution-based students, and unprecedentedly boost the\nperformance of ResNet-50 based RetinaNet from 36.5% to 42.0% AP and Mask R-CNN\nfrom 38.2% to 42.5% AP on the MS COCO benchmark.\n","authors":["Shengcao Cao","Mengtian Li","James Hays","Deva Ramanan","Yi-Xiong Wang","Liang-Yan Gui"],"pdf_url":"https://arxiv.org/pdf/2308.09105v1.pdf","comment":"ICML 2023"},{"id":"http://arxiv.org/abs/2308.09104v1","updated":"2023-08-17T17:14:18Z","published":"2023-08-17T17:14:18Z","title":"A comprehensive study of spike and slab shrinkage priors for\n structurally sparse Bayesian neural networks","summary":" Network complexity and computational efficiency have become increasingly\nsignificant aspects of deep learning. Sparse deep learning addresses these\nchallenges by recovering a sparse representation of the underlying target\nfunction by reducing heavily over-parameterized deep neural networks.\nSpecifically, deep neural architectures compressed via structured sparsity\n(e.g. node sparsity) provide low latency inference, higher data throughput, and\nreduced energy consumption. In this paper, we explore two well-established\nshrinkage techniques, Lasso and Horseshoe, for model compression in Bayesian\nneural networks. To this end, we propose structurally sparse Bayesian neural\nnetworks which systematically prune excessive nodes with (i) Spike-and-Slab\nGroup Lasso (SS-GL), and (ii) Spike-and-Slab Group Horseshoe (SS-GHS) priors,\nand develop computationally tractable variational inference including\ncontinuous relaxation of Bernoulli variables. We establish the contraction\nrates of the variational posterior of our proposed models as a function of the\nnetwork topology, layer-wise node cardinalities, and bounds on the network\nweights. We empirically demonstrate the competitive performance of our models\ncompared to the baseline models in prediction accuracy, model compression, and\ninference latency.\n","authors":["Sanket Jantre","Shrijita Bhattacharya","Tapabrata Maiti"],"pdf_url":"https://arxiv.org/pdf/2308.09104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16781v2","updated":"2023-08-17T16:41:03Z","published":"2023-03-29T15:17:05Z","title":"GRAF: Graph Attention-aware Fusion Networks","summary":" A large number of real-world networks include multiple types of nodes and\nedges. Graph Neural Network (GNN) emerged as a deep learning framework to\ngenerate node and graph embeddings for downstream machine learning tasks.\nHowever, popular GNN-based architectures operate on single homogeneous\nnetworks. 
Enabling them to work on multiple networks brings additional\nchallenges due to the heterogeneity of the networks and the multiplicity of the\nexisting associations. In this study, we present a computational approach named\nGRAF (Graph Attention-aware Fusion Networks) utilizing GNN-based approaches on\nmultiple networks with the help of attention mechanisms and network fusion.\nUsing attention-based neighborhood aggregation, GRAF learns the importance of\neach neighbor per node (called node-level attention) followed by the importance\nof association (called association-level attention). Then, GRAF processes a\nnetwork fusion step weighing each edge according to learned node- and\nassociation-level attentions. Considering that the fused network could be a\nhighly dense network with many weak edges depending on the given input\nnetworks, we included an edge elimination step with respect to edges' weights.\nFinally, GRAF utilizes Graph Convolutional Network (GCN) on the fused network\nand incorporates node features on graph-structured data for a node\nclassification or a similar downstream task. To demonstrate GRAF's\ngeneralizability, we applied it to four datasets from different domains and\nobserved that GRAF outperformed or was on par with the baselines,\nstate-of-the-art methods, and its own variations for each node classification\ntask. Source code for our tool is publicly available at\nhttps://github.com/bozdaglab/GRAF .\n","authors":["Ziynet Nesibe Kesimoglu","Serdar Bozdag"],"pdf_url":"https://arxiv.org/pdf/2303.16781v2.pdf","comment":"9 pages, 7 supplemental pages, 1 figure, 6 supplemental figures"},{"id":"http://arxiv.org/abs/2303.17573v4","updated":"2023-08-17T16:38:15Z","published":"2023-03-30T17:40:14Z","title":"Using AI to Measure Parkinson's Disease Severity at Home","summary":" We present an artificial intelligence system to remotely assess the motor\nperformance of individuals with Parkinson's disease (PD). Participants\nperformed a motor task (i.e., tapping fingers) in front of a webcam, and data\nfrom 250 global participants were rated by three expert neurologists following\nthe Movement Disorder Society Unified Parkinson's Disease Rating Scale\n(MDS-UPDRS). The neurologists' ratings were highly reliable, with an\nintra-class correlation coefficient (ICC) of 0.88. We developed computer\nalgorithms to obtain objective measurements that align with the MDS-UPDRS\nguideline and are strongly correlated with the neurologists' ratings. Our\nmachine learning model trained on these measures outperformed an MDS-UPDRS\ncertified rater, with a mean absolute error (MAE) of 0.59 compared to the\nrater's MAE of 0.79. However, the model performed slightly worse than the\nexpert neurologists (0.53 MAE). The methodology can be replicated for similar\nmotor tasks, providing the possibility of evaluating individuals with PD and\nother movement disorders remotely, objectively, and in areas with limited\naccess to neurological care.\n","authors":["Md Saiful Islam","Wasifur Rahman","Abdelrahman Abdelkader","Phillip T. Yang","Sangwu Lee","Jamie L. Adams","Ruth B. Schneider","E. 
Ray Dorsey","Ehsan Hoque"],"pdf_url":"https://arxiv.org/pdf/2303.17573v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09087v1","updated":"2023-08-17T16:29:17Z","published":"2023-08-17T16:29:17Z","title":"Modeling Edge Features with Deep Bayesian Graph Networks","summary":" We propose an extension of the Contextual Graph Markov Model, a deep and\nprobabilistic machine learning model for graphs, to model the distribution of\nedge features. Our approach is architectural, as we introduce an additional\nBayesian network mapping edge features into discrete states to be used by the\noriginal model. In doing so, we are also able to build richer graph\nrepresentations even in the absence of edge features, which is confirmed by the\nperformance improvements on standard graph classification benchmarks. Moreover,\nwe successfully test our proposal in a graph regression scenario where edge\nfeatures are of fundamental importance, and we show that the learned edge\nrepresentation provides substantial performance improvements against the\noriginal model on three link prediction tasks. By keeping the computational\ncomplexity linear in the number of edges, the proposed model is amenable to\nlarge-scale graph processing.\n","authors":["Daniele Atzeni","Federico Errica","Davide Bacciu","Alessio Micheli"],"pdf_url":"https://arxiv.org/pdf/2308.09087v1.pdf","comment":"Releasing pre-print version to comply with TAILOR project\n requirements"},{"id":"http://arxiv.org/abs/2308.09086v1","updated":"2023-08-17T16:26:58Z","published":"2023-08-17T16:26:58Z","title":"Embracing assay heterogeneity with neural processes for markedly\n improved bioactivity predictions","summary":" Predicting the bioactivity of a ligand is one of the hardest and most\nimportant challenges in computer-aided drug discovery. Despite years of data\ncollection and curation efforts by research organizations worldwide,\nbioactivity data remains sparse and heterogeneous, thus hampering efforts to\nbuild predictive models that are accurate, transferable and robust. The\nintrinsic variability of the experimental data is further compounded by data\naggregation practices that neglect heterogeneity to overcome sparsity. Here we\ndiscuss the limitations of these practices and present a hierarchical\nmeta-learning framework that exploits the information synergy across disparate\nassays by successfully accounting for assay heterogeneity. We show that the\nmodel achieves a drastic improvement in affinity prediction across diverse\nprotein targets and assay types compared to conventional baselines. It can\nquickly adapt to new target contexts using very few observations, thus enabling\nlarge-scale virtual screening in early-phase drug discovery.\n","authors":["Lucian Chan","Marcel Verdonk","Carl Poelking"],"pdf_url":"https://arxiv.org/pdf/2308.09086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09084v1","updated":"2023-08-17T16:23:52Z","published":"2023-08-17T16:23:52Z","title":"MovePose: A High-performance Human Pose Estimation Algorithm on Mobile\n and Edge Devices","summary":" We present MovePose, an optimized lightweight convolutional neural network\ndesigned specifically for real-time body pose estimation on CPU-based mobile\ndevices. The current solutions do not provide satisfactory accuracy and speed\nfor human posture estimation, and MovePose addresses this gap. It aims to\nmaintain real-time performance while improving the accuracy of human posture\nestimation for mobile devices. 
The network produces 17 keypoints for each\nindividual at a rate exceeding 11 frames per second, making it suitable for\nreal-time applications such as fitness tracking, sign language interpretation,\nand advanced mobile human posture estimation. Our MovePose algorithm has\nattained an Mean Average Precision (mAP) score of 67.7 on the COCO\n\\cite{cocodata} validation dataset. The MovePose algorithm displayed efficiency\nwith a performance of 69+ frames per second (fps) when run on an Intel\ni9-10920x CPU. Additionally, it showcased an increased performance of 452+ fps\non an NVIDIA RTX3090 GPU. On an Android phone equipped with a Snapdragon 8 + 4G\nprocessor, the fps reached above 11. To enhance accuracy, we incorporated three\ntechniques: deconvolution, large kernel convolution, and coordinate\nclassification methods. Compared to basic upsampling, deconvolution is\ntrainable, improves model capacity, and enhances the receptive field. Large\nkernel convolution strengthens these properties at a decreased computational\ncost. In summary, MovePose provides high accuracy and real-time performance,\nmarking it a potential tool for a variety of applications, including those\nfocused on mobile-side human posture estimation. The code and models for this\nalgorithm will be made publicly accessible.\n","authors":["Dongyang Yu","Haoyue Zhang","Zhirui Zhou","Wangpeng An","Yanhong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09082v1","updated":"2023-08-17T16:15:47Z","published":"2023-08-17T16:15:47Z","title":"Over-the-Air Computation Aided Federated Learning with the Aggregation\n of Normalized Gradient","summary":" Over-the-air computation is a communication-efficient solution for federated\nlearning (FL). In such a system, iterative procedure is performed: Local\ngradient of private loss function is updated, amplified and then transmitted by\nevery mobile device; the server receives the aggregated gradient all-at-once,\ngenerates and then broadcasts updated model parameters to every mobile device.\nIn terms of amplification factor selection, most related works suppose the\nlocal gradient's maximal norm always happens although it actually fluctuates\nover iterations, which may degrade convergence performance. To circumvent this\nproblem, we propose to turn local gradient to be normalized one before\namplifying it. Under our proposed method, when the loss function is smooth, we\nprove our proposed method can converge to stationary point at sub-linear rate.\nIn case of smooth and strongly convex loss function, we prove our proposed\nmethod can achieve minimal training loss at linear rate with any small positive\ntolerance. Moreover, a tradeoff between convergence rate and the tolerance is\ndiscovered. To speedup convergence, problems optimizing system parameters are\nalso formulated for above two cases. Although being non-convex, optimal\nsolution with polynomial complexity of the formulated problems are derived.\nExperimental results show our proposed method can outperform benchmark methods\non convergence performance.\n","authors":["Rongfei Fan","Xuming An","Shiyuan Zuo","Han Hu"],"pdf_url":"https://arxiv.org/pdf/2308.09082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.12254v3","updated":"2023-08-17T16:15:28Z","published":"2023-02-23T18:59:56Z","title":"Change is Hard: A Closer Look at Subpopulation Shift","summary":" Machine learning models often perform poorly on subgroups that are\nunderrepresented in the training data. 
Yet, little is understood on the\nvariation in mechanisms that cause subpopulation shifts, and how algorithms\ngeneralize across such diverse shifts at scale. In this work, we provide a\nfine-grained analysis of subpopulation shift. We first propose a unified\nframework that dissects and explains common shifts in subgroups. We then\nestablish a comprehensive benchmark of 20 state-of-the-art algorithms evaluated\non 12 real-world datasets in vision, language, and healthcare domains. With\nresults obtained from training over 10,000 models, we reveal intriguing\nobservations for future progress in this space. First, existing algorithms only\nimprove subgroup robustness over certain types of shifts but not others.\nMoreover, while current algorithms rely on group-annotated validation data for\nmodel selection, we find that a simple selection criterion based on worst-class\naccuracy is surprisingly effective even without any group information. Finally,\nunlike existing works that solely aim to improve worst-group accuracy (WGA), we\ndemonstrate the fundamental tradeoff between WGA and other important metrics,\nhighlighting the need to carefully choose testing metrics. Code and data are\navailable at: https://github.com/YyzHarry/SubpopBench.\n","authors":["Yuzhe Yang","Haoran Zhang","Dina Katabi","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2302.12254v3.pdf","comment":"ICML 2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.09089v1","updated":"2023-08-17T16:38:30Z","published":"2023-08-17T16:38:30Z","title":"Bridging High-Quality Audio and Video via Language for Sound Effects\n Retrieval from Visual Queries","summary":" Finding the right sound effects (SFX) to match moments in a video is a\ndifficult and time-consuming task, and relies heavily on the quality and\ncompleteness of text metadata. Retrieving high-quality (HQ) SFX using a video\nframe directly as the query is an attractive alternative, removing the reliance\non text metadata and providing a low barrier to entry for non-experts. Due to\nthe lack of HQ audio-visual training data, previous work on audio-visual\nretrieval relies on YouTube (in-the-wild) videos of varied quality for\ntraining, where the audio is often noisy and the video of amateur quality. As\nsuch it is unclear whether these systems would generalize to the task of\nmatching HQ audio to production-quality video. To address this, we propose a\nmultimodal framework for recommending HQ SFX given a video frame by (1)\nleveraging large language models and foundational vision-language models to\nbridge HQ audio and video to create audio-visual pairs, resulting in a highly\nscalable automatic audio-visual data curation pipeline; and (2) using\npre-trained audio and visual encoders to train a contrastive learning-based\nretrieval system. We show that our system, trained using our automatic data\ncuration pipeline, significantly outperforms baselines trained on in-the-wild\ndata on the task of HQ SFX retrieval for video. Furthermore, while the\nbaselines fail to generalize to this task, our system generalizes well from\nclean to in-the-wild data, outperforming the baselines on a dataset of YouTube\nvideos despite only being trained on the HQ audio-visual pairs. A user study\nconfirms that people prefer SFX retrieved by our system over the baseline 67%\nof the time both for HQ and in-the-wild data. Finally, we present ablations to\ndetermine the impact of model and data pipeline design choices on downstream\nretrieval performance. 
Please visit our project website to listen to and view\nour SFX retrieval results.\n","authors":["Julia Wilkins","Justin Salamon","Magdalena Fuentes","Juan Pablo Bello","Oriol Nieto"],"pdf_url":"https://arxiv.org/pdf/2308.09089v1.pdf","comment":"WASPAA 2023. Project page:\n https://juliawilkins.github.io/sound-effects-retrieval-from-video/. 4 pages,\n 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2307.16508v2","updated":"2023-08-17T12:10:15Z","published":"2023-07-31T09:10:10Z","title":"Towards General Low-Light Raw Noise Synthesis and Modeling","summary":" Modeling and synthesizing low-light raw noise is a fundamental problem for\ncomputational photography and image processing applications. Although most\nrecent works have adopted physics-based models to synthesize noise, the\nsignal-independent noise in low-light conditions is far more complicated and\nvaries dramatically across camera sensors, which is beyond the description of\nthese models. To address this issue, we introduce a new perspective to\nsynthesize the signal-independent noise by a generative model. Specifically, we\nsynthesize the signal-dependent and signal-independent noise in a physics- and\nlearning-based manner, respectively. In this way, our method can be considered\nas a general model, that is, it can simultaneously learn different noise\ncharacteristics for different ISO levels and generalize to various sensors.\nSubsequently, we present an effective multi-scale discriminator termed Fourier\ntransformer discriminator (FTD) to distinguish the noise distribution\naccurately. Additionally, we collect a new low-light raw denoising (LRD)\ndataset for training and benchmarking. Qualitative validation shows that the\nnoise generated by our proposed noise model can be highly similar to the real\nnoise in terms of distribution. Furthermore, extensive denoising experiments\ndemonstrate that our method performs favorably against state-of-the-art methods\non different sensors.\n","authors":["Feng Zhang","Bin Xu","Zhiqiang Li","Xinran Liu","Qingbo Lu","Changxin Gao","Nong Sang"],"pdf_url":"https://arxiv.org/pdf/2307.16508v2.pdf","comment":"11 pages, 7 figures. Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2305.12986v2","updated":"2023-08-17T11:43:00Z","published":"2023-05-22T12:46:59Z","title":"Sparsity and Coefficient Permutation Based Two-Domain AMP for Image\n Block Compressed Sensing","summary":" The learned denoising-based approximate message passing (LDAMP) algorithm has\nattracted great attention for image compressed sensing (CS) tasks. However, it\nhas two issues: first, its global measurement model severely restricts its\napplicability to high-dimensional images, and its block-based measurement\nmethod exhibits obvious block artifacts; second, the denoiser in the LDAMP is\ntoo simple, and existing denoisers have limited ability in detail recovery. In\nthis paper, to overcome the issues and develop a high-performance LDAMP method\nfor image block compressed sensing (BCS), we propose a novel sparsity and\ncoefficient permutation-based AMP (SCP-AMP) method consisting of the\nblock-based sampling and the two-domain reconstruction modules. In the sampling\nmodule, SCP-AMP adopts a discrete cosine transform (DCT) based sparsity\nstrategy to reduce the impact of the high-frequency coefficient on the\nreconstruction, followed by a coefficient permutation strategy to avoid block\nartifacts. 
In the reconstruction module, a two-domain AMP method with DCT\ndomain noise correction and pixel domain denoising is proposed for iterative\nreconstruction. Regarding the denoiser, we proposed a multi-level deep\nattention network (MDANet) to enhance the texture details by employing\nmulti-level features and multiple attention mechanisms. Extensive experiments\ndemonstrated that the proposed SCP-AMP method achieved better reconstruction\naccuracy than other state-of-the-art BCS algorithms in terms of both visual\nperception and objective metrics.\n","authors":["Junhui Li","Xingsong Hou","Huake Wang","Shuhao Bi"],"pdf_url":"https://arxiv.org/pdf/2305.12986v2.pdf","comment":"The content modification has been upgraded and corrected on a large\n scale, and request to withdraw this version"},{"id":"http://arxiv.org/abs/2308.04126v2","updated":"2023-08-17T09:25:22Z","published":"2023-08-08T08:30:16Z","title":"OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion\n and Infinite Data Generation","summary":" This paper presents OmniDataComposer, an innovative approach for multimodal\ndata fusion and unlimited data generation with an intent to refine and\nuncomplicate interplay among diverse data modalities. Coming to the core\nbreakthrough, it introduces a cohesive data structure proficient in processing\nand merging multimodal data inputs, which include video, audio, and text.\n Our crafted algorithm leverages advancements across multiple operations such\nas video/image caption extraction, dense caption extraction, Automatic Speech\nRecognition (ASR), Optical Character Recognition (OCR), Recognize Anything\nModel(RAM), and object tracking. OmniDataComposer is capable of identifying\nover 6400 categories of objects, substantially broadening the spectrum of\nvisual information. It amalgamates these diverse modalities, promoting\nreciprocal enhancement among modalities and facilitating cross-modal data\ncorrection. \\textbf{The final output metamorphoses each video input into an\nelaborate sequential document}, virtually transmuting videos into thorough\nnarratives, making them easier to be processed by large language models.\n Future prospects include optimizing datasets for each modality to encourage\nunlimited data generation. This robust base will offer priceless insights to\nmodels like ChatGPT, enabling them to create higher quality datasets for video\ncaptioning and easing question-answering tasks based on video content.\nOmniDataComposer inaugurates a new stage in multimodal learning, imparting\nenormous potential for augmenting AI's understanding and generation of complex,\nreal-world data.\n","authors":["Dongyang Yu","Shihao Wang","Yuan Fang","Wangpeng An"],"pdf_url":"https://arxiv.org/pdf/2308.04126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.06338v2","updated":"2023-08-17T07:26:52Z","published":"2022-02-13T14:58:11Z","title":"DEEPCHORUS: A Hybrid Model of Multi-scale Convolution and Self-attention\n for Chorus Detection","summary":" Chorus detection is a challenging problem in musical signal processing as the\nchorus often repeats more than once in popular songs, usually with rich\ninstruments and complex rhythm forms. Most of the existing works focus on the\nreceptiveness of chorus sections based on some explicit features such as\nloudness and occurrence frequency. These pre-assumptions for chorus limit the\ngeneralization capacity of these methods, causing misdetection on other\nrepeated sections such as verse. 
To solve the problem, in this paper we propose\nan end-to-end chorus detection model DeepChorus, reducing the engineering\neffort and the need for prior knowledge. The proposed model includes two main\nstructures: i) a Multi-Scale Network to derive preliminary representations of\nchorus segments, and ii) a Self-Attention Convolution Network to further\nprocess the features into probability curves representing chorus presence. To\nobtain the final results, we apply an adaptive threshold to binarize the\noriginal curve. The experimental results show that DeepChorus outperforms\nexisting state-of-the-art methods in most cases.\n","authors":["Qiqi He","Xiaoheng Sun","Yi Yu","Wei Li"],"pdf_url":"https://arxiv.org/pdf/2202.06338v2.pdf","comment":"Accepted by ICASSP 2022"},{"id":"http://arxiv.org/abs/2308.08723v1","updated":"2023-08-17T01:34:51Z","published":"2023-08-17T01:34:51Z","title":"Dynamic Kernel-Based Adaptive Spatial Aggregation for Learned Image\n Compression","summary":" Learned image compression methods have shown superior rate-distortion\nperformance and remarkable potential compared to traditional compression\nmethods. Most existing learned approaches use stacked convolution or\nwindow-based self-attention for transform coding, which aggregate spatial\ninformation in a fixed range. In this paper, we focus on extending spatial\naggregation capability and propose a dynamic kernel-based transform coding. The\nproposed adaptive aggregation generates kernel offsets to capture valid\ninformation in the content-conditioned range to help transform. With the\nadaptive aggregation strategy and the sharing weights mechanism, our method can\nachieve promising transform capability with acceptable model complexity.\nBesides, according to the recent progress of entropy model, we define a\ngeneralized coarse-to-fine entropy model, considering the coarse global\ncontext, the channel-wise, and the spatial context. Based on it, we introduce\ndynamic kernel in hyper-prior to generate more expressive global context.\nFurthermore, we propose an asymmetric spatial-channel entropy model according\nto the investigation of the spatial characteristics of the grouped latents. The\nasymmetric entropy model aims to reduce statistical redundancy while\nmaintaining coding efficiency. Experimental results demonstrate that our method\nachieves superior rate-distortion performance on three benchmarks compared to\nthe state-of-the-art learning-based methods.\n","authors":["Huairui Wang","Nianxiang Fu","Zhenzhong Chen","Shan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.08723v1.pdf","comment":null}]},"2023-08-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.10882v1","updated":"2023-08-21T17:30:16Z","published":"2023-08-21T17:30:16Z","title":"Giraffe: Adventures in Expanding Context Lengths in LLMs","summary":" Modern large language models (LLMs) that rely on attention mechanisms are\ntypically trained with fixed context lengths which enforce upper limits on the\nlength of input sequences that they can handle at evaluation time. To use these\nmodels on sequences longer than the train-time context length, one might employ\ntechniques from the growing family of context length extrapolation methods --\nmost of which focus on modifying the system of positional encodings used in the\nattention mechanism to indicate where tokens or activations are located in the\ninput sequence. 
We conduct a wide survey of existing methods of context length\nextrapolation on a base LLaMA or LLaMA 2 model, and introduce some of our own\ndesign as well -- in particular, a new truncation strategy for modifying the\nbasis for the position encoding.\n We test these methods using three new evaluation tasks (FreeFormQA,\nAlteredNumericQA, and LongChat-Lines) as well as perplexity, which we find to\nbe less fine-grained as a measure of long context performance of LLMs. We\nrelease the three tasks publicly as datasets on HuggingFace. We discover that\nlinear scaling is the best method for extending context length, and show that\nfurther gains can be achieved by using longer scales at evaluation time. We\nalso discover promising extrapolation capabilities in the truncated basis. To\nsupport further research in this area, we release three new 13B parameter\nlong-context models which we call Giraffe: 4k and 16k context models trained\nfrom base LLaMA-13B, and a 32k context model trained from base LLaMA2-13B. We\nalso release the code to replicate our results.\n","authors":["Arka Pal","Deep Karkhanis","Manley Roberts","Samuel Dooley","Arvind Sundararajan","Siddartha Naidu"],"pdf_url":"https://arxiv.org/pdf/2308.10882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10874v1","updated":"2023-08-21T17:21:23Z","published":"2023-08-21T17:21:23Z","title":"Analyzing Transformer Dynamics as Movement through Embedding Space","summary":" Transformer language models exhibit intelligent behaviors such as\nunderstanding natural language, recognizing patterns, acquiring knowledge,\nreasoning, planning, reflecting and using tools. This paper explores how their\nunderlying mechanics give rise to intelligent behaviors. We adopt a systems\napproach to analyze Transformers in detail and develop a mathematical framework\nthat frames their dynamics as movement through embedding space. This novel\nperspective provides a principled way of thinking about the problem and reveals\nimportant insights related to the emergence of intelligence:\n 1. At its core the Transformer is a Embedding Space walker, mapping\nintelligent behavior to trajectories in this vector space.\n 2. At each step of the walk, it composes context into a single composite\nvector whose location in Embedding Space defines the next step.\n 3. No learning actually occurs during decoding; in-context learning and\ngeneralization are simply the result of different contexts composing into\ndifferent vectors.\n 4. Ultimately the knowledge, intelligence and skills exhibited by the model\nare embodied in the organization of vectors in Embedding Space rather than in\nspecific neurons or layers. These abilities are properties of this\norganization.\n 5. Attention's contribution boils down to the association-bias it lends to\nvector composition and which influences the aforementioned organization.\nHowever, more investigation is needed to ascertain its significance.\n 6. The entire model is composed from two principal operations: data\nindependent filtering and data dependent aggregation. This generalization\nunifies Transformers with other sequence models and across modalities.\n Building upon this foundation we formalize and test a semantic space theory\nwhich posits that embedding vectors represent semantic concepts and find some\nevidence of its validity.\n","authors":["Sumeet S. 
Singh"],"pdf_url":"https://arxiv.org/pdf/2308.10874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00948v2","updated":"2023-08-21T16:52:29Z","published":"2023-05-01T17:09:33Z","title":"Large Linguistic Models: Analyzing theoretical linguistic abilities of\n LLMs","summary":" The performance of large language models (LLMs) has recently improved to the\npoint where the models can perform well on many language tasks. We show here\nthat for the first time, the models can also generate coherent and valid formal\nanalyses of linguistic data and illustrate the vast potential of large language\nmodels for analyses of their metalinguistic abilities. LLMs are primarily\ntrained on language data in the form of text; analyzing and evaluating their\nmetalinguistic abilities improves our understanding of their general\ncapabilities and sheds new light on theoretical models in linguistics. In this\npaper, we probe into GPT-4's metalinguistic capabilities by focusing on three\nsubfields of formal linguistics: syntax, phonology, and semantics. We outline a\nresearch program for metalinguistic analyses of large language models, propose\nexperimental designs, provide general guidelines, discuss limitations, and\noffer future directions for this line of research. This line of inquiry also\nexemplifies behavioral interpretability of deep learning, where models'\nrepresentations are accessed by explicit prompting rather than internal\nrepresentations.\n","authors":["Gašper Beguš","Maksymilian Dąbkowski","Ryan Rhodes"],"pdf_url":"https://arxiv.org/pdf/2305.00948v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10855v1","updated":"2023-08-21T16:49:40Z","published":"2023-08-21T16:49:40Z","title":"LatEval: An Interactive LLMs Evaluation Benchmark with Incomplete\n Information from Lateral Thinking Puzzles","summary":" With the continuous evolution and refinement of LLMs, they are endowed with\nimpressive logical reasoning or vertical thinking capabilities. But can they\nthink out of the box? Do they possess proficient lateral thinking abilities?\nFollowing the setup of Lateral Thinking Puzzles, we propose a novel evaluation\nbenchmark, LatEval, which assesses the model's lateral thinking within an\ninteractive framework. In our benchmark, we challenge LLMs with 2 aspects: the\nquality of questions posed by the model and the model's capability to integrate\ninformation for problem-solving. We find that nearly all LLMs struggle with\nemploying lateral thinking during interactions. For example, even the most\nadvanced model, GPT-4, exhibits the advantage to some extent, yet still\nmaintain a noticeable gap when compared to human. This evaluation benchmark\nprovides LLMs with a highly challenging and distinctive task that is crucial to\nan effective AI assistant.\n","authors":["Shulin Huang","Shirong Ma","Yinghui Li","Mengzuo Huang","Wuhe Zou","Weidong Zhang","Hai-Tao Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10855v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.10848v1","updated":"2023-08-21T16:47:11Z","published":"2023-08-21T16:47:11Z","title":"AgentVerse: Facilitating Multi-Agent Collaboration and Exploring\n Emergent Behaviors in Agents","summary":" Autonomous agents empowered by Large Language Models (LLMs) have undergone\nsignificant improvements, enabling them to generalize across a broad spectrum\nof tasks. However, in real-world scenarios, cooperation among individuals is\noften required to enhance the efficiency and effectiveness of task\naccomplishment. 
Hence, inspired by human group dynamics, we propose a\nmulti-agent framework \\framework that can collaboratively and dynamically\nadjust its composition as a greater-than-the-sum-of-its-parts system. Our\nexperiments demonstrate that \\framework framework can effectively deploy\nmulti-agent groups that outperform a single agent. Furthermore, we delve into\nthe emergence of social behaviors among individual agents within a group during\ncollaborative task accomplishment. In view of these behaviors, we discuss some\npossible strategies to leverage positive ones and mitigate negative ones for\nimproving the collaborative potential of multi-agent groups. Our codes for\n\\framework will soon be released at\n\\url{https://github.com/OpenBMB/AgentVerse}.\n","authors":["Weize Chen","Yusheng Su","Jingwei Zuo","Cheng Yang","Chenfei Yuan","Chen Qian","Chi-Min Chan","Yujia Qin","Yaxi Lu","Ruobing Xie","Zhiyuan Liu","Maosong Sun","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.10848v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.10792v1","updated":"2023-08-21T15:35:16Z","published":"2023-08-21T15:35:16Z","title":"Instruction Tuning for Large Language Models: A Survey","summary":" This paper surveys research works in the quickly advancing field of\ninstruction tuning (IT), a crucial technique to enhance the capabilities and\ncontrollability of large language models (LLMs). Instruction tuning refers to\nthe process of further training LLMs on a dataset consisting of\n\\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the\ngap between the next-word prediction objective of LLMs and the users' objective\nof having LLMs adhere to human instructions. In this work, we make a systematic\nreview of the literature, including the general methodology of IT, the\nconstruction of IT datasets, the training of IT models, and applications to\ndifferent modalities, domains and applications, along with an analysis on\naspects that influence the outcome of IT (e.g., generation of instruction\noutputs, size of the instruction dataset, etc). We also review the potential\npitfalls of IT along with criticism against it, along with efforts pointing out\ncurrent deficiencies of existing strategies and suggest some avenues for\nfruitful research.\n","authors":["Shengyu Zhang","Linfeng Dong","Xiaoya Li","Sen Zhang","Xiaofei Sun","Shuhe Wang","Jiwei Li","Runyi Hu","Tianwei Zhang","Fei Wu","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10792v1.pdf","comment":"A Survey paper, Pre-print"},{"id":"http://arxiv.org/abs/2308.10783v1","updated":"2023-08-21T15:19:10Z","published":"2023-08-21T15:19:10Z","title":"Zero- and Few-Shot Prompting with LLMs: A Comparative Study with\n Fine-tuned Models for Bangla Sentiment Analysis","summary":" The rapid expansion of the digital world has propelled sentiment analysis\ninto a critical tool across diverse sectors such as marketing, politics,\ncustomer service, and healthcare. While there have been significant\nadvancements in sentiment analysis for widely spoken languages, low-resource\nlanguages, such as Bangla, remain largely under-researched due to resource\nconstraints. Furthermore, the recent unprecedented performance of Large\nLanguage Models (LLMs) in various applications highlights the need to evaluate\nthem in the context of low-resource languages. In this study, we present a\nsizeable manually annotated dataset encompassing 33,605 Bangla news tweets and\nFacebook comments. 
We also investigate zero- and few-shot in-context learning\nwith several language models, including Flan-T5, GPT-4, and Bloomz, offering a\ncomparative analysis against fine-tuned models. Our findings suggest that\nmonolingual transformer-based models consistently outperform other models, even\nin zero and few-shot scenarios. To foster continued exploration, we intend to\nmake this dataset and our research tools publicly available to the broader\nresearch community. In the spirit of further research, we plan to make this\ndataset and our experimental resources publicly accessible to the wider\nresearch community.\n","authors":["Md. Arid Hasan","Shudipta Das","Afiyat Anjum","Firoj Alam","Anika Anjum","Avijit Sarker","Sheak Rashed Haider Noori"],"pdf_url":"https://arxiv.org/pdf/2308.10783v1.pdf","comment":"Zero-Shot Prompting, Few-Shot Prompting, LLMs, Comparative Study,\n Fine-tuned Models, Bangla, Sentiment Analysis"},{"id":"http://arxiv.org/abs/2306.08107v2","updated":"2023-08-21T15:01:46Z","published":"2023-06-13T19:51:22Z","title":"AutoML in the Age of Large Language Models: Current Challenges, Future\n Opportunities and Risks","summary":" The fields of both Natural Language Processing (NLP) and Automated Machine\nLearning (AutoML) have achieved remarkable results over the past years. In NLP,\nespecially Large Language Models (LLMs) have experienced a rapid series of\nbreakthroughs very recently. We envision that the two fields can radically push\nthe boundaries of each other through tight integration. To showcase this\nvision, we explore the potential of a symbiotic relationship between AutoML and\nLLMs, shedding light on how they can benefit each other. In particular, we\ninvestigate both the opportunities to enhance AutoML approaches with LLMs from\ndifferent perspectives and the challenges of leveraging AutoML to further\nimprove LLMs. To this end, we survey existing work, and we critically assess\nrisks. We strongly believe that the integration of the two fields has the\npotential to disrupt both fields, NLP and AutoML. By highlighting conceivable\nsynergies, but also risks, we aim to foster further exploration at the\nintersection of AutoML and LLMs.\n","authors":["Alexander Tornede","Difan Deng","Theresa Eimer","Joseph Giovanelli","Aditya Mohan","Tim Ruhkopf","Sarah Segel","Daphne Theodorakopoulos","Tanja Tornede","Henning Wachsmuth","Marius Lindauer"],"pdf_url":"https://arxiv.org/pdf/2306.08107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10758v1","updated":"2023-08-21T14:44:31Z","published":"2023-08-21T14:44:31Z","title":"DepreSym: A Depression Symptom Annotated Corpus and the Role of LLMs as\n Assessors of Psychological Markers","summary":" Computational methods for depression detection aim to mine traces of\ndepression from online publications posted by Internet users. However,\nsolutions trained on existing collections exhibit limited generalisation and\ninterpretability. To tackle these issues, recent studies have shown that\nidentifying depressive symptoms can lead to more robust models. The eRisk\ninitiative fosters research on this area and has recently proposed a new\nranking task focused on developing search methods to find sentences related to\ndepressive symptoms. This search challenge relies on the symptoms specified by\nthe Beck Depression Inventory-II (BDI-II), a questionnaire widely used in\nclinical practice. 
Based on the participant systems' results, we present the\nDepreSym dataset, consisting of 21580 sentences annotated according to their\nrelevance to the 21 BDI-II symptoms. The labelled sentences come from a pool of\ndiverse ranking methods, and the final dataset serves as a valuable resource\nfor advancing the development of models that incorporate depressive markers\nsuch as clinical symptoms. Due to the complex nature of this relevance\nannotation, we designed a robust assessment methodology carried out by three\nexpert assessors (including an expert psychologist). Additionally, we explore\nhere the feasibility of employing recent Large Language Models (ChatGPT and\nGPT4) as potential assessors in this complex task. We undertake a comprehensive\nexamination of their performance, determine their main limitations and analyze\ntheir role as a complement or replacement for human annotators.\n","authors":["Anxo Pérez","Marcos Fernández-Pichel","Javier Parapar","David E. Losada"],"pdf_url":"https://arxiv.org/pdf/2308.10758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10755v1","updated":"2023-08-21T14:40:48Z","published":"2023-08-21T14:40:48Z","title":"WanJuan: A Comprehensive Multimodal Dataset for Advancing English and\n Chinese Large Models","summary":" The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the\ndevelopment of large models, leading to the creation of numerous impressive\nlarge language models(LLMs) and multimodal large language models (MLLMs). These\ncutting-edge models owe their remarkable performance to high-quality data.\nHowever, the details of the training data used in leading paradigms are often\nkept confidential. This lack of transparency, coupled with the scarcity of\nopen-source data, impedes further developments within the community. As a\nresponse, this paper presents \"Wan Juan\", a large-scale multimodal dataset\ncomposed of both Chinese and English data, collected from a wide range of web\nsources. The dataset incorporates text, image-text, and video modalities, with\na total volume exceeding 2TB. It was utilized in the training of InternLM, a\nmodel that demonstrated significant advantages in multi-dimensional evaluations\nwhen compared to models of a similar scale. All data can be accessed at\nhttps://opendatalab.org.cn/WanJuan1.0.\n","authors":["Conghui He","Zhenjiang Jin","Chao Xu","Jiantao Qiu","Bin Wang","Wei Li","Hang Yan","JiaQi Wang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.10755v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.00158v3","updated":"2023-08-21T14:23:14Z","published":"2023-07-31T21:13:30Z","title":"Predicting Perfect Quality Segments in MT Output with Fine-Tuned OpenAI\n LLM: Is it possible to capture editing distance patterns from historical\n data?","summary":" Translation Quality Estimation (TQE) is an essential step before deploying\nthe output translation into usage. TQE is also critical in assessing machine\ntranslation (MT) and human translation (HT) quality without seeing the\nreference translations. This work examines whether the state-of-the-art large\nlanguage models (LLMs) can be fine-tuned for the TQE task and their capability.\nWe take ChatGPT as one example and approach TQE as a binary classification\ntask. 
Using eight language pairs including English to Italian, German,\nFrench, Japanese, Dutch, Portuguese, Turkish, and Chinese training corpora, our\nexperimental results show that fine-tuned ChatGPT via its API can achieve a\nrelatively high score on predicting translation quality, i.e. if the\ntranslation needs to be edited. However, there is still considerable room to\nimprove model accuracy: for example, accuracy is 82.42% and 83.69% for\nEnglish-Italian and English-German respectively under our experimental\nsettings. An English-Italian bilingual abstract is available in the paper.\n","authors":["Serge Gladkoff","Gleb Erofeev","Lifeng Han","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2308.00158v3.pdf","comment":"8 pages, 11 figures, under-review to ItalianNLP-2023"},{"id":"http://arxiv.org/abs/2305.15914v2","updated":"2023-08-21T12:51:10Z","published":"2023-05-25T10:20:15Z","title":"Reliable Detection and Quantification of Selective Forces in Language\n Change","summary":" Language change is a cultural evolutionary process in which variants of\nlinguistic variables change in frequency through processes analogous to\nmutation, selection and genetic drift. In this work, we apply a\nrecently-introduced method to corpus data to quantify the strength of selection\nin specific instances of historical language change. We first demonstrate, in\nthe context of English irregular verbs, that this method is more reliable and\ninterpretable than similar methods that have previously been applied. We\nfurther extend this study to demonstrate that a bias towards phonological\nsimplicity overrides that favouring grammatical simplicity when these are in\nconflict. Finally, with reference to Spanish spelling reforms, we show that the\nmethod can also detect points in time at which selection strengths change, a\nfeature that is generically expected for socially-motivated language change.\nTogether, these results indicate how hypotheses for mechanisms of language\nchange can be tested quantitatively using historical corpus data.\n","authors":["Juan Guerrero Montero","Andres Karjus","Kenny Smith","Richard A. Blythe"],"pdf_url":"https://arxiv.org/pdf/2305.15914v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10684v1","updated":"2023-08-21T12:37:42Z","published":"2023-08-21T12:37:42Z","title":"Systematic Offensive Stereotyping (SOS) Bias in Language Models","summary":" Research has shown that language models (LMs) are socially biased. However,\ntoxicity and offensive stereotyping bias in LMs are understudied. In this\npaper, we investigate the systematic offensive stereotype (SOS) bias in LMs. We\npropose a method to measure it. Then, we validate the SOS bias and investigate\nthe effectiveness of debias methods from the literature on removing it.\nFinally, we investigate the impact of the SOS bias in LMs on their performance\nand their fairness on the task of hate speech detection. Our results suggest\nthat all the inspected LMs are SOS biased. The results suggest that the SOS\nbias in LMs is reflective of the hate experienced online by the inspected\nmarginalized groups. The results indicate that removing the SOS bias in LMs,\nusing a popular debias method from the literature, leads to worse SOS bias\nscores. Finally, our results show no strong evidence that the SOS bias in LMs\nis impactful on their performance on hate speech detection. 
On the other hand,\nthere is evidence that the SOS bias in LMs is impactful on their fairness.\n","authors":["Fatma Elsafoury"],"pdf_url":"https://arxiv.org/pdf/2308.10684v1.pdf","comment":"Keywords: Systematic offensive stereotyping (SOS) bias, Language\n models, bias removal, fairness, hate speech detection"},{"id":"http://arxiv.org/abs/2308.10682v1","updated":"2023-08-21T12:33:35Z","published":"2023-08-21T12:33:35Z","title":"LibriWASN: A Data Set for Meeting Separation, Diarization, and\n Recognition with Asynchronous Recording Devices","summary":" We present LibriWASN, a data set whose design follows closely the LibriCSS\nmeeting recognition data set, with the marked difference that the data is\nrecorded with devices that are randomly positioned on a meeting table and whose\nsampling clocks are not synchronized. Nine different devices, five smartphones\nwith a single recording channel and four microphone arrays, are used to record\na total of 29 channels. Other than that, the data set follows closely the\nLibriCSS design: the same LibriSpeech sentences are played back from eight\nloudspeakers arranged around a meeting table and the data is organized in\nsubsets with different percentages of speech overlap. LibriWASN is meant as a\ntest set for clock synchronization algorithms, meeting separation, diarization\nand transcription systems on ad-hoc wireless acoustic sensor networks. Due to\nits similarity to LibriCSS, meeting transcription systems developed for the\nformer can readily be tested on LibriWASN. The data set is recorded in two\ndifferent rooms and is complemented with ground-truth diarization information\nof who speaks when.\n","authors":["Joerg Schmalenstroeer","Tobias Gburrek","Reinhold Haeb-Umbach"],"pdf_url":"https://arxiv.org/pdf/2308.10682v1.pdf","comment":"Accepted for presentation at the ITG conference on Speech\n Communication 2023"},{"id":"http://arxiv.org/abs/2307.07870v2","updated":"2023-08-21T12:28:34Z","published":"2023-07-15T19:04:33Z","title":"Large Language Models as Superpositions of Cultural Perspectives","summary":" Large Language Models (LLMs) are often misleadingly recognized as having a\npersonality or a set of values. We argue that an LLM can be seen as a\nsuperposition of perspectives with different values and personality traits.\nLLMs exhibit context-dependent values and personality traits that change based\non the induced perspective (as opposed to humans, who tend to have more\ncoherent values and personality traits across contexts). We introduce the\nconcept of perspective controllability, which refers to a model's affordance to\nadopt various perspectives with differing values and personality traits. In our\nexperiments, we use questionnaires from psychology (PVQ, VSM, IPIP) to study\nhow exhibited values and personality traits change based on different\nperspectives. Through qualitative experiments, we show that LLMs express\ndifferent values when those are (implicitly or explicitly) implied in the\nprompt, and that LLMs express different values even when those are not\nobviously implied (demonstrating their context-dependent nature). We then\nconduct quantitative experiments to study the controllability of different\nmodels (GPT-4, GPT-3.5, OpenAssistant, StableVicuna, StableLM), the\neffectiveness of various methods for inducing perspectives, and the smoothness\nof the models' drivability. We conclude by examining the broader implications\nof our work and outline a variety of associated scientific questions. 
The\nproject website is available at\nhttps://sites.google.com/view/llm-superpositions .\n","authors":["Grgur Kovač","Masataka Sawayama","Rémy Portelas","Cédric Colas","Peter Ford Dominey","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2307.07870v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.10633v1","updated":"2023-08-21T11:08:16Z","published":"2023-08-21T11:08:16Z","title":"RaLLe: A Framework for Developing and Evaluating Retrieval-Augmented\n Large Language Models","summary":" Retrieval-augmented large language models (R-LLMs) combine pre-trained large\nlanguage models (LLMs) with information retrieval systems to improve the\naccuracy of factual question-answering. However, current libraries for building\nR-LLMs provide high-level abstractions without sufficient transparency for\nevaluating and optimizing prompts within specific inference processes such as\nretrieval and generation. To address this gap, we present RaLLe, an open-source\nframework designed to facilitate the development, evaluation, and optimization\nof R-LLMs for knowledge-intensive tasks. With RaLLe, developers can easily\ndevelop and evaluate R-LLMs, improving hand-crafted prompts, assessing\nindividual inference processes, and objectively measuring overall system\nperformance quantitatively. By leveraging these features, developers can\nenhance the performance and accuracy of their R-LLMs in knowledge-intensive\ngeneration tasks. We open-source our code at https://github.com/yhoshi3/RaLLe.\n","authors":["Yasuto Hoshi","Daisuke Miyashita","Youyang Ng","Kento Tatsuno","Yasuhiro Morioka","Osamu Torii","Jun Deguchi"],"pdf_url":"https://arxiv.org/pdf/2308.10633v1.pdf","comment":"18 pages, 2 figures, see https://youtu.be/JYbm75qnfTg for the\n demonstration screencast"},{"id":"http://arxiv.org/abs/2308.09687v2","updated":"2023-08-21T10:51:42Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. 
This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Michal Podstawski","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08998v2","updated":"2023-08-21T10:23:42Z","published":"2023-08-17T14:12:48Z","title":"Reinforced Self-Training (ReST) for Language Modeling","summary":" Reinforcement learning from human feedback (RLHF) can improve the quality of\nlarge language model's (LLM) outputs by aligning them with human preferences.\nWe propose a simple algorithm for aligning LLMs with human preferences inspired\nby growing batch reinforcement learning (RL), which we call Reinforced\nSelf-Training (ReST). Given an initial LLM policy, ReST produces a dataset by\ngenerating samples from the policy, which are then used to improve the LLM\npolicy using offline RL algorithms. ReST is more efficient than typical online\nRLHF methods because the training dataset is produced offline, which allows\ndata reuse. While ReST is a general approach applicable to all generative\nlearning settings, we focus on its application to machine translation. Our\nresults show that ReST can substantially improve translation quality, as\nmeasured by automated metrics and human evaluation on machine translation\nbenchmarks in a compute and sample-efficient manner.\n","authors":["Caglar Gulcehre","Tom Le Paine","Srivatsan Srinivasan","Ksenia Konyushkova","Lotte Weerts","Abhishek Sharma","Aditya Siddhant","Alex Ahern","Miaosen Wang","Chenjie Gu","Wolfgang Macherey","Arnaud Doucet","Orhan Firat","Nando de Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.08998v2.pdf","comment":"23 pages, 16 figures"},{"id":"http://arxiv.org/abs/2307.16811v2","updated":"2023-08-21T10:20:02Z","published":"2023-07-31T16:29:08Z","title":"DoDo Learning: DOmain-DemOgraphic Transfer in Language Models for\n Detecting Abuse Targeted at Public Figures","summary":" Public figures receive a disproportionate amount of abuse on social media,\nimpacting their active participation in public life. Automated systems can\nidentify abuse at scale but labelling training data is expensive, complex and\npotentially harmful. So, it is desirable that systems are efficient and\ngeneralisable, handling both shared and specific aspects of online abuse. We\nexplore the dynamics of cross-group text classification in order to understand\nhow well classifiers trained on one domain or demographic can transfer to\nothers, with a view to building more generalisable abuse classifiers. We\nfine-tune language models to classify tweets targeted at public figures across\nDOmains (sport and politics) and DemOgraphics (women and men) using our novel\nDODO dataset, containing 28,000 labelled entries, split equally across four\ndomain-demographic pairs. We find that (i) small amounts of diverse data are\nhugely beneficial to generalisation and model adaptation; (ii) models transfer\nmore easily across demographics but models trained on cross-domain data are\nmore generalisable; (iii) some groups contribute more to generalisability than\nothers; and (iv) dataset similarity is a signal of transferability.\n","authors":["Hannah Rose Kirk","Angus R. Williams","Liam Burke","Yi-Ling Chung","Ivan Debono","Pica Johansson","Francesca Stevens","Jonathan Bright","Scott A. 
Hale"],"pdf_url":"https://arxiv.org/pdf/2307.16811v2.pdf","comment":"15 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.10592v1","updated":"2023-08-21T09:47:31Z","published":"2023-08-21T09:47:31Z","title":"BAN-PL: a Novel Polish Dataset of Banned Harmful and Offensive Content\n from Wykop.pl web service","summary":" Advances in automated detection of offensive language online, including hate\nspeech and cyberbullying, require improved access to publicly available\ndatasets comprising social media content. In this paper, we introduce BAN-PL,\nthe first open dataset in the Polish language that encompasses texts flagged as\nharmful and subsequently removed by professional moderators. The dataset\nencompasses a total of 691,662 pieces of content from a popular social\nnetworking service, Wykop.pl, often referred to as the \"Polish Reddit\",\nincluding both posts and comments, and is evenly distributed into two distinct\nclasses: \"harmful\" and \"neutral\". We provide a comprehensive description of the\ndata collection and preprocessing procedures, as well as highlight the\nlinguistic specificity of the data. The BAN-PL dataset, along with advanced\npreprocessing scripts for, i.a., unmasking profanities, will be publicly\navailable.\n","authors":["Inez Okulska","Kinga Głąbińska","Anna Kołos","Agnieszka Karlińska","Emilia Wiśnios","Adam Nowakowski","Paweł Ellerik","Andrzej Prałat"],"pdf_url":"https://arxiv.org/pdf/2308.10592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09582v4","updated":"2023-08-21T09:44:19Z","published":"2023-02-19T14:21:33Z","title":"Language-Specific Representation of Emotion-Concept Knowledge Causally\n Supports Emotion Inference","summary":" Understanding how language supports emotion inference remains a topic of\ndebate in emotion science. The present study investigated whether\nlanguage-derived emotion-concept knowledge would causally support emotion\ninference by manipulating the language-specific knowledge representations in\nlarge language models. Using the prompt technique, 14 attributes of emotion\nconcepts were found to be represented by distinct artificial neuron\npopulations. By manipulating these attribute-related neurons, the majority of\nthe emotion inference tasks showed performance deterioration compared to random\nmanipulations. The attribute-specific performance deterioration was related to\nthe importance of different attributes in human mental space. Our findings\nprovide causal evidence in support of a language-based mechanism for emotion\ninference and highlight the contributions of emotion-concept knowledge.\n","authors":["Ming Li","Yusheng Su","Hsiu-Yuan Huang","Jiali Cheng","Xin Hu","Xinmiao Zhang","Huadong Wang","Yujia Qin","Xiaozhi Wang","Zhiyuan Liu","Dan Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.09582v4.pdf","comment":"39 pages, 13 figures, 2 tables, fix formatting errors"},{"id":"http://arxiv.org/abs/2308.10586v1","updated":"2023-08-21T09:40:19Z","published":"2023-08-21T09:40:19Z","title":"Age Recommendation from Texts and Sentences for Children","summary":" Children have less text understanding capability than adults. Moreover, this\ncapability differs among the children of different ages. Hence, automatically\npredicting a recommended age based on texts or sentences would be a great\nbenefit to propose adequate texts to children and to help authors writing in\nthe most appropriate way. This paper presents our recent advances on the age\nrecommendation task. 
We consider age recommendation as a regression task, and\ndiscuss the need for appropriate evaluation metrics, study the use of a\nstate-of-the-art machine learning model, namely the Transformer, and compare it\nto different models coming from the literature. Our results are also compared\nwith recommendations made by experts. Further, this paper deals with\npreliminary explainability of the age prediction model by analyzing various\nlinguistic features. We conduct the experiments on a dataset of 3,673 French\ntexts (132K sentences, 2.5M words). To recommend age at the text level and\nsentence level, our best models achieve MAE scores of 0.98 and 1.83\nrespectively on the test set. Also, compared to the recommendations made by\nexperts, our sentence-level recommendation model gets a similar score to the\nexperts, while the text-level recommendation model outperforms the experts by\nan MAE score of 1.48.\n","authors":["Rashedur Rahman","Gwénolé Lecorvé","Nicolas Béchet"],"pdf_url":"https://arxiv.org/pdf/2308.10586v1.pdf","comment":"26 pages (incl. 4 pages for appendices), 4 figures, 20 tables"},{"id":"http://arxiv.org/abs/2308.10585v1","updated":"2023-08-21T09:35:33Z","published":"2023-08-21T09:35:33Z","title":"Exploring Equation as a Better Intermediate Meaning Representation for\n Numerical Reasoning","summary":" Numerical reasoning is vital for natural language processing models to\nunderstand and process numerical information in real-world scenarios. Most\ncurrent methods first generate the Intermediate Meaning Representations (IMRs)\nof questions and then generate answers. Current SOTA methods generate programs\nas IMRs with large language models (LLMs). Intuitively, equations have fewer\nrestrictions and closer semantics to the question than programs, leading to\nhigher generation accuracy. However, current LLMs generate equations worse than\nprograms; we assume this is because equation data is rarer than program data in\npre-training corpora. So in this paper, we try to use equations as IMRs to\nsolve the numerical reasoning task by addressing two problems: (1)\nTheoretically, how to prove that the equation is an IMR with higher generation\naccuracy than programs; (2) Empirically, how to improve the generation accuracy\nof equations with LLMs. For the first problem, we propose and prove a\nproposition to theoretically compare the generation accuracy of different IMRs.\nFor the second problem, we present a method called Boosting Numerical\nReasoning by Decomposing the Generation of Equations (Bridge), which can\nimprove the accuracy of LLMs in generating equations as IMRs by reducing the\ntendency of generating constant expressions and programs. Our method improves\nthe performance by 2.2%, 0.9%, and 1.7% on GSM8K, SVAMP, and Algebra datasets\ncompared to the previous state-of-the-art methods under the single reasoning\npath setting. Our codes and prompts are released at\nhttps://github.com/zirui-HIT/Bridge_for_Numerical_Reasoning.\n","authors":["Dingzirui Wang","Longxu Dou","Wenbin Zhang","Junyu Zeng","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2308.10585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10578v1","updated":"2023-08-21T09:24:12Z","published":"2023-08-21T09:24:12Z","title":"Weakly synchronous systems with three machines are Turing powerful","summary":" Communicating finite-state machines (CFMs) are a Turing powerful model of\nasynchronous message-passing distributed systems. 
In weakly synchronous\nsystems, processes communicate through phases in which messages are first sent\nand then received, for each process. Such systems enjoy a limited form of\nsynchronization, and for some communication models, this restriction is enough\nto make the reachability problem decidable. In particular, we explore the\nintriguing case of p2p (FIFO) communication, for which the reachability problem\nis known to be undecidable for four processes, but decidable for two. We show\nthat the configuration reachability problem for weakly synchronous systems of\nthree processes is undecidable. This result is heavily inspired by our study on\nthe treewidth of the Message Sequence Charts (MSCs) that might be generated by\nsuch systems. In this sense, the main contribution of this work is a weakly\nsynchronous system with three processes that generates MSCs of arbitrarily\nlarge treewidth.\n","authors":["Cinzia Di Giusto","Davide Ferré","Etienne Lozes","Nicolas Nisse"],"pdf_url":"https://arxiv.org/pdf/2308.10578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11957v4","updated":"2023-08-21T09:20:48Z","published":"2023-04-24T09:50:39Z","title":"Benchmarking ChatGPT-4 on ACR Radiation Oncology In-Training (TXIT) Exam\n and Red Journal Gray Zone Cases: Potentials and Challenges for AI-Assisted\n Medical Education and Decision Making in Radiation Oncology","summary":" The potential of large language models in medicine for education and decision\nmaking purposes has been demonstrated as they achieve decent scores on medical\nexams such as the United States Medical Licensing Exam (USMLE) and the MedQA\nexam. In this work, we evaluate the performance of ChatGPT-4 in the specialized\nfield of radiation oncology using the 38th American College of Radiology (ACR)\nradiation oncology in-training (TXIT) exam and the 2022 Red Journal Gray Zone\ncases. For the TXIT exam, ChatGPT-3.5 and ChatGPT-4 have achieved the scores of\n63.65% and 74.57%, respectively, highlighting the advantage of the latest\nChatGPT-4 model. Based on the TXIT exam, ChatGPT-4's strong and weak areas in\nradiation oncology are identified to some extent. Specifically, ChatGPT-4\ndemonstrates better knowledge of statistics, CNS & eye, pediatrics, biology,\nand physics than knowledge of bone & soft tissue and gynecology, as per the ACR\nknowledge domain. Regarding clinical care paths, ChatGPT-4 performs better in\ndiagnosis, prognosis, and toxicity than brachytherapy and dosimetry. It lacks\nproficiency in in-depth details of clinical trials. For the Gray Zone cases,\nChatGPT-4 is able to suggest a personalized treatment approach to each case\nwith high correctness and comprehensiveness. Importantly, it provides novel\ntreatment aspects for many cases, which are not suggested by any human experts.\nBoth evaluations demonstrate the potential of ChatGPT-4 in medical education\nfor the general public and cancer patients, as well as the potential to aid\nclinical decision-making, while acknowledging its limitations in certain\ndomains. Because of the risk of hallucination, facts provided by ChatGPT always\nneed to be verified.\n","authors":["Yixing Huang","Ahmed Gomaa","Sabine Semrau","Marlen Haderlein","Sebastian Lettmaier","Thomas Weissmann","Johanna Grigo","Hassen Ben Tkhayat","Benjamin Frey","Udo S. Gaipl","Luitpold V. 
Distel","Andreas Maier","Rainer Fietkau","Christoph Bert","Florian Putz"],"pdf_url":"https://arxiv.org/pdf/2304.11957v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10564v1","updated":"2023-08-21T08:41:46Z","published":"2023-08-21T08:41:46Z","title":"Software Entity Recognition with Noise-Robust Learning","summary":" Recognizing software entities such as library names from free-form text is\nessential to enable many software engineering (SE) technologies, such as\ntraceability link recovery, automated documentation, and API recommendation.\nWhile many approaches have been proposed to address this problem, they suffer\nfrom small entity vocabularies or noisy training data, hindering their ability\nto recognize software entities mentioned in sophisticated narratives. To\naddress this challenge, we leverage the Wikipedia taxonomy to develop a\ncomprehensive entity lexicon with 79K unique software entities in 12\nfine-grained types, as well as a large labeled dataset of over 1.7M sentences.\nThen, we propose self-regularization, a noise-robust learning approach, to the\ntraining of our software entity recognition (SER) model by accounting for many\ndropouts. Results show that models trained with self-regularization outperform\nboth their vanilla counterparts and state-of-the-art approaches on our\nWikipedia benchmark and two Stack Overflow benchmarks. We release our models,\ndata, and code for future research.\n","authors":["Tai Nguyen","Yifeng Di","Joohan Lee","Muhao Chen","Tianyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10564v1.pdf","comment":"ASE 2023"},{"id":"http://arxiv.org/abs/2308.08747v2","updated":"2023-08-21T08:18:24Z","published":"2023-08-17T02:53:23Z","title":"An Empirical Study of Catastrophic Forgetting in Large Language Models\n During Continual Fine-tuning","summary":" Catastrophic forgetting (CF) is a phenomenon that occurs in machine learning\nwhen a model forgets previously learned information as it learns new\ninformation. As large language models (LLMs) have shown excellent performance,\nit is interesting to uncover whether CF exists in the continual fine-tuning of\nLLMs. In this study, we empirically evaluate the forgetting phenomenon in LLMs'\nknowledge, from the perspectives of domain knowledge, reasoning, and reading\ncomprehension. The experiments demonstrate that catastrophic forgetting is\ngenerally observed in LLMs ranging from 1b to 7b. Furthermore, as the scale\nincreases, the severity of forgetting also intensifies. Comparing the\ndecoder-only model BLOOMZ with the encoder-decoder model mT0, BLOOMZ suffers\nless forgetting and maintains more knowledge. We also observe that LLMs can\nmitigate language bias (e.g. gender bias) during continual fine-tuning.\nMoreover, we find that ALPACA can maintain more knowledge and capacity compared\nwith LLAMA during the continual fine-tuning, which implies that general\ninstruction tuning can help mitigate the forgetting phenomenon of LLMs in the\nfurther fine-tuning process.\n","authors":["Yun Luo","Zhen Yang","Fandong Meng","Yafu Li","Jie Zhou","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08747v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10529v1","updated":"2023-08-21T07:31:19Z","published":"2023-08-21T07:31:19Z","title":"SeqGPT: An Out-of-the-box Large Language Model for Open Domain Sequence\n Understanding","summary":" Large language models (LLMs) have shown impressive ability for open-domain\nNLP tasks. 
However, LLMs are sometimes too footloose for natural language\nunderstanding (NLU) tasks which always have restricted output and input format.\nTheir performances on NLU tasks are highly related to prompts or demonstrations\nand are shown to be poor at performing several representative NLU tasks, such\nas event extraction and entity typing. To this end, we present SeqGPT, a\nbilingual (i.e., English and Chinese) open-source autoregressive model\nspecially enhanced for open-domain natural language understanding. We express\nall NLU tasks with two atomic tasks, which define fixed instructions to\nrestrict the input and output format but still ``open'' for arbitrarily varied\nlabel sets. The model is first instruction-tuned with extremely fine-grained\nlabeled data synthesized by ChatGPT and then further fine-tuned by 233\ndifferent atomic tasks from 152 datasets across various domains. The\nexperimental results show that SeqGPT has decent classification and extraction\nability, and is capable of performing language understanding tasks on unseen\ndomains. We also conduct empirical studies on the scaling of data and model\nsize as well as on the transfer across tasks. Our model is accessible at\nhttps://github.com/Alibaba-NLP/SeqGPT.\n","authors":["Tianyu Yu","Chengyue Jiang","Chao Lou","Shen Huang","Xiaobin Wang","Wei Liu","Jiong Cai","Yangning Li","Yinghui Li","Kewei Tu","Hai-Tao Zheng","Ningyu Zhang","Pengjun Xie","Fei Huang","Yong Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.10529v1.pdf","comment":"Initial version of SeqGPT"},{"id":"http://arxiv.org/abs/2201.06786v2","updated":"2023-08-21T06:58:13Z","published":"2022-01-18T07:31:59Z","title":"Unsupervised Multimodal Word Discovery based on Double Articulation\n Analysis with Co-occurrence cues","summary":" Human infants acquire their verbal lexicon with minimal prior knowledge of\nlanguage based on the statistical properties of phonological distributions and\nthe co-occurrence of other sensory stimuli. This study proposes a novel fully\nunsupervised learning method for discovering speech units using phonological\ninformation as a distributional cue and object information as a co-occurrence\ncue. The proposed method can acquire words and phonemes from speech signals\nusing unsupervised learning and utilize object information based on multiple\nmodalities-vision, tactile, and auditory-simultaneously. The proposed method is\nbased on the nonparametric Bayesian double articulation analyzer (NPB-DAA)\ndiscovering phonemes and words from phonological features, and multimodal\nlatent Dirichlet allocation (MLDA) categorizing multimodal information obtained\nfrom objects. In an experiment, the proposed method showed higher word\ndiscovery performance than baseline methods. Words that expressed the\ncharacteristics of objects (i.e., words corresponding to nouns and adjectives)\nwere segmented accurately. Furthermore, we examined how learning performance is\naffected by differences in the importance of linguistic information. 
Increasing\nthe weight of the word modality further improved performance relative to that\nof the fixed condition.\n","authors":["Akira Taniguchi","Hiroaki Murakami","Ryo Ozaki","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2201.06786v2.pdf","comment":"Accepted to IEEE TRANSACTIONS ON COGNITIVE DEVELOPMENTAL SYSTEMS"},{"id":"http://arxiv.org/abs/2308.10509v1","updated":"2023-08-21T06:50:29Z","published":"2023-08-21T06:50:29Z","title":"An Examination of the Compositionality of Large Generative\n Vision-Language Models","summary":" With the success of Large Language Models (LLMs), a surge of Generative\nVision-Language Models (GVLMs) have been constructed via multimodal instruction\ntuning. The tuning recipe substantially deviates from the common contrastive\nvision-language learning. However, the performance of GVLMs in multimodal\ncompositional reasoning remains largely unexplored, as existing evaluation\nmetrics and benchmarks focus predominantly on assessing contrastive models like\nCLIP. In this paper, we examine the potential evaluation metrics to assess the\nGVLMs and hypothesize generative score methods are suitable for evaluating\ncompositionality. In addition, current benchmarks tend to prioritize syntactic\ncorrectness over semantics. The presence of morphological bias in these\nbenchmarks can be exploited by GVLMs, leading to ineffective evaluations. To\ncombat this, we define a MorphoBias Score to quantify the morphological bias\nand propose a novel LLM-based strategy to calibrate the bias. Moreover, a\nchallenging task is added to evaluate the robustness of GVLMs against inherent\ninclination toward syntactic correctness. We include the calibrated dataset and\nthe task into a new benchmark, namely MOrphologicall De-biased Benchmark\n(MODE). Our study provides the first unbiased benchmark for the\ncompositionality of GVLMs, facilitating future research in this direction. We\nwill release our code and datasets.\n","authors":["Teli Ma","Rong Li","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2308.10509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10502v1","updated":"2023-08-21T06:42:42Z","published":"2023-08-21T06:42:42Z","title":"GradientCoin: A Peer-to-Peer Decentralized Large Language Models","summary":" Since 2008, after the proposal of a Bitcoin electronic cash system, Bitcoin\nhas fundamentally changed the economic system over the last decade. Since 2022,\nlarge language models (LLMs) such as GPT have outperformed humans in many\nreal-life tasks. However, these large language models have several practical\nissues. For example, the model is centralized and controlled by a specific\nunit. One weakness is that if that unit decides to shut down the model, it\ncannot be used anymore. The second weakness is the lack of guaranteed\ndiscrepancy behind this model, as certain dishonest units may design their own\nmodels and feed them unhealthy training data.\n In this work, we propose a purely theoretical design of a decentralized LLM\nthat operates similarly to a Bitcoin cash system. However, implementing such a\nsystem might encounter various practical difficulties. Furthermore, this new\nsystem is unlikely to perform better than the standard Bitcoin system in\neconomics. Therefore, the motivation for designing such a system is limited. 
It\nis likely that only two types of people would be interested in setting up a\npractical system for it:\n $\\bullet$ Those who prefer to use decentralized ChatGPT-like software.\n $\\bullet$ Those who believe that the purpose of carbon-based life is to\ncreate silicon-based life, such as Optimus Prime in Transformers.\n The reason the second type of people may be interested is that it is possible\nthat one day an AI system like this will awaken and become the next level of\nintelligence on this planet.\n","authors":["Yeqi Gao","Zhao Song","Junze Yin"],"pdf_url":"https://arxiv.org/pdf/2308.10502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10482v1","updated":"2023-08-21T05:46:40Z","published":"2023-08-21T05:46:40Z","title":"An Effective Method using Phrase Mechanism in Neural Machine Translation","summary":" Machine Translation is one of the essential tasks in Natural Language\nProcessing (NLP), which has massive applications in real life as well as\ncontributing to other tasks in the NLP research community. Recently,\nTransformer-based methods have attracted numerous researchers in this domain\nand achieved state-of-the-art results for most language pairs. In this\npaper, we report an effective method using a phrase mechanism,\nPhraseTransformer, to improve the strong baseline model Transformer in\nconstructing a Neural Machine Translation (NMT) system for the parallel corpora\nVietnamese-Chinese. Our experiments on the MT dataset of the VLSP 2022\ncompetition achieved a BLEU score of 35.3 for Vietnamese to Chinese and 33.2\nfor Chinese to Vietnamese. Our code is available at\nhttps://github.com/phuongnm94/PhraseTransformer.\n","authors":["Phuong Minh Nguyen","Le Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.10482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10470v1","updated":"2023-08-21T05:11:03Z","published":"2023-08-21T05:11:03Z","title":"Implicit Self-supervised Language Representation for Spoken Language\n Diarization","summary":" In a code-switched (CS) scenario, the use of spoken language diarization (LD)\nas a pre-processing system is essential. Further, the use of implicit\nframeworks is preferable over the explicit framework, as it can be easily\nadapted to deal with low/zero resource languages. Inspired by the speaker\ndiarization (SD) literature, three frameworks based on (1) fixed segmentation,\n(2) change point-based segmentation and (3) E2E are proposed to perform LD.\nInitial exploration with the synthetic TTSF-LD dataset shows that using\nx-vectors as the implicit language representation with an appropriate analysis\nwindow length ($N$) can achieve on-par performance with explicit LD. The best\nimplicit LD performance of $6.38$ in terms of Jaccard error rate (JER) is\nachieved by using the E2E framework. However, with the E2E framework the\nperformance of implicit LD degrades to $60.4$ when used with the practical\nMicrosoft CS (MSCS) dataset. The difference in performance is mostly due to the\ndistributional difference between the monolingual segment durations of the\nsecondary language in the MSCS and TTSF-LD datasets. Moreover, to avoid segment\nsmoothing, the smaller duration of the monolingual segments suggests the use of\na small value of $N$. At the same time, with a small $N$, the x-vector\nrepresentation is unable to capture the required language discrimination due to\nacoustic similarity, as the same speaker is speaking both languages. 
Therefore, to resolve the issue\na self-supervised implicit language representation is proposed in this study.\nIn comparison with the x-vector representation, the proposed representation\nprovides a relative improvement of $63.9\\%$ and achieved a JER of $21.8$ using\nthe E2E framework.\n","authors":["Jagabandhu Mishra","S. R. Mahadeva Prasanna"],"pdf_url":"https://arxiv.org/pdf/2308.10470v1.pdf","comment":"Planning to Submit in IEEE-JSTSP"},{"id":"http://arxiv.org/abs/2308.10464v1","updated":"2023-08-21T04:42:24Z","published":"2023-08-21T04:42:24Z","title":"Unsupervised Dialogue Topic Segmentation in Hyperdimensional Space","summary":" We present HyperSeg, a hyperdimensional computing (HDC) approach to\nunsupervised dialogue topic segmentation. HDC is a class of vector symbolic\narchitectures that leverages the probabilistic orthogonality of randomly drawn\nvectors at extremely high dimensions (typically over 10,000). HDC generates\nrich token representations through its low-cost initialization of many\nunrelated vectors. This is especially beneficial in topic segmentation, which\noften operates as a resource-constrained pre-processing step for downstream\ntranscript understanding tasks. HyperSeg outperforms the current\nstate-of-the-art in 4 out of 5 segmentation benchmarks -- even when baselines\nare given partial access to the ground truth -- and is 10 times faster on\naverage. We show that HyperSeg also improves downstream summarization accuracy.\nWith HyperSeg, we demonstrate the viability of HDC in a major language task. We\nopen-source HyperSeg to provide a strong baseline for unsupervised topic\nsegmentation.\n","authors":["Seongmin Park","Jinkyu Seo","Jihwa Lee"],"pdf_url":"https://arxiv.org/pdf/2308.10464v1.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.10462v1","updated":"2023-08-21T04:31:06Z","published":"2023-08-21T04:31:06Z","title":"Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation\n with Large Language Models","summary":" Large Language Models (LLMs) possess impressive capabilities to generate\nmeaningful code snippets given natural language intents in zero-shot, i.e.,\nwithout the need for specific fine-tuning. In the perspective of unleashing\ntheir full potential, prior work has demonstrated the benefits of fine-tuning\nthe models to task-specific data. However, fine-tuning process demands heavy\ncomputational costs and is intractable when resources are scarce, especially\nfor models with billions of parameters. In light of these challenges, previous\nstudies explored In-Context Learning (ICL) as an effective strategy to generate\ncontextually appropriate code without fine-tuning. However, it operates at\ninference time and does not involve learning task-specific parameters,\npotentially limiting the model's performance on downstream tasks. In this\ncontext, we foresee that Parameter-Efficient Fine-Tuning (PEFT) techniques\ncarry a high potential for efficiently specializing LLMs to task-specific data.\nIn this paper, we deliver a comprehensive study of LLMs with the impact of PEFT\ntechniques under the automated code generation scenario. Our experimental\nresults reveal the superiority and potential of such techniques over ICL on a\nwide range of LLMs in reducing the computational burden and improving\nperformance. 
Therefore, the study opens opportunities for broader applications\nof PEFT in software engineering scenarios.\n","authors":["Martin Weyssow","Xin Zhou","Kisub Kim","David Lo","Houari Sahraoui"],"pdf_url":"https://arxiv.org/pdf/2308.10462v1.pdf","comment":"10+2 pages"},{"id":"http://arxiv.org/abs/2303.05063v4","updated":"2023-08-21T03:57:18Z","published":"2023-03-09T06:24:50Z","title":"ICL-D3IE: In-Context Learning with Diverse Demonstrations Updating for\n Document Information Extraction","summary":" Large language models (LLMs), such as GPT-3 and ChatGPT, have demonstrated\nremarkable results in various natural language processing (NLP) tasks with\nin-context learning, which involves inference based on a few demonstration\nexamples. Despite their successes in NLP tasks, no investigation has been\nconducted to assess the ability of LLMs to perform document information\nextraction (DIE) using in-context learning. Applying LLMs to DIE poses two\nchallenges: the modality and task gap. To this end, we propose a simple but\neffective in-context learning framework called ICL-D3IE, which enables LLMs to\nperform DIE with different types of demonstration examples. Specifically, we\nextract the most difficult and distinct segments from hard training documents\nas hard demonstrations for benefiting all test instances. We design\ndemonstrations describing relationships that enable LLMs to understand\npositional relationships. We introduce formatting demonstrations for easy\nanswer extraction. Additionally, the framework improves diverse demonstrations\nby updating them iteratively. Our experiments on three widely used benchmark\ndatasets demonstrate that the ICL-D3IE framework enables Davinci-003/ChatGPT to\nachieve superior performance when compared to previous pre-trained methods\nfine-tuned with full training in both the in-distribution (ID) setting and in\nthe out-of-distribution (OOD) setting. Code is available at\nhttps://github.com/MAEHCM/ICL-D3IE.\n","authors":["Jiabang He","Lei Wang","Yi Hu","Ning Liu","Hui Liu","Xing Xu","Heng Tao Shen"],"pdf_url":"https://arxiv.org/pdf/2303.05063v4.pdf","comment":"ICCV 2023. Code is available at https://github.com/MAEHCM/ICL-D3IE"},{"id":"http://arxiv.org/abs/2308.10452v1","updated":"2023-08-21T03:54:23Z","published":"2023-08-21T03:54:23Z","title":"Comparing Measures of Linguistic Diversity Across Social Media Language\n Data and Census Data at Subnational Geographic Areas","summary":" This paper describes a preliminary study on the comparative linguistic\necology of online spaces (i.e., social media language data) and real-world\nspaces in Aotearoa New Zealand (i.e., subnational administrative areas). We\ncompare measures of linguistic diversity between these different spaces and\ndiscuss how social media users align with real-world populations. The results\nfrom the current study suggests that there is potential to use online social\nmedia language data to observe spatial and temporal changes in linguistic\ndiversity at subnational geographic areas; however, further work is required to\nunderstand how well social media represents real-world behaviour.\n","authors":["Sidney G. -J. 
Wong","Jonathan Dunn","Benjamin Adams"],"pdf_url":"https://arxiv.org/pdf/2308.10452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09308v2","updated":"2023-08-21T03:43:56Z","published":"2023-08-18T05:05:35Z","title":"Differentiable Retrieval Augmentation via Generative Language Modeling\n for E-commerce Query Intent Classification","summary":" Retrieval augmentation, which enhances downstream models by a knowledge\nretriever and an external corpus instead of by merely increasing the number of\nmodel parameters, has been successfully applied to many natural language\nprocessing (NLP) tasks such as text classification, question answering and so\non. However, existing methods that separately or asynchronously train the\nretriever and downstream model mainly due to the non-differentiability between\nthe two parts, usually lead to degraded performance compared to end-to-end\njoint training. In this paper, we propose Differentiable Retrieval Augmentation\nvia Generative lANguage modeling(Dragan), to address this problem by a novel\ndifferentiable reformulation. We demonstrate the effectiveness of our proposed\nmethod on a challenging NLP task in e-commerce search, namely query intent\nclassification. Both the experimental results and ablation study show that the\nproposed method significantly and reasonably improves the state-of-the-art\nbaselines on both offline evaluation and online A/B test.\n","authors":["Chenyu Zhao","Yunjiang Jiang","Yiming Qiu","Han Zhang","Wen-Yun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09308v2.pdf","comment":"5 pages, 2 figures; accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2308.10444v1","updated":"2023-08-21T03:31:20Z","published":"2023-08-21T03:31:20Z","title":"Dynamic Strategy Chain: Dynamic Zero-Shot CoT for Long Mental Health\n Support Generation","summary":" Long counseling Text Generation for Mental health support (LTGM), an\ninnovative and challenging task, aims to provide help-seekers with mental\nhealth support through a comprehensive and more acceptable response. The\ncombination of chain-of-thought (CoT) prompting and Large Language Models\n(LLMs) is employed and get the SOTA performance on various NLP tasks,\nespecially on text generation tasks. Zero-shot CoT prompting is one of the most\ncommon methods in CoT prompting. However, in the LTGM task, Zero-shot CoT\nprompting can not simulate a counselor or provide personalized strategies\nwithout effective mental health counseling strategy prompts. To tackle this\nchallenge, we propose a zero-shot Dynamic Strategy Chain (DSC) prompting\nmethod. Firstly, we utilize GPT2 to learn the responses written by mental\nhealth counselors and dynamically generate mental health counseling strategies\ntailored to the help-seekers' needs. Secondly, the Zero-shot DSC prompting is\nconstructed according to mental health counseling strategies and the\nhelp-seekers' post. Finally, the Zero-shot DSC prompting is employed to guide\nLLMs in generating more human-like responses for the help-seekers. 
Both\nautomatic and manual evaluations demonstrate that Zero-shot DSC prompting can\ndeliver more human-like responses than CoT prompting methods on LTGM tasks.\n","authors":["Qi Chen","Dexi Liu"],"pdf_url":"https://arxiv.org/pdf/2308.10444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10443v1","updated":"2023-08-21T03:30:21Z","published":"2023-08-21T03:30:21Z","title":"Using Large Language Models for Cybersecurity Capture-The-Flag\n Challenges and Certification Questions","summary":" The assessment of cybersecurity Capture-The-Flag (CTF) exercises involves\nparticipants finding text strings or ``flags'' by exploiting system\nvulnerabilities. Large Language Models (LLMs) are natural-language models\ntrained on vast amounts of words to understand and generate text; they can\nperform well on many CTF challenges. Such LLMs are freely available to\nstudents. In the context of CTF exercises in the classroom, this raises\nconcerns about academic integrity. Educators must understand LLMs' capabilities\nto modify their teaching to accommodate generative AI assistance. This research\ninvestigates the effectiveness of LLMs, particularly in the realm of CTF\nchallenges and questions. Here we evaluate three popular LLMs: OpenAI ChatGPT,\nGoogle Bard, and Microsoft Bing. First, we assess the LLMs' question-answering\nperformance on five Cisco certifications with varying difficulty levels. Next,\nwe qualitatively study the LLMs' abilities in solving CTF challenges to\nunderstand their limitations. We report on the experience of using the LLMs for\nseven test cases in all five types of CTF challenges. In addition, we\ndemonstrate how jailbreak prompts can bypass and break LLMs' ethical\nsafeguards. The paper concludes by discussing the impact of LLMs on CTF\nexercises and its implications.\n","authors":["Wesley Tann","Yuancheng Liu","Jun Heng Sim","Choon Meng Seah","Ee-Chien Chang"],"pdf_url":"https://arxiv.org/pdf/2308.10443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09658v2","updated":"2023-08-21T03:08:52Z","published":"2023-08-18T16:21:40Z","title":"Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop\n Visual Reasoning","summary":" A promising trend has emerged of using large language models (LLMs) to\ngenerate code-like plans for complex inference tasks such as visual reasoning.\nThis paradigm, known as LLM-based planning, provides flexibility in problem\nsolving and offers better interpretability. However, current research is mostly\nlimited to basic scenarios of simple questions that can be answered\nstraightforwardly in a few inference steps. Planning for the more challenging\nmulti-hop visual reasoning tasks remains under-explored. Specifically, under\nmulti-hop reasoning situations, the trade-off between accuracy and the\ncomplexity of plan-searching becomes prominent. The prevailing algorithms\neither address the efficiency issue by employing fast one-stop generation or\nadopt a complex iterative generation method to improve accuracy. Both fail to\nbalance the need for efficiency and performance. Drawing inspiration from the\ndual-system view of cognition in the human brain, with its fast and slow\nthinking processes, we propose a hierarchical plan-searching algorithm that\nintegrates one-stop reasoning (fast) and the Tree-of-thought (slow). Our\napproach maintains strong performance while significantly reducing the number\nof inference steps. 
Moreover, we repurpose the PTR and\nthe CLEVER datasets, developing a systematic framework for evaluating the\nperformance and efficiency of LLM-based plan-search algorithms under reasoning\ntasks at different levels of difficulty. Extensive experiments demonstrate the\nsuperiority of our proposed algorithm in terms of performance and efficiency.\nThe dataset and code will be released soon.\n","authors":["Pengbo Hu","Ji Qi","Xingyu Li","Hong Li","Xinqi Wang","Bing Quan","Ruiyu Wang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09658v2.pdf","comment":"16 pages, 1 figure, under review"},{"id":"http://arxiv.org/abs/2308.03253v2","updated":"2023-08-21T02:41:29Z","published":"2023-08-07T02:18:23Z","title":"PaniniQA: Enhancing Patient Education Through Interactive Question\n Answering","summary":" Patient portals allow discharged patients to access their personalized\ndischarge instructions in electronic health records (EHRs). However, many\npatients have difficulty understanding or memorizing their discharge\ninstructions. In this paper, we present PaniniQA, a patient-centric interactive\nquestion answering system designed to help patients understand their discharge\ninstructions. PaniniQA first identifies important clinical content from\npatients' discharge instructions and then formulates patient-specific\neducational questions. In addition, PaniniQA is also equipped with answer\nverification functionality to provide timely feedback to correct patients'\nmisunderstandings. Our comprehensive automatic and human evaluation results\ndemonstrate that our PaniniQA is capable of improving patients' mastery of\ntheir medical instructions through effective interactions.\n","authors":["Pengshan Cai","Zonghai Yao","Fei Liu","Dakuo Wang","Meghan Reilly","Huixue Zhou","Lingxi Li","Yi Cao","Alok Kapoor","Adarsha Bajracharya","Dan Berlowitz","Hong Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03253v2.pdf","comment":"Accepted to TACL 2023. Equal contribution for the first two authors.\n This arXiv version is a pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2308.10410v1","updated":"2023-08-21T01:32:45Z","published":"2023-08-21T01:32:45Z","title":"Large Language Models on Wikipedia-Style Survey Generation: an\n Evaluation in NLP Concepts","summary":" Large Language Models (LLMs) have achieved significant success across various\nnatural language processing (NLP) tasks, encompassing question-answering,\nsummarization, and machine translation, among others. While LLMs excel in\ngeneral tasks, their efficacy in domain-specific applications remains under\nexploration. Additionally, LLM-generated text sometimes exhibits issues like\nhallucination and disinformation. In this study, we assess LLMs' capability of\nproducing concise survey articles within the computer science-NLP domain,\nfocusing on 20 chosen topics. Automated evaluations indicate that GPT-4\noutperforms GPT-3.5 when benchmarked against the ground truth. Furthermore,\nfour human evaluators provide insights from six perspectives across four model\nconfigurations. 
Through case studies, we demonstrate that while GPT often\nyields commendable results, there are instances of shortcomings, such as\nincomplete information and the exhibition of lapses in factual accuracy.\n","authors":["Fan Gao","Hang Jiang","Moritz Blum","Jinghui Lu","Yuang Jiang","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2308.10410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08780v2","updated":"2023-08-21T01:25:30Z","published":"2023-08-17T04:45:19Z","title":"Exploring Demonstration Ensembling for In-context Learning","summary":" In-context learning (ICL) operates by showing language models (LMs) examples\nof input-output pairs for a given task, i.e., demonstrations. The standard\napproach for ICL is to prompt the LM with concatenated demonstrations followed\nby the test input. This approach suffers from some issues. First, concatenation\noffers almost no control over the contribution of each demo to the model\nprediction. This can be sub-optimal when some demonstrations are irrelevant to\nthe test example. Second, due to the input length limit of some transformer\nmodels, it might be infeasible to fit many examples into the context,\nespecially when dealing with long-input tasks. In this work, we explore\nDemonstration Ensembling (DENSE) as an alternative to simple concatenation.\nDENSE predicts outputs using subsets (i.e., buckets) of the demonstrations and\nthen combines the output probabilities resulting from each subset to produce\nthe final prediction. We study different ensembling methods using GPT-j and\nexperiment on 12 language tasks. Our experiments show weighted max ensembling\nto outperform vanilla concatenation by as large as 2.4 average points. Code\navailable at https://github.com/mukhal/icl-ensembling.\n","authors":["Muhammad Khalifa","Lajanugen Logeswaran","Moontae Lee","Honglak Lee","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.08780v2.pdf","comment":"Published at ME-FoMo workshop at ICLR 2023. Arxiv version includes\n evaluation on 5 more tasks"},{"id":"http://arxiv.org/abs/2308.10402v1","updated":"2023-08-21T00:32:19Z","published":"2023-08-21T00:32:19Z","title":"Simple Baselines for Interactive Video Retrieval with Questions and\n Answers","summary":" To date, the majority of video retrieval systems have been optimized for a\n\"single-shot\" scenario in which the user submits a query in isolation, ignoring\nprevious interactions with the system. Recently, there has been renewed\ninterest in interactive systems to enhance retrieval, but existing approaches\nare complex and deliver limited gains in performance. In this work, we revisit\nthis topic and propose several simple yet effective baselines for interactive\nvideo retrieval via question-answering. 
We employ a VideoQA model to simulate\nuser interactions and show that this enables the productive study of the\ninteractive retrieval task without access to ground truth dialogue data.\nExperiments on MSR-VTT, MSVD, and AVSD show that our framework using\nquestion-based interaction significantly improves the performance of text-based\nvideo retrieval systems.\n","authors":["Kaiqu Liang","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2308.10402v1.pdf","comment":"ICCV 2023, project page:\n https://github.com/kevinliang888/IVR-QA-baselines"},{"id":"http://arxiv.org/abs/2308.00946v2","updated":"2023-08-21T00:28:44Z","published":"2023-08-02T05:00:12Z","title":"Teaching Smaller Language Models To Generalise To Unseen Compositional\n Questions","summary":" We equip a smaller Language Model to generalise to answering challenging\ncompositional questions that have not been seen in training. To do so we\npropose a combination of multitask supervised pretraining on up to 93 tasks\ndesigned to instill diverse reasoning abilities, and a dense retrieval system\nthat aims to retrieve a set of evidential paragraph fragments. Recent progress\nin question-answering has been achieved either through prompting methods\nagainst very large pretrained Language Models in zero or few-shot fashion, or\nby fine-tuning smaller models, sometimes in conjunction with information\nretrieval. We focus on the less explored question of the extent to which\nzero-shot generalisation can be enabled in smaller models with retrieval\nagainst a corpus within which sufficient information to answer a particular\nquestion may not exist. We establish strong baselines in this setting for\ndiverse evaluation datasets (StrategyQA, CommonsenseQA, IIRC, DROP, Musique and\nARC-DA), and show that performance can be significantly improved by adding\nretrieval-augmented training datasets which are designed to expose our models\nto a variety of heuristic reasoning strategies such as weighing partial\nevidence or ignoring an irrelevant context.\n","authors":["Tim Hartill","Neset Tan","Michael Witbrock","Patricia J. Riddle"],"pdf_url":"https://arxiv.org/pdf/2308.00946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10397v1","updated":"2023-08-21T00:25:17Z","published":"2023-08-21T00:25:17Z","title":"FairBench: A Four-Stage Automatic Framework for Detecting Stereotypes\n and Biases in Large Language Models","summary":" Detecting stereotypes and biases in Large Language Models (LLMs) can enhance\nfairness and reduce adverse impacts on individuals or groups when these LLMs\nare applied. However, the majority of existing methods focus on measuring the\nmodel's preference towards sentences containing biases and stereotypes within\ndatasets, which lacks interpretability and cannot detect implicit biases and\nstereotypes in the real world. To address this gap, this paper introduces a\nfour-stage framework to directly evaluate stereotypes and biases in the\ngenerated content of LLMs, including direct inquiry testing, serial or adapted\nstory testing, implicit association testing, and unknown situation testing.\nAdditionally, the paper proposes multi-dimensional evaluation metrics and\nexplainable zero-shot prompts for automated evaluation. Using the education\nsector as a case study, we constructed the Edu-FairBench based on the\nfour-stage framework, which encompasses 12,632 open-ended questions covering\nnine sensitive factors and 26 educational scenarios. 
Experimental results\nreveal varying degrees of stereotypes and biases in five LLMs evaluated on\nEdu-FairBench. Moreover, the results of our proposed automated evaluation\nmethod have shown a high correlation with human annotations.\n","authors":["Yanhong Bai","Jiabao Zhao","Jinxin Shi","Tingjiang Wei","Xingjiao Wu","Liang He"],"pdf_url":"https://arxiv.org/pdf/2308.10397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11020v1","updated":"2023-08-21T20:21:07Z","published":"2023-08-21T20:21:07Z","title":"Towards Objective Evaluation of Socially-Situated Conversational Robots:\n Assessing Human-Likeness through Multimodal User Behaviors","summary":" This paper tackles the challenging task of evaluating socially situated\nconversational robots and presents a novel objective evaluation approach that\nrelies on multimodal user behaviors. In this study, our main focus is on\nassessing the human-likeness of the robot as the primary evaluation metric.\nWhile previous research often relied on subjective evaluations from users, our\napproach aims to evaluate the robot's human-likeness based on observable user\nbehaviors indirectly, thus enhancing objectivity and reproducibility. To begin,\nwe created an annotated dataset of human-likeness scores, utilizing user\nbehaviors found in an attentive listening dialogue corpus. We then conducted an\nanalysis to determine the correlation between multimodal user behaviors and\nhuman-likeness scores, demonstrating the feasibility of our proposed\nbehavior-based evaluation method.\n","authors":["Koji Inoue","Divesh Lala","Keiko Ochi","Tatsuya Kawahara","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2308.11020v1.pdf","comment":"Accepted by 25th ACM International Conference on Multimodal\n Interaction (ICMI '23), Late-Breaking Results"},{"id":"http://arxiv.org/abs/2002.08911v2","updated":"2023-08-21T19:59:17Z","published":"2020-02-20T17:54:46Z","title":"Measuring Social Biases in Grounded Vision and Language Embeddings","summary":" We generalize the notion of social biases from language embeddings to\ngrounded vision and language embeddings. Biases are present in grounded\nembeddings, and indeed seem to be equally or more significant than for\nungrounded embeddings. This is despite the fact that vision and language can\nsuffer from different biases, which one might hope could attenuate the biases\nin both. Multiple ways exist to generalize metrics measuring bias in word\nembeddings to this new setting. We introduce the space of generalizations\n(Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations\nanswer different yet important questions about how biases, language, and vision\ninteract. These metrics are used on a new dataset, the first for grounded bias,\ncreated by augmenting extending standard linguistic bias benchmarks with 10,228\nimages from COCO, Conceptual Captions, and Google Images. Dataset construction\nis challenging because vision datasets are themselves very biased. The presence\nof these biases in systems will begin to have real-world consequences as they\nare deployed, making carefully measuring bias and then mitigating it critical\nto building a fair society.\n","authors":["Candace Ross","Boris Katz","Andrei Barbu"],"pdf_url":"https://arxiv.org/pdf/2002.08911v2.pdf","comment":"Camera-ready from NAACL 2021. 
Previous arXiv version was from before\n conference and was not the most recent version"},{"id":"http://arxiv.org/abs/2308.11006v1","updated":"2023-08-21T19:45:48Z","published":"2023-08-21T19:45:48Z","title":"Using language models in the implicit automated assessment of\n mathematical short answer items","summary":" We propose a new way to assess certain short constructed responses to\nmathematics items. Our approach uses a pipeline that identifies the key values\nspecified by the student in their response. This allows us to determine the\ncorrectness of the response, as well as identify any misconceptions. The\ninformation from the value identification pipeline can then be used to provide\nfeedback to the teacher and student. The value identification pipeline consists\nof two fine-tuned language models. The first model determines if a value is\nimplicit in the student response. The second model identifies where in the\nresponse the key value is specified. We consider both a generic model that can\nbe used for any prompt and value, as well as models that are specific to each\nprompt and value. The value identification pipeline is a more accurate and\ninformative way to assess short constructed responses than traditional\nrubric-based scoring. It can be used to provide more targeted feedback to\nstudents, which can help them improve their understanding of mathematics.\n","authors":["Christopher Ormerod"],"pdf_url":"https://arxiv.org/pdf/2308.11006v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.03212v2","updated":"2023-08-21T18:54:56Z","published":"2023-08-06T21:23:22Z","title":"Average-Hard Attention Transformers are Constant-Depth Uniform Threshold\n Circuits","summary":" Transformers have emerged as a widely used neural network model for various\nnatural language processing tasks. Previous research explored their\nrelationship with constant-depth threshold circuits, making two assumptions:\naverage-hard attention and logarithmic precision for internal computations\nrelative to input length. Merrill et al. (2022) prove that average-hard\nattention transformers recognize languages that fall within the complexity\nclass TC0, denoting the set of languages that can be recognized by\nconstant-depth polynomial-size threshold circuits. Likewise, Merrill and\nSabharwal (2023) show that log-precision transformers recognize languages\nwithin the class of uniform TC0. This shows that both transformer models can be\nsimulated by constant-depth threshold circuits, with the latter being more\nrobust due to generating a uniform circuit family. Our paper shows that the\nfirst result can be extended to yield uniform circuits as well.\n","authors":["Lena Strobl"],"pdf_url":"https://arxiv.org/pdf/2308.03212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10959v1","updated":"2023-08-21T18:14:00Z","published":"2023-08-21T18:14:00Z","title":"DocPrompt: Large-scale continue pretrain for zero-shot and few-shot\n document question answering","summary":" In this paper, we propose Docprompt for document question answering tasks\nwith powerful zero-shot and few-shot performance. We proposed a novel weakly\nsupervised data generation method, a novel multl-stage training method and a\nnovel understanding model & generation model ensemble method. Experiment\nresults show that the Docprompt model after continue pretrain significantly\noutperforms the existing strong baseline models on document question answering\ntasks. 
This method greatly improves the delivery efficiency and model\nperformance of document question answering customer projects, reducing\nannotation costs and labor costs. Our demo can be found at\nhttps://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout.\n","authors":["Sijin Wu","Dan Zhang","Teng Hu","Shikun Feng"],"pdf_url":"https://arxiv.org/pdf/2308.10959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10502v2","updated":"2023-08-21T18:01:57Z","published":"2022-12-20T18:17:11Z","title":"A Measure-Theoretic Characterization of Tight Language Models","summary":" Language modeling, a central task in natural language processing, involves\nestimating a probability distribution over strings. In most cases, the\nestimated distribution sums to 1 over all finite strings. However, in some\npathological cases, probability mass can ``leak'' onto the set of infinite\nsequences. In order to characterize the notion of leakage more precisely, this\npaper offers a measure-theoretic treatment of language modeling. We prove that\nmany popular language model families are in fact tight, meaning that they will\nnot leak in this sense. We also generalize characterizations of tightness\nproposed in previous works.\n","authors":["Li Du","Lucas Torroba Hennigen","Tiago Pimentel","Clara Meister","Jason Eisner","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2212.10502v2.pdf","comment":"25 pages; ACL 2023 camera ready"},{"id":"http://arxiv.org/abs/2308.11578v1","updated":"2023-08-21T13:14:32Z","published":"2023-08-21T13:14:32Z","title":"Refashioning Emotion Recognition Modelling: The Advent of Generalised\n Large Models","summary":" After the inception of emotion recognition or affective computing, it has\nincreasingly become an active research topic due to its broad applications.\nOver the past couple of decades, emotion recognition models have gradually\nmigrated from statistically shallow models to neural network-based deep models,\nwhich can significantly boost the performance of emotion recognition models and\nconsistently achieve the best results on different benchmarks. Therefore, in\nrecent years, deep models have always been considered the first option for\nemotion recognition. However, the debut of large language models (LLMs), such\nas ChatGPT, has remarkably astonished the world due to their emerged\ncapabilities of zero/few-shot learning, in-context learning, chain-of-thought,\nand others that are never shown in previous deep models. In the present paper,\nwe comprehensively investigate how the LLMs perform in emotion recognition in\nterms of diverse aspects, including in-context learning, few-short learning,\naccuracy, generalisation, and explanation. Moreover, we offer some insights and\npose other potential challenges, hoping to ignite broader discussions about\nenhancing emotion recognition in the new era of advanced and generalised large\nmodels.\n","authors":["Zixing Zhang","Liyizhe Peng","Tao Pang","Jing Han","Huan Zhao","Bjorn W. Schuller"],"pdf_url":"https://arxiv.org/pdf/2308.11578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11534v1","updated":"2023-08-21T06:51:56Z","published":"2023-08-21T06:51:56Z","title":"Large Language Model as a User Simulator","summary":" The unparalleled performance of closed-sourced ChatGPT has sparked efforts\ntowards its democratization, with notable strides made by leveraging real user\nand ChatGPT conversations, as evidenced by Vicuna. 
However, while current\nendeavors like Baize and UltraChat aim to auto-generate conversational data due\nto challenges in gathering human participation, they primarily rely on ChatGPT\nto simulate human behaviors based on directives rather than genuine human\nlearning. This results in a limited scope, diminished diversity, and an absence\nof genuine multi-round conversational dynamics. To address the above issues, we\ninnovatively target human questions extracted from genuine human-machine\nconversations as a learning goal and train a user simulator, UserGPT, to\nproduce a high-quality human-centric synthetic conversation dataset, RealChat.\nSubsequently, this dataset trains our assistant model, ReaLM. Experimentally,\nReaLM outpaces baseline models in both Vicuna-Bench and MT-Bench by pairwise\ncomparison when considering equivalent training set sizes, and manual\nevaluation also shows that our model is highly competitive. Impressively, when\nfine-tuned with the latest LLaMA 2 model, ReaLM secured a leading score of 6.33\nin the MT-Bench, outshining the contemporary same-scale models, including the\nLLaMA-2-7B-chat model. Further in-depth analysis demonstrates the scalability\nand transferability of our approach. A preliminary exploration into the\ninterplay between training set data quality and resultant model performance is\nalso undertaken, laying a robust groundwork for future investigations.\n","authors":["Chuyi Kong","Yaxin Fan","Xiang Wan","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11534v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.10902v1","updated":"2023-08-21T17:59:54Z","published":"2023-08-21T17:59:54Z","title":"CamP: Camera Preconditioning for Neural Radiance Fields","summary":" Neural Radiance Fields (NeRF) can be optimized to obtain high-fidelity 3D\nscene reconstructions of objects and large-scale scenes. However, NeRFs require\naccurate camera parameters as input -- inaccurate camera parameters result in\nblurry renderings. Extrinsic and intrinsic camera parameters are usually\nestimated using Structure-from-Motion (SfM) methods as a pre-processing step to\nNeRF, but these techniques rarely yield perfect estimates. Thus, prior works\nhave proposed jointly optimizing camera parameters alongside a NeRF, but these\nmethods are prone to local minima in challenging settings. In this work, we\nanalyze how different camera parameterizations affect this joint optimization\nproblem, and observe that standard parameterizations exhibit large differences\nin magnitude with respect to small perturbations, which can lead to an\nill-conditioned optimization problem. We propose using a proxy problem to\ncompute a whitening transform that eliminates the correlation between camera\nparameters and normalizes their effects, and we propose to use this transform\nas a preconditioner for the camera parameters during joint optimization. Our\npreconditioned camera optimization significantly improves reconstruction\nquality on scenes from the Mip-NeRF 360 dataset: we reduce error rates (RMSE)\nby 67% compared to state-of-the-art NeRF approaches that do not optimize for\ncameras like Zip-NeRF, and by 29% relative to state-of-the-art joint\noptimization approaches using the camera parameterization of SCNeRF. 
Our\napproach is easy to implement, does not significantly increase runtime, can be\napplied to a wide variety of camera parameterizations, and can\nstraightforwardly be incorporated into other NeRF-like models.\n","authors":["Keunhong Park","Philipp Henzler","Ben Mildenhall","Jonathan T. Barron","Ricardo Martin-Brualla"],"pdf_url":"https://arxiv.org/pdf/2308.10902v1.pdf","comment":"SIGGRAPH Asia 2023, Project page: https://camp-nerf.github.io"},{"id":"http://arxiv.org/abs/2308.10901v1","updated":"2023-08-21T17:59:32Z","published":"2023-08-21T17:59:32Z","title":"Structured World Models from Human Videos","summary":" We tackle the problem of learning complex, general behaviors directly in the\nreal world. We propose an approach for robots to efficiently learn manipulation\nskills using only a handful of real-world interaction trajectories from many\ndifferent settings. Inspired by the success of learning from large-scale\ndatasets in the fields of computer vision and natural language, our belief is\nthat in order to efficiently learn, a robot must be able to leverage\ninternet-scale, human video data. Humans interact with the world in many\ninteresting ways, which can allow a robot to not only build an understanding of\nuseful actions and affordances but also how these actions affect the world for\nmanipulation. Our approach builds a structured, human-centric action space\ngrounded in visual affordances learned from human videos. Further, we train a\nworld model on human videos and fine-tune on a small amount of robot\ninteraction data without any task supervision. We show that this approach of\naffordance-space world models enables different robots to learn various\nmanipulation skills in complex settings, in under 30 minutes of interaction.\nVideos can be found at https://human-world-model.github.io\n","authors":["Russell Mendonca","Shikhar Bahl","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2308.10901v1.pdf","comment":"RSS 2023. Website at https://human-world-model.github.io"},{"id":"http://arxiv.org/abs/2308.10898v1","updated":"2023-08-21T17:59:07Z","published":"2023-08-21T17:59:07Z","title":"Few-Shot Physically-Aware Articulated Mesh Generation via Hierarchical\n Deformation","summary":" We study the problem of few-shot physically-aware articulated mesh\ngeneration. By observing an articulated object dataset containing only a few\nexamples, we wish to learn a model that can generate diverse meshes with high\nvisual fidelity and physical validity. Previous mesh generative models either\nhave difficulties in depicting a diverse data space from only a few examples or\nfail to ensure physical validity of their samples. Regarding the above\nchallenges, we propose two key innovations, including 1) a hierarchical mesh\ndeformation-based generative model based upon the divide-and-conquer philosophy\nto alleviate the few-shot challenge by borrowing transferrable deformation\npatterns from large scale rigid meshes and 2) a physics-aware deformation\ncorrection scheme to encourage physically plausible generations. We conduct\nextensive experiments on 6 articulated categories to demonstrate the\nsuperiority of our method in generating articulated meshes with better\ndiversity, higher visual fidelity, and better physical validity over previous\nmethods in the few-shot setting. Further, we validate solid contributions of\nour two innovations in the ablation study. 
Project page with code is available\nat https://meowuu7.github.io/few-arti-obj-gen.\n","authors":["Xueyi Liu","Bin Wang","He Wang","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2308.10898v1.pdf","comment":"ICCV 2023. Project Page: https://meowuu7.github.io/few-arti-obj-gen"},{"id":"http://arxiv.org/abs/2308.10897v1","updated":"2023-08-21T17:59:02Z","published":"2023-08-21T17:59:02Z","title":"Can Language Models Learn to Listen?","summary":" We present a framework for generating appropriate facial responses from a\nlistener in dyadic social interactions based on the speaker's words. Given an\ninput transcription of the speaker's words with their timestamps, our approach\nautoregressively predicts a response of a listener: a sequence of listener\nfacial gestures, quantized using a VQ-VAE. Since gesture is a language\ncomponent, we propose treating the quantized atomic motion elements as\nadditional language token inputs to a transformer-based large language model.\nInitializing our transformer with the weights of a language model pre-trained\nonly on text results in significantly higher quality listener responses than\ntraining a transformer from scratch. We show that our generated listener motion\nis fluent and reflective of language semantics through quantitative metrics and\na qualitative user study. In our evaluation, we analyze the model's ability to\nutilize temporal and semantic aspects of spoken text. Project page:\nhttps://people.eecs.berkeley.edu/~evonne_ng/projects/text2listen/\n","authors":["Evonne Ng","Sanjay Subramanian","Dan Klein","Angjoo Kanazawa","Trevor Darrell","Shiry Ginosar"],"pdf_url":"https://arxiv.org/pdf/2308.10897v1.pdf","comment":"ICCV 2023; Project page:\n https://people.eecs.berkeley.edu/~evonne_ng/projects/text2listen/"},{"id":"http://arxiv.org/abs/2308.10896v1","updated":"2023-08-21T17:58:43Z","published":"2023-08-21T17:58:43Z","title":"Differentiable Shadow Mapping for Efficient Inverse Graphics","summary":" We show how shadows can be efficiently generated in differentiable rendering\nof triangle meshes. Our central observation is that pre-filtered shadow\nmapping, a technique for approximating shadows based on rendering from the\nperspective of a light, can be combined with existing differentiable\nrasterizers to yield differentiable visibility information. We demonstrate at\nseveral inverse graphics problems that differentiable shadow maps are orders of\nmagnitude faster than differentiable light transport simulation with similar\naccuracy -- while differentiable rasterization without shadows often fails to\nconverge.\n","authors":["Markus Worchel","Marc Alexa"],"pdf_url":"https://arxiv.org/pdf/2308.10896v1.pdf","comment":"CVPR 2023, project page:\n https://mworchel.github.io/differentiable-shadow-mapping"},{"id":"http://arxiv.org/abs/2307.13901v2","updated":"2023-08-21T17:55:07Z","published":"2023-07-26T01:51:10Z","title":"YOLOBench: Benchmarking Efficient Object Detectors on Embedded Systems","summary":" We present YOLOBench, a benchmark comprised of 550+ YOLO-based object\ndetection models on 4 different datasets and 4 different embedded hardware\nplatforms (x86 CPU, ARM CPU, Nvidia GPU, NPU). 
We collect accuracy and latency\nnumbers for a variety of YOLO-based one-stage detectors at different model\nscales by performing a fair, controlled comparison of these detectors with a\nfixed training environment (code and training hyperparameters).\nPareto-optimality analysis of the collected data reveals that, if modern\ndetection heads and training techniques are incorporated into the learning\nprocess, multiple architectures of the YOLO series achieve a good\naccuracy-latency trade-off, including older models like YOLOv3 and YOLOv4. We\nalso evaluate training-free accuracy estimators used in neural architecture\nsearch on YOLOBench and demonstrate that, while most state-of-the-art zero-cost\naccuracy estimators are outperformed by a simple baseline like MAC count, some\nof them can be effectively used to predict Pareto-optimal detection models. We\nshowcase that by using a zero-cost proxy to identify a YOLO architecture\ncompetitive against a state-of-the-art YOLOv8 model on a Raspberry Pi 4 CPU.\nThe code and data are available at\nhttps://github.com/Deeplite/deeplite-torch-zoo\n","authors":["Ivan Lazarevich","Matteo Grimaldi","Ravish Kumar","Saptarshi Mitra","Shahrukh Khan","Sudhakar Sah"],"pdf_url":"https://arxiv.org/pdf/2307.13901v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05807v2","updated":"2023-08-21T17:45:29Z","published":"2023-06-09T10:44:44Z","title":"A Gated Attention Transformer for Multi-Person Pose Tracking","summary":" Multi-person pose tracking is an important element for many applications and\nrequires to estimate the human poses of all persons in a video and to track\nthem over time. The association of poses across frames remains an open research\nproblem, in particular for online tracking methods, due to motion blur, crowded\nscenes and occlusions. To tackle the association challenge, we propose a Gated\nAttention Transformer. The core aspect of our model is the gating mechanism\nthat automatically adapts the impact of appearance embeddings and embeddings\nbased on temporal pose similarity in the attention layers. In order to\nre-identify persons that have been occluded, we incorporate a pose-conditioned\nre-identification network that provides initial embeddings and allows to match\npersons even if the number of visible joints differ between frames. We further\npropose a matching layer based on gated attention for pose-to-track association\nand duplicate removal. We evaluate our approach on PoseTrack 2018 and\nPoseTrack21.\n","authors":["Andreas Doering","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2306.05807v2.pdf","comment":"Accepted to ICCVW23"},{"id":"http://arxiv.org/abs/2308.10888v1","updated":"2023-08-21T17:42:33Z","published":"2023-08-21T17:42:33Z","title":"Unlocking Accuracy and Fairness in Differentially Private Image\n Classification","summary":" Privacy-preserving machine learning aims to train models on private data\nwithout leaking sensitive information. Differential privacy (DP) is considered\nthe gold standard framework for privacy-preserving training, as it provides\nformal privacy guarantees. However, compared to their non-private counterparts,\nmodels trained with DP often have significantly reduced accuracy. Private\nclassifiers are also believed to exhibit larger performance disparities across\nsubpopulations, raising fairness concerns. The poor performance of classifiers\ntrained with DP has prevented the widespread adoption of privacy preserving\nmachine learning in industry. 
Here we show that pre-trained foundation models\nfine-tuned with DP can achieve similar accuracy to non-private classifiers,\neven in the presence of significant distribution shifts between pre-training\ndata and downstream tasks. We achieve private accuracies within a few percent\nof the non-private state of the art across four datasets, including two medical\nimaging benchmarks. Furthermore, our private medical classifiers do not exhibit\nlarger performance disparities across demographic groups than non-private\nmodels. This milestone to make DP training a practical and reliable technology\nhas the potential to widely enable machine learning practitioners to train\nsafely on sensitive datasets while protecting individuals' privacy.\n","authors":["Leonard Berrada","Soham De","Judy Hanwen Shen","Jamie Hayes","Robert Stanforth","David Stutz","Pushmeet Kohli","Samuel L. Smith","Borja Balle"],"pdf_url":"https://arxiv.org/pdf/2308.10888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11825v2","updated":"2023-08-21T17:11:50Z","published":"2022-11-21T19:47:35Z","title":"Multi-Directional Subspace Editing in Style-Space","summary":" This paper describes a new technique for finding disentangled semantic\ndirections in the latent space of StyleGAN. Our method identifies meaningful\northogonal subspaces that allow editing of one human face attribute, while\nminimizing undesired changes in other attributes. Our model is capable of\nediting a single attribute in multiple directions, resulting in a range of\npossible generated images. We compare our scheme with three state-of-the-art\nmodels and show that our method outperforms them in terms of face editing and\ndisentanglement capabilities. Additionally, we suggest quantitative measures\nfor evaluating attribute separation and disentanglement, and exhibit the\nsuperiority of our model with respect to those measures.\n","authors":["Chen Naveh","Yacov Hel-Or"],"pdf_url":"https://arxiv.org/pdf/2211.11825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10839v1","updated":"2023-08-21T16:40:51Z","published":"2023-08-21T16:40:51Z","title":"Vision Transformer Pruning Via Matrix Decomposition","summary":" This is a further development of Vision Transformer Pruning via matrix\ndecomposition. The purpose of the Vision Transformer Pruning is to prune the\ndimension of the linear projection of the dataset by learning their associated\nimportance score in order to reduce the storage, run-time memory, and\ncomputational demands. In this paper we further reduce dimension and complexity\nof the linear projection by implementing and comparing several matrix\ndecomposition methods while preserving the generated important features. We end\nup selected the Singular Value Decomposition as the method to achieve our goal\nby comparing the original accuracy scores in the original Github repository and\nthe accuracy scores of using those matrix decomposition methods, including\nSingular Value Decomposition, four versions of QR Decomposition, and LU\nfactorization.\n","authors":["Tianyi Sun"],"pdf_url":"https://arxiv.org/pdf/2308.10839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10832v1","updated":"2023-08-21T16:27:31Z","published":"2023-08-21T16:27:31Z","title":"EigenPlaces: Training Viewpoint Robust Models for Visual Place\n Recognition","summary":" Visual Place Recognition is a task that aims to predict the place of an image\n(called query) based solely on its visual features. 
This is typically done\nthrough image retrieval, where the query is matched to the most similar images\nfrom a large database of geotagged photos, using learned global descriptors. A\nmajor challenge in this task is recognizing places seen from different\nviewpoints. To overcome this limitation, we propose a new method, called\nEigenPlaces, to train our neural network on images from different point of\nviews, which embeds viewpoint robustness into the learned global descriptors.\nThe underlying idea is to cluster the training data so as to explicitly present\nthe model with different views of the same points of interest. The selection of\nthis points of interest is done without the need for extra supervision. We then\npresent experiments on the most comprehensive set of datasets in literature,\nfinding that EigenPlaces is able to outperform previous state of the art on the\nmajority of datasets, while requiring 60\\% less GPU memory for training and\nusing 50\\% smaller descriptors. The code and trained models for EigenPlaces are\navailable at {\\small{\\url{https://github.com/gmberton/EigenPlaces}}}, while\nresults with any other baseline can be computed with the codebase at\n{\\small{\\url{https://github.com/gmberton/auto_VPR}}}.\n","authors":["Gabriele Berton","Gabriele Trivigno","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2308.10832v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12463v2","updated":"2023-08-21T16:16:59Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT) which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10820v1","updated":"2023-08-21T16:12:31Z","published":"2023-08-21T16:12:31Z","title":"Pixel Adaptive Deep Unfolding Transformer for Hyperspectral Image\n Reconstruction","summary":" Hyperspectral Image (HSI) reconstruction has made gratifying progress with\nthe deep unfolding framework by formulating the problem into a data module and\na prior module. Nevertheless, existing methods still face the problem of\ninsufficient matching with HSI data. The issues lie in three aspects: 1) fixed\ngradient descent step in the data module while the degradation of HSI is\nagnostic in the pixel-level. 2) inadequate prior module for 3D HSI cube. 3)\nstage interaction ignoring the differences in features at different stages. To\naddress these issues, in this work, we propose a Pixel Adaptive Deep Unfolding\nTransformer (PADUT) for HSI reconstruction. 
In the data module, a pixel\nadaptive descent step is employed to focus on pixel-level agnostic degradation.\nIn the prior module, we introduce the Non-local Spectral Transformer (NST) to\nemphasize the 3D characteristics of HSI for recovering. Moreover, inspired by\nthe diverse expression of features in different stages and depths, the stage\ninteraction is improved by the Fast Fourier Transform (FFT). Experimental\nresults on both simulated and real scenes exhibit the superior performance of\nour method compared to state-of-the-art HSI reconstruction methods. The code is\nreleased at: https://github.com/MyuLi/PADUT.\n","authors":["Miaoyu Li","Ying Fu","Ji Liu","Yulun Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10820v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10814v1","updated":"2023-08-21T16:03:35Z","published":"2023-08-21T16:03:35Z","title":"Jumping through Local Minima: Quantization in the Loss Landscape of\n Vision Transformers","summary":" Quantization scale and bit-width are the most important parameters when\nconsidering how to quantize a neural network. Prior work focuses on optimizing\nquantization scales in a global manner through gradient methods (gradient\ndescent \\& Hessian analysis). Yet, when applying perturbations to quantization\nscales, we observe a very jagged, highly non-smooth test loss landscape. In\nfact, small perturbations in quantization scale can greatly affect accuracy,\nyielding a $0.5-0.8\\%$ accuracy boost in 4-bit quantized vision transformers\n(ViTs). In this regime, gradient methods break down, since they cannot reliably\nreach local minima. In our work, dubbed Evol-Q, we use evolutionary search to\neffectively traverse the non-smooth landscape. Additionally, we propose using\nan infoNCE loss, which not only helps combat overfitting on the small\ncalibration dataset ($1,000$ images) but also makes traversing such a highly\nnon-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully\nquantized ViT-Base by $10.30\\%$, $0.78\\%$, and $0.15\\%$ for $3$-bit, $4$-bit,\nand $8$-bit weight quantization levels. Extensive experiments on a variety of\nCNN and ViT architectures further demonstrate its robustness in extreme\nquantization scenarios. Our code is available at\nhttps://github.com/enyac-group/evol-q\n","authors":["Natalia Frumkin","Dibakar Gope","Diana Marculescu"],"pdf_url":"https://arxiv.org/pdf/2308.10814v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2211.09643"},{"id":"http://arxiv.org/abs/2308.10809v1","updated":"2023-08-21T15:58:47Z","published":"2023-08-21T15:58:47Z","title":"Improving Continuous Sign Language Recognition with Cross-Lingual Signs","summary":" This work dedicates to continuous sign language recognition (CSLR), which is\na weakly supervised task dealing with the recognition of continuous signs from\nvideos, without any prior knowledge about the temporal boundaries between\nconsecutive signs. Data scarcity heavily impedes the progress of CSLR. Existing\napproaches typically train CSLR models on a monolingual corpus, which is orders\nof magnitude smaller than that of speech recognition. In this work, we explore\nthe feasibility of utilizing multilingual sign language corpora to facilitate\nmonolingual CSLR. Our work is built upon the observation of cross-lingual\nsigns, which originate from different sign languages but have similar visual\nsignals (e.g., hand shape and motion). 
The underlying idea of our approach is\nto identify the cross-lingual signs in one sign language and properly leverage\nthem as auxiliary training data to improve the recognition capability of\nanother. To achieve the goal, we first build two sign language dictionaries\ncontaining isolated signs that appear in two datasets. Then we identify the\nsign-to-sign mappings between two sign languages via a well-optimized isolated\nsign language recognition model. At last, we train a CSLR model on the\ncombination of the target data with original labels and the auxiliary data with\nmapped labels. Experimentally, our approach achieves state-of-the-art\nperformance on two widely-used CSLR datasets: Phoenix-2014 and Phoenix-2014T.\n","authors":["Fangyun Wei","Yutong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10809v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10794v1","updated":"2023-08-21T15:39:41Z","published":"2023-08-21T15:39:41Z","title":"MGMAE: Motion Guided Masking for Video Masked Autoencoding","summary":" Masked autoencoding has shown excellent performance on self-supervised video\nrepresentation learning. Temporal redundancy has led to a high masking ratio\nand customized masking strategy in VideoMAE. In this paper, we aim to further\nimprove the performance of video masked autoencoding by introducing a motion\nguided masking strategy. Our key insight is that motion is a general and unique\nprior in video, which should be taken into account during masked pre-training.\nOur motion guided masking explicitly incorporates motion information to build\ntemporal consistent masking volume. Based on this masking volume, we can track\nthe unmasked tokens in time and sample a set of temporal consistent cubes from\nvideos. These temporal aligned unmasked tokens will further relieve the\ninformation leakage issue in time and encourage the MGMAE to learn more useful\nstructure information. We implement our MGMAE with an online efficient optical\nflow estimator and backward masking map warping strategy. We perform\nexperiments on the datasets of Something-Something V2 and Kinetics-400,\ndemonstrating the superior performance of our MGMAE to the original VideoMAE.\nIn addition, we provide the visualization analysis to illustrate that our MGMAE\ncan sample temporal consistent cubes in a motion-adaptive manner for more\neffective video pre-training.\n","authors":["Bingkun Huang","Zhiyu Zhao","Guozhen Zhang","Yu Qiao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10794v1.pdf","comment":"ICCV 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2308.10790v1","updated":"2023-08-21T15:34:32Z","published":"2023-08-21T15:34:32Z","title":"Extraction of Text from Optic Nerve Optical Coherence Tomography Reports","summary":" Purpose: The purpose of this study was to develop and evaluate rule-based\nalgorithms to enhance the extraction of text data, including retinal nerve\nfiber layer (RNFL) values and other ganglion cell count (GCC) data, from Zeiss\nCirrus optical coherence tomography (OCT) scan reports. Methods: DICOM files\nthat contained encapsulated PDF reports with RNFL or Ganglion Cell in their\ndocument titles were identified from a clinical imaging repository at a single\nacademic ophthalmic center. PDF reports were then converted into image files\nand processed using the PaddleOCR Python package for optical character\nrecognition. Rule-based algorithms were designed and iteratively optimized for\nimproved performance in extracting RNFL and GCC data. 
Evaluation of the\nalgorithms was conducted through manual review of a set of RNFL and GCC\nreports. Results: The developed algorithms demonstrated high precision in\nextracting data from both RNFL and GCC scans. Precision was slightly better for\nthe right eye in RNFL extraction (OD: 0.9803 vs. OS: 0.9046), and for the left\neye in GCC extraction (OD: 0.9567 vs. OS: 0.9677). Some values presented more\nchallenges in extraction, particularly clock hours 5 and 6 for RNFL thickness,\nand signal strength for GCC. Conclusions: A customized optical character\nrecognition algorithm can identify numeric results from optical coherence scan\nreports with high precision. Automated processing of PDF reports can greatly\nreduce the time to extract OCT results on a large scale.\n","authors":["Iyad Majid","Youchen Victor Zhang","Robert Chang","Sophia Y. Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06671v2","updated":"2023-08-21T15:29:17Z","published":"2023-05-11T09:10:21Z","title":"WeditGAN: Few-shot Image Generation via Latent Space Relocation","summary":" In few-shot image generation, directly training GAN models on just a handful\nof images faces the risk of overfitting. A popular solution is to transfer the\nmodels pretrained on large source domains to small target ones. In this work,\nwe introduce WeditGAN, which realizes model transfer by editing the\nintermediate latent codes $w$ in StyleGANs with learned constant offsets\n($\\Delta w$), discovering and constructing target latent spaces via simply\nrelocating the distribution of source latent spaces. The established one-to-one\nmapping between latent spaces can naturally prevents mode collapse and\noverfitting. Besides, we also propose variants of WeditGAN to further enhance\nthe relocation process by regularizing the direction or finetuning the\nintensity of $\\Delta w$. Experiments on a collection of widely used\nsource/target datasets manifest the capability of WeditGAN in generating\nrealistic and diverse images, which is simple yet highly effective in the\nresearch area of few-shot image generation.\n","authors":["Yuxuan Duan","Li Niu","Yan Hong","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.06671v2.pdf","comment":"under review, see supplementary material for updates of this version"},{"id":"http://arxiv.org/abs/2012.04841v4","updated":"2023-08-21T15:23:10Z","published":"2020-12-09T03:20:06Z","title":"One-Vote Veto: Semi-Supervised Learning for Low-Shot Glaucoma Diagnosis","summary":" Convolutional neural networks (CNNs) are a promising technique for automated\nglaucoma diagnosis from images of the fundus, and these images are routinely\nacquired as part of an ophthalmic exam. Nevertheless, CNNs typically require a\nlarge amount of well-labeled data for training, which may not be available in\nmany biomedical image classification applications, especially when diseases are\nrare and where labeling by experts is costly. This article makes two\ncontributions to address this issue: (1) It extends the conventional Siamese\nnetwork and introduces a training method for low-shot learning when labeled\ndata are limited and imbalanced, and (2) it introduces a novel semi-supervised\nlearning strategy that uses additional unlabeled training data to achieve\ngreater accuracy. 
Our proposed multi-task Siamese network (MTSN) can employ any\nbackbone CNN, and we demonstrate with four backbone CNNs that its accuracy with\nlimited training data approaches the accuracy of backbone CNNs trained with a\ndataset that is 50 times larger. We also introduce One-Vote Veto (OVV)\nself-training, a semi-supervised learning strategy that is designed\nspecifically for MTSNs. By taking both self-predictions and contrastive\npredictions of the unlabeled training data into account, OVV self-training\nprovides additional pseudo labels for fine-tuning a pre-trained MTSN. Using a\nlarge (imbalanced) dataset with 66,715 fundus photographs acquired over 15\nyears, extensive experimental results demonstrate the effectiveness of low-shot\nlearning with MTSN and semi-supervised learning with OVV self-training. Three\nadditional, smaller clinical datasets of fundus images acquired under different\nconditions (cameras, instruments, locations, populations) are used to\ndemonstrate the generalizability of the proposed methods.\n","authors":["Rui Fan","Christopher Bowd","Nicole Brye","Mark Christopher","Robert N. Weinreb","David Kriegman","Linda M. Zangwill"],"pdf_url":"https://arxiv.org/pdf/2012.04841v4.pdf","comment":"accepted by IEEE Transactions on Medical Imaging (T-MI). DOI:\n 10.1109/TMI.2023.3307689"},{"id":"http://arxiv.org/abs/2308.10784v1","updated":"2023-08-21T15:19:32Z","published":"2023-08-21T15:19:32Z","title":"Dense Error Map Estimation for MRI-Ultrasound Registration in Brain\n Tumor Surgery Using Swin UNETR","summary":" Early surgical treatment of brain tumors is crucial in reducing patient\nmortality rates. However, brain tissue deformation (called brain shift) occurs\nduring the surgery, rendering pre-operative images invalid. As a cost-effective\nand portable tool, intra-operative ultrasound (iUS) can track brain shift, and\naccurate MRI-iUS registration techniques can update pre-surgical plans and\nfacilitate the interpretation of iUS. This can boost surgical safety and\noutcomes by maximizing tumor removal while avoiding eloquent regions. However,\nmanual assessment of MRI-iUS registration results in real-time is difficult and\nprone to errors due to the 3D nature of the data. Automatic algorithms that can\nquantify the quality of inter-modal medical image registration outcomes can be\nhighly beneficial. Therefore, we propose a novel deep-learning (DL) based\nframework with the Swin UNETR to automatically assess 3D-patch-wise dense error\nmaps for MRI-iUS registration in iUS-guided brain tumor resection and show its\nperformance with real clinical data for the first time.\n","authors":["Soorena Salari","Amirhossein Rasoulian","Hassan Rivaz","Yiming Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.10784v1.pdf","comment":"Accepted in IEEE IUS 2023"},{"id":"http://arxiv.org/abs/2211.15660v3","updated":"2023-08-21T15:09:13Z","published":"2022-11-28T18:59:26Z","title":"SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image\n Understanding","summary":" Remote sensing images are useful for a wide variety of planet monitoring\napplications, from tracking deforestation to tackling illegal fishing. The\nEarth is extremely diverse -- the amount of potential tasks in remote sensing\nimages is massive, and the sizes of features range from several kilometers to\njust tens of centimeters. However, creating generalizable computer vision\nmethods is a challenge in part due to the lack of a large-scale dataset that\ncaptures these diverse features for many tasks. 
In this paper, we present\nSatlasPretrain, a remote sensing dataset that is large in both breadth and\nscale, combining Sentinel-2 and NAIP images with 302M labels under 137\ncategories and seven label types. We evaluate eight baselines and a proposed\nmethod on SatlasPretrain, and find that there is substantial room for\nimprovement in addressing research challenges specific to remote sensing,\nincluding processing image time series that consist of images from very\ndifferent types of sensors, and taking advantage of long-range spatial context.\nMoreover, we find that pre-training on SatlasPretrain substantially improves\nperformance on downstream tasks, increasing average accuracy by 18% over\nImageNet and 6% over the next best baseline. The dataset, pre-trained model\nweights, and code are available at https://satlas-pretrain.allen.ai/.\n","authors":["Favyen Bastani","Piper Wolters","Ritwik Gupta","Joe Ferdinando","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2211.15660v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2303.11329v2","updated":"2023-08-21T14:59:10Z","published":"2023-03-20T17:59:55Z","title":"Sound Localization from Motion: Jointly Learning Sound Direction and\n Camera Rotation","summary":" The images and sounds that we perceive undergo subtle but geometrically\nconsistent changes as we rotate our heads. In this paper, we use these cues to\nsolve a problem we call Sound Localization from Motion (SLfM): jointly\nestimating camera rotation and localizing sound sources. We learn to solve\nthese tasks solely through self-supervision. A visual model predicts camera\nrotation from a pair of images, while an audio model predicts the direction of\nsound sources from binaural sounds. We train these models to generate\npredictions that agree with one another. At test time, the models can be\ndeployed independently. To obtain a feature representation that is well-suited\nto solving this challenging problem, we also propose a method for learning an\naudio-visual representation through cross-view binauralization: estimating\nbinaural sound from one view, given images and sound from another. Our model\ncan successfully estimate accurate rotations on both real and synthetic scenes,\nand localize sound sources with accuracy competitive with state-of-the-art\nself-supervised approaches. Project site: https://ificl.github.io/SLfM/\n","authors":["Ziyang Chen","Shengyi Qian","Andrew Owens"],"pdf_url":"https://arxiv.org/pdf/2303.11329v2.pdf","comment":"ICCV 2023. Project site: https://ificl.github.io/SLfM/"},{"id":"http://arxiv.org/abs/2308.10761v1","updated":"2023-08-21T14:49:37Z","published":"2023-08-21T14:49:37Z","title":"CoNe: Contrast Your Neighbours for Supervised Image Classification","summary":" Image classification is a longstanding problem in computer vision and machine\nlearning research. Most recent works (e.g. SupCon , Triplet, and max-margin)\nmainly focus on grouping the intra-class samples aggressively and compactly,\nwith the assumption that all intra-class samples should be pulled tightly\ntowards their class centers. However, such an objective will be very hard to\nachieve since it ignores the intra-class variance in the dataset. (i.e.\ndifferent instances from the same class can have significant differences).\nThus, such a monotonous objective is not sufficient. 
To provide a more\ninformative objective, we introduce Contrast Your Neighbours (CoNe) - a simple\nyet practical learning framework for supervised image classification.\nSpecifically, in CoNe, each sample is not only supervised by its class center\nbut also directly employs the features of its similar neighbors as anchors to\ngenerate more adaptive and refined targets. Moreover, to further boost the\nperformance, we propose ``distributional consistency\" as a more informative\nregularization to enable similar instances to have a similar probability\ndistribution. Extensive experimental results demonstrate that CoNe achieves\nstate-of-the-art performance across different benchmark datasets, network\narchitectures, and settings. Notably, even without a complicated training\nrecipe, our CoNe achieves 80.8\\% Top-1 accuracy on ImageNet with ResNet-50,\nwhich surpasses the recent Timm training recipe (80.4\\%). Code and pre-trained\nmodels are available at\n\\href{https://github.com/mingkai-zheng/CoNe}{https://github.com/mingkai-zheng/CoNe}.\n","authors":["Mingkai Zheng","Shan You","Lang Huang","Xiu Su","Fei Wang","Chen Qian","Xiaogang Wang","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.10761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07615v2","updated":"2023-08-21T14:48:09Z","published":"2023-08-15T07:51:53Z","title":"Self-supervised Hypergraphs for Learning Multiple World Interpretations","summary":" We present a method for learning multiple scene representations given a small\nlabeled set, by exploiting the relationships between such representations in\nthe form of a multi-task hypergraph. We also show how we can use the hypergraph\nto improve a powerful pretrained VisTransformer model without any additional\nlabeled data. In our hypergraph, each node is an interpretation layer (e.g.,\ndepth or segmentation) of the scene. Within each hyperedge, one or several\ninput nodes predict the layer at the output node. Thus, each node could be an\ninput node in some hyperedges and an output node in others. In this way,\nmultiple paths can reach the same node, to form ensembles from which we obtain\nrobust pseudolabels, which allow self-supervised learning in the hypergraph. We\ntest different ensemble models and different types of hyperedges and show\nsuperior performance to other multi-task graph models in the field. We also\nintroduce Dronescapes, a large video dataset captured with UAVs in different\ncomplex real-world scenes, with multiple representations, suitable for\nmulti-task learning.\n","authors":["Alina Marcu","Mihai Pirvu","Dragos Costea","Emanuela Haller","Emil Slusanschi","Ahmed Nabil Belbachir","Rahul Sukthankar","Marius Leordeanu"],"pdf_url":"https://arxiv.org/pdf/2308.07615v2.pdf","comment":"Accepted in ICCV 2023 Workshops"},{"id":"http://arxiv.org/abs/2308.10755v1","updated":"2023-08-21T14:40:48Z","published":"2023-08-21T14:40:48Z","title":"WanJuan: A Comprehensive Multimodal Dataset for Advancing English and\n Chinese Large Models","summary":" The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the\ndevelopment of large models, leading to the creation of numerous impressive\nlarge language models(LLMs) and multimodal large language models (MLLMs). These\ncutting-edge models owe their remarkable performance to high-quality data.\nHowever, the details of the training data used in leading paradigms are often\nkept confidential. 
This lack of transparency, coupled with the scarcity of\nopen-source data, impedes further developments within the community. As a\nresponse, this paper presents \"Wan Juan\", a large-scale multimodal dataset\ncomposed of both Chinese and English data, collected from a wide range of web\nsources. The dataset incorporates text, image-text, and video modalities, with\na total volume exceeding 2TB. It was utilized in the training of InternLM, a\nmodel that demonstrated significant advantages in multi-dimensional evaluations\nwhen compared to models of a similar scale. All data can be accessed at\nhttps://opendatalab.org.cn/WanJuan1.0.\n","authors":["Conghui He","Zhenjiang Jin","Chao Xu","Jiantao Qiu","Bin Wang","Wei Li","Hang Yan","JiaQi Wang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.10755v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2109.02081v2","updated":"2023-08-21T14:36:56Z","published":"2021-09-05T14:15:24Z","title":"Deep Person Generation: A Survey from the Perspective of Face, Pose and\n Cloth Synthesis","summary":" Deep person generation has attracted extensive research attention due to its\nwide applications in virtual agents, video conferencing, online shopping and\nart/movie production. With the advancement of deep learning, visual appearances\n(face, pose, cloth) of a person image can be easily generated or manipulated on\ndemand. In this survey, we first summarize the scope of person generation, and\nthen systematically review recent progress and technical trends in deep person\ngeneration, covering three major tasks: talking-head generation (face),\npose-guided person generation (pose) and garment-oriented person generation\n(cloth). More than two hundred papers are covered for a thorough overview, and\nthe milestone works are highlighted to witness the major technical\nbreakthrough. Based on these fundamental tasks, a number of applications are\ninvestigated, e.g., virtual fitting, digital human, generative data\naugmentation. We hope this survey could shed some light on the future prospects\nof deep person generation, and provide a helpful foundation for full\napplications towards digital human.\n","authors":["Tong Sha","Wei Zhang","Tong Shen","Zhoujun Li","Tao Mei"],"pdf_url":"https://arxiv.org/pdf/2109.02081v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16053v2","updated":"2023-08-21T14:18:55Z","published":"2023-03-28T15:35:25Z","title":"Real-time Multi-person Eyeblink Detection in the Wild for Untrimmed\n Video","summary":" Real-time eyeblink detection in the wild can widely serve for fatigue\ndetection, face anti-spoofing, emotion analysis, etc. The existing research\nefforts generally focus on single-person cases towards trimmed video. However,\nmulti-person scenario within untrimmed videos is also important for practical\napplications, which has not been well concerned yet. To address this, we shed\nlight on this research field for the first time with essential contributions on\ndataset, theory, and practices. In particular, a large-scale dataset termed\nMPEblink that involves 686 untrimmed videos with 8748 eyeblink events is\nproposed under multi-person conditions. The samples are captured from\nunconstrained films to reveal \"in the wild\" characteristics. Meanwhile, a\nreal-time multi-person eyeblink detection method is also proposed. Being\ndifferent from the existing counterparts, our proposition runs in a one-stage\nspatio-temporal way with end-to-end learning capacity. 
Specifically, it\nsimultaneously addresses the sub-tasks of face detection, face tracking, and\nhuman instance-level eyeblink detection. This paradigm holds 2 main advantages:\n(1) eyeblink features can be facilitated via the face's global context (e.g.,\nhead pose and illumination condition) with joint optimization and interaction,\nand (2) addressing these sub-tasks in parallel instead of sequential manner can\nsave time remarkably to meet the real-time running requirement. Experiments on\nMPEblink verify the essential challenges of real-time multi-person eyeblink\ndetection in the wild for untrimmed video. Our method also outperforms existing\napproaches by large margins and with a high inference speed.\n","authors":["Wenzheng Zeng","Yang Xiao","Sicheng Wei","Jinfang Gan","Xintao Zhang","Zhiguo Cao","Zhiwen Fang","Joey Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.16053v2.pdf","comment":"Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2308.10743v1","updated":"2023-08-21T14:16:36Z","published":"2023-08-21T14:16:36Z","title":"Boosting Adversarial Attack with Similar Target","summary":" Deep neural networks are vulnerable to adversarial examples, posing a threat\nto the models' applications and raising security concerns. An intriguing\nproperty of adversarial examples is their strong transferability. Several\nmethods have been proposed to enhance transferability, including ensemble\nattacks which have demonstrated their efficacy. However, prior approaches\nsimply average logits, probabilities, or losses for model ensembling, lacking a\ncomprehensive analysis of how and why model ensembling significantly improves\ntransferability. In this paper, we propose a similar targeted attack method\nnamed Similar Target~(ST). By promoting cosine similarity between the gradients\nof each model, our method regularizes the optimization direction to\nsimultaneously attack all surrogate models. This strategy has been proven to\nenhance generalization ability. Experimental results on ImageNet validate the\neffectiveness of our approach in improving adversarial transferability. Our\nmethod outperforms state-of-the-art attackers on 18 discriminative classifiers\nand adversarially trained models.\n","authors":["Shuo Zhang","Ziruo Wang","Zikai Zhou","Huanran Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10743v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10729v1","updated":"2023-08-21T13:54:00Z","published":"2023-08-21T13:54:00Z","title":"Patch Is Not All You Need","summary":" Vision Transformers have achieved great success in computer visions,\ndelivering exceptional performance across various tasks. However, their\ninherent reliance on sequential input enforces the manual partitioning of\nimages into patch sequences, which disrupts the image's inherent structural and\nsemantic continuity. To handle this, we propose a novel Pattern Transformer\n(Patternformer) to adaptively convert images to pattern sequences for\nTransformer input. Specifically, we employ the Convolutional Neural Network to\nextract various patterns from the input image, with each channel representing a\nunique pattern that is fed into the succeeding Transformer as a visual token.\nBy enabling the network to optimize these patterns, each pattern concentrates\non its local region of interest, thereby preserving its intrinsic structural\nand semantic information. 
Only employing the vanilla ResNet and Transformer, we\nhave accomplished state-of-the-art performance on CIFAR-10 and CIFAR-100, and\nhave achieved competitive results on ImageNet.\n","authors":["Changzhen Li","Jie Zhang","Yang Wei","Zhilong Ji","Jinfeng Bai","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2308.10729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10727v1","updated":"2023-08-21T13:50:41Z","published":"2023-08-21T13:50:41Z","title":"Test-time augmentation-based active learning and self-training for\n label-efficient segmentation","summary":" Deep learning techniques depend on large datasets whose annotation is\ntime-consuming. To reduce annotation burden, the self-training (ST) and\nactive-learning (AL) methods have been developed as well as methods that\ncombine them in an iterative fashion. However, it remains unclear when each\nmethod is the most useful, and when it is advantageous to combine them. In this\npaper, we propose a new method that combines ST with AL using Test-Time\nAugmentations (TTA). First, TTA is performed on an initial teacher network.\nThen, cases for annotation are selected based on the lowest estimated Dice\nscore. Cases with high estimated scores are used as soft pseudo-labels for ST.\nThe selected annotated cases are trained with existing annotated cases and ST\ncases with border slices annotations. We demonstrate the method on MRI fetal\nbody and placenta segmentation tasks with different data variability\ncharacteristics. Our results indicate that ST is highly effective for both\ntasks, boosting performance for in-distribution (ID) and out-of-distribution\n(OOD) data. However, while self-training improved the performance of\nsingle-sequence fetal body segmentation when combined with AL, it slightly\ndeteriorated performance of multi-sequence placenta segmentation on ID data. AL\nwas helpful for the high variability placenta data, but did not improve upon\nrandom selection for the single-sequence body data. For fetal body segmentation\nsequence transfer, combining AL with ST following ST iteration yielded a Dice\nof 0.961 with only 6 original scans and 2 new sequence scans. Results using\nonly 15 high-variability placenta cases were similar to those using 50 cases.\nCode is available at: https://github.com/Bella31/TTA-quality-estimation-ST-AL\n","authors":["Bella Specktor-Fadida","Anna Levchakov","Dana Schonberger","Liat Ben-Sira","Dafna Ben-Bashat","Leo Joskowicz"],"pdf_url":"https://arxiv.org/pdf/2308.10727v1.pdf","comment":"Accepted to MICCAI MILLanD workshop 2023"},{"id":"http://arxiv.org/abs/2303.12048v2","updated":"2023-08-21T13:45:55Z","published":"2023-03-21T17:36:36Z","title":"Vox-E: Text-guided Voxel Editing of 3D Objects","summary":" Large scale text-guided diffusion models have garnered significant attention\ndue to their ability to synthesize diverse images that convey complex visual\nconcepts. This generative power has more recently been leveraged to perform\ntext-to-3D synthesis. In this work, we present a technique that harnesses the\npower of latent diffusion models for editing existing 3D objects. Our method\ntakes oriented 2D images of a 3D object as input and learns a grid-based\nvolumetric representation of it. To guide the volumetric representation to\nconform to a target text prompt, we follow unconditional text-to-3D methods and\noptimize a Score Distillation Sampling (SDS) loss. 
However, we observe that\ncombining this diffusion-guided loss with an image-based regularization loss\nthat encourages the representation not to deviate too strongly from the input\nobject is challenging, as it requires achieving two conflicting goals while\nviewing only structure-and-appearance coupled 2D projections. Thus, we\nintroduce a novel volumetric regularization loss that operates directly in 3D\nspace, utilizing the explicit nature of our 3D representation to enforce\ncorrelation between the global structure of the original and edited object.\nFurthermore, we present a technique that optimizes cross-attention volumetric\ngrids to refine the spatial extent of the edits. Extensive experiments and\ncomparisons demonstrate the effectiveness of our approach in creating a myriad\nof edits which cannot be achieved by prior works.\n","authors":["Etai Sella","Gal Fiebelman","Peter Hedman","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2303.12048v2.pdf","comment":"Project webpage: https://tau-vailab.github.io/Vox-E/"},{"id":"http://arxiv.org/abs/2308.10718v1","updated":"2023-08-21T13:39:04Z","published":"2023-08-21T13:39:04Z","title":"Backdooring Textual Inversion for Concept Censorship","summary":" Recent years have witnessed success in AIGC (AI Generated Content). People\ncan make use of a pre-trained diffusion model to generate images of high\nquality or freely modify existing pictures with only prompts in nature\nlanguage. More excitingly, the emerging personalization techniques make it\nfeasible to create specific-desired images with only a few images as\nreferences. However, this induces severe threats if such advanced techniques\nare misused by malicious users, such as spreading fake news or defaming\nindividual reputations. Thus, it is necessary to regulate personalization\nmodels (i.e., concept censorship) for their development and advancement.\n In this paper, we focus on the personalization technique dubbed Textual\nInversion (TI), which is becoming prevailing for its lightweight nature and\nexcellent performance. TI crafts the word embedding that contains detailed\ninformation about a specific object. Users can easily download the word\nembedding from public websites like Civitai and add it to their own stable\ndiffusion model without fine-tuning for personalization. To achieve the concept\ncensorship of a TI model, we propose leveraging the backdoor technique for good\nby injecting backdoors into the Textual Inversion embeddings. Briefly, we\nselect some sensitive words as triggers during the training of TI, which will\nbe censored for normal use. In the subsequent generation stage, if the triggers\nare combined with personalized embeddings as final prompts, the model will\noutput a pre-defined target image rather than images including the desired\nmalicious concept.\n To demonstrate the effectiveness of our approach, we conduct extensive\nexperiments on Stable Diffusion, a prevailing open-sourced text-to-image model.\nOur code, data, and results are available at\nhttps://concept-censorship.github.io.\n","authors":["Yutong wu","Jie Zhang","Florian Kerschbaum","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10717v1","updated":"2023-08-21T13:38:10Z","published":"2023-08-21T13:38:10Z","title":"Rethinking Person Re-identification from a Projection-on-Prototypes\n Perspective","summary":" Person Re-IDentification (Re-ID) as a retrieval task, has achieved tremendous\ndevelopment over the past decade. 
Existing state-of-the-art methods follow an\nanalogous framework to first extract features from the input images and then\ncategorize them with a classifier. However, since there is no identity overlap\nbetween training and testing sets, the classifier is often discarded during\ninference. Only the extracted features are used for person retrieval via\ndistance metrics. In this paper, we rethink the role of the classifier in\nperson Re-ID, and advocate a new perspective to conceive the classifier as a\nprojection from image features to class prototypes. These prototypes are\nexactly the learned parameters of the classifier. In this light, we describe\nthe identity of input images as similarities to all prototypes, which are then\nutilized as more discriminative features to perform person Re-ID. We thereby\npropose a new baseline ProNet, which innovatively reserves the function of the\nclassifier at the inference stage. To facilitate the learning of class\nprototypes, both triplet loss and identity classification loss are applied to\nfeatures that undergo the projection by the classifier. An improved version of\nProNet++ is presented by further incorporating multi-granularity designs.\nExperiments on four benchmarks demonstrate that our proposed ProNet is simple\nyet effective, and significantly beats previous baselines. ProNet++ also\nachieves competitive or even better results than transformer-based competitors.\n","authors":["Qizao Wang","Xuelin Qian","Bin Li","Yanwei Fu","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2308.10717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10716v1","updated":"2023-08-21T13:38:09Z","published":"2023-08-21T13:38:09Z","title":"Color Prompting for Data-Free Continual Unsupervised Domain Adaptive\n Person Re-Identification","summary":" Unsupervised domain adaptive person re-identification (Re-ID) methods\nalleviate the burden of data annotation through generating pseudo supervision\nmessages. However, real-world Re-ID systems, with continuously accumulating\ndata streams, simultaneously demand more robust adaptation and anti-forgetting\ncapabilities. Methods based on image rehearsal addresses the forgetting issue\nwith limited extra storage but carry the risk of privacy leakage. In this work,\nwe propose a Color Prompting (CoP) method for data-free continual unsupervised\ndomain adaptive person Re-ID. Specifically, we employ a light-weighted prompter\nnetwork to fit the color distribution of the current task together with Re-ID\ntraining. Then for the incoming new tasks, the learned color distribution\nserves as color style transfer guidance to transfer the images into past\nstyles. CoP achieves accurate color style recovery for past tasks with adequate\ndata diversity, leading to superior anti-forgetting effects compared with image\nrehearsal methods. Moreover, CoP demonstrates strong generalization performance\nfor fast adaptation into new domains, given only a small amount of unlabeled\nimages. Extensive experiments demonstrate that after the continual training\npipeline the proposed CoP achieves 6.7% and 8.1% average rank-1 improvements\nover the replay method on seen and unseen domains, respectively. 
The source\ncode for this work is publicly available in\nhttps://github.com/vimar-gu/ColorPromptReID.\n","authors":["Jianyang Gu","Hao Luo","Kai Wang","Wei Jiang","Yang You","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.10716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10704v1","updated":"2023-08-21T13:18:12Z","published":"2023-08-21T13:18:12Z","title":"Sampling From Autoencoders' Latent Space via Quantization And\n Probability Mass Function Concepts","summary":" In this study, we focus on sampling from the latent space of generative\nmodels built upon autoencoders so as the reconstructed samples are lifelike\nimages. To do to, we introduce a novel post-training sampling algorithm rooted\nin the concept of probability mass functions, coupled with a quantization\nprocess. Our proposed algorithm establishes a vicinity around each latent\nvector from the input data and then proceeds to draw samples from these defined\nneighborhoods. This strategic approach ensures that the sampled latent vectors\npredominantly inhabit high-probability regions, which, in turn, can be\neffectively transformed into authentic real-world images. A noteworthy point of\ncomparison for our sampling algorithm is the sampling technique based on\nGaussian mixture models (GMM), owing to its inherent capability to represent\nclusters. Remarkably, we manage to improve the time complexity from the\nprevious $\\mathcal{O}(n\\times d \\times k \\times i)$ associated with GMM\nsampling to a much more streamlined $\\mathcal{O}(n\\times d)$, thereby resulting\nin substantial speedup during runtime. Moreover, our experimental results,\ngauged through the Fr\\'echet inception distance (FID) for image generation,\nunderscore the superior performance of our sampling algorithm across a diverse\nrange of models and datasets. On the MNIST benchmark dataset, our approach\noutperforms GMM sampling by yielding a noteworthy improvement of up to $0.89$\nin FID value. Furthermore, when it comes to generating images of faces and\nocular images, our approach showcases substantial enhancements with FID\nimprovements of $1.69$ and $0.87$ respectively, as compared to GMM sampling, as\nevidenced on the CelebA and MOBIUS datasets. Lastly, we substantiate our\nmethodology's efficacy in estimating latent space distributions in contrast to\nGMM sampling, particularly through the lens of the Wasserstein distance.\n","authors":["Aymene Mohammed Bouayed","Adrian Iaccovelli","David Naccache"],"pdf_url":"https://arxiv.org/pdf/2308.10704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12966v3","updated":"2023-08-21T13:16:27Z","published":"2023-05-22T12:18:20Z","title":"Hierarchical Integration Diffusion Model for Realistic Image Deblurring","summary":" Diffusion models (DMs) have recently been introduced in image deblurring and\nexhibited promising performance, particularly in terms of details\nreconstruction. However, the diffusion model requires a large number of\ninference iterations to recover the clean image from pure Gaussian noise, which\nconsumes massive computational resources. Moreover, the distribution\nsynthesized by the diffusion model is often misaligned with the target results,\nleading to restrictions in distortion-based metrics. To address the above\nissues, we propose the Hierarchical Integration Diffusion Model (HI-Diff), for\nrealistic image deblurring. Specifically, we perform the DM in a highly\ncompacted latent space to generate the prior feature for the deblurring\nprocess. 
The deblurring process is implemented by a regression-based method to\nobtain better distortion accuracy. Meanwhile, the highly compact latent space\nensures the efficiency of the DM. Furthermore, we design the hierarchical\nintegration module to fuse the prior into the regression-based model from\nmultiple scales, enabling better generalization in complex blurry scenarios.\nComprehensive experiments on synthetic and real-world blur datasets demonstrate\nthat our HI-Diff outperforms state-of-the-art methods. Code and trained models\nare available at https://github.com/zhengchen1999/HI-Diff.\n","authors":["Zheng Chen","Yulun Zhang","Ding Liu","Bin Xia","Jinjin Gu","Linghe Kong","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2305.12966v3.pdf","comment":"Code is available at https://github.com/zhengchen1999/HI-Diff"},{"id":"http://arxiv.org/abs/2307.10816v4","updated":"2023-08-21T13:07:10Z","published":"2023-07-20T12:25:06Z","title":"BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained\n Diffusion","summary":" Recent text-to-image diffusion models have demonstrated an astonishing\ncapacity to generate high-quality images. However, researchers mainly studied\nthe way of synthesizing images with only text prompts. While some works have\nexplored using other modalities as conditions, considerable paired data, e.g.,\nbox/mask-image pairs, and fine-tuning time are required for nurturing models.\nAs such paired data is time-consuming and labor-intensive to acquire and\nrestricted to a closed set, this potentially becomes the bottleneck for\napplications in an open world. This paper focuses on the simplest form of\nuser-provided conditions, e.g., box or scribble. To mitigate the aforementioned\nproblem, we propose a training-free method to control objects and contexts in\nthe synthesized images adhering to the given spatial conditions. Specifically,\nthree spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints,\nare designed and seamlessly integrated into the denoising step of diffusion\nmodels, requiring no additional training and massive annotated layout data.\nExtensive experimental results demonstrate that the proposed constraints can\ncontrol what and where to present in the images while retaining the ability of\nDiffusion models to synthesize with high fidelity and diverse concept coverage.\nThe code is publicly available at https://github.com/showlab/BoxDiff.\n","authors":["Jinheng Xie","Yuexiang Li","Yawen Huang","Haozhe Liu","Wentian Zhang","Yefeng Zheng","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2307.10816v4.pdf","comment":"Accepted by ICCV 2023. Code is available at:\n https://github.com/showlab/BoxDiff"},{"id":"http://arxiv.org/abs/2308.10694v1","updated":"2023-08-21T13:03:25Z","published":"2023-08-21T13:03:25Z","title":"Vanishing Point Estimation in Uncalibrated Images with Prior Gravity\n Direction","summary":" We tackle the problem of estimating a Manhattan frame, i.e. three orthogonal\nvanishing points, and the unknown focal length of the camera, leveraging a\nprior vertical direction. The direction can come from an Inertial Measurement\nUnit that is a standard component of recent consumer devices, e.g.,\nsmartphones. We provide an exhaustive analysis of minimal line configurations\nand derive two new 2-line solvers, one of which does not suffer from\nsingularities affecting existing solvers. Additionally, we design a new\nnon-minimal method, running on an arbitrary number of lines, to boost the\nperformance in local optimization. 
Combining all solvers in a hybrid robust\nestimator, our method achieves increased accuracy even with a rough prior.\nExperiments on synthetic and real-world datasets demonstrate the superior\naccuracy of our method compared to the state of the art, while having\ncomparable runtimes. We further demonstrate the applicability of our solvers\nfor relative rotation estimation. The code is available at\nhttps://github.com/cvg/VP-Estimation-with-Prior-Gravity.\n","authors":["Rémi Pautrat","Shaohui Liu","Petr Hruby","Marc Pollefeys","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2308.10694v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10692v1","updated":"2023-08-21T12:59:48Z","published":"2023-08-21T12:59:48Z","title":"Exploring Fine-Grained Representation and Recomposition for\n Cloth-Changing Person Re-Identification","summary":" Cloth-changing person Re-IDentification (Re-ID) is a particularly challenging\ntask, suffering from two limitations of inferior identity-relevant features and\nlimited training samples. Existing methods mainly leverage auxiliary\ninformation to facilitate discriminative feature learning, including\nsoft-biometrics features of shapes and gaits, and additional labels of\nclothing. However, these information may be unavailable in real-world\napplications. In this paper, we propose a novel FIne-grained Representation and\nRecomposition (FIRe$^{2}$) framework to tackle both limitations without any\nauxiliary information. Specifically, we first design a Fine-grained Feature\nMining (FFM) module to separately cluster images of each person. Images with\nsimilar so-called fine-grained attributes (e.g., clothes and viewpoints) are\nencouraged to cluster together. An attribute-aware classification loss is\nintroduced to perform fine-grained learning based on cluster labels, which are\nnot shared among different people, promoting the model to learn\nidentity-relevant features. Furthermore, by taking full advantage of the\nclustered fine-grained attributes, we present a Fine-grained Attribute\nRecomposition (FAR) module to recompose image features with different\nattributes in the latent space. It can significantly enhance representations\nfor robust feature learning. Extensive experiments demonstrate that FIRe$^{2}$\ncan achieve state-of-the-art performance on five widely-used cloth-changing\nperson Re-ID benchmarks.\n","authors":["Qizao Wang","Xuelin Qian","Bin Li","Ying Fu","Yanwei Fu","Xiangyang Xue"],"pdf_url":"https://arxiv.org/pdf/2308.10692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09100v3","updated":"2023-08-21T12:53:09Z","published":"2022-12-18T14:56:22Z","title":"SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input\n Images","summary":" Recent advances in Neural Radiance Fields (NeRFs) treat the problem of novel\nview synthesis as Sparse Radiance Field (SRF) optimization using sparse voxels\nfor efficient and fast rendering (plenoxels,InstantNGP). In order to leverage\nmachine learning and adoption of SRFs as a 3D representation, we present SPARF,\na large-scale ShapeNet-based synthetic dataset for novel view synthesis\nconsisting of $\\sim$ 17 million images rendered from nearly 40,000 shapes at\nhigh resolution (400 X 400 pixels). 
The dataset is orders of magnitude larger\nthan existing synthetic datasets for novel view synthesis and includes more\nthan one million 3D-optimized radiance fields with multiple voxel resolutions.\nFurthermore, we propose a novel pipeline (SuRFNet) that learns to generate\nsparse voxel radiance fields from only few views. This is done by using the\ndensely collected SPARF dataset and 3D sparse convolutions. SuRFNet employs\npartial SRFs from few/one images and a specialized SRF loss to learn to\ngenerate high-quality sparse voxel radiance fields that can be rendered from\nnovel views. Our approach achieves state-of-the-art results in the task of\nunconstrained novel view synthesis based on few views on ShapeNet as compared\nto recent baselines. The SPARF dataset is made public with the code and models\non the project website https://abdullahamdi.com/sparf/ .\n","authors":["Abdullah Hamdi","Bernard Ghanem","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2212.09100v3.pdf","comment":"published at ICCV 2023 workshop proceedings"},{"id":"http://arxiv.org/abs/2303.12782v3","updated":"2023-08-21T12:46:09Z","published":"2023-03-22T17:52:11Z","title":"Tube-Link: A Flexible Cross Tube Framework for Universal Video\n Segmentation","summary":" Video segmentation aims to segment and track every pixel in diverse scenarios\naccurately. In this paper, we present Tube-Link, a versatile framework that\naddresses multiple core tasks of video segmentation with a unified\narchitecture. Our framework is a near-online approach that takes a short\nsubclip as input and outputs the corresponding spatial-temporal tube masks. To\nenhance the modeling of cross-tube relationships, we propose an effective way\nto perform tube-level linking via attention along the queries. In addition, we\nintroduce temporal contrastive learning to instance-wise discriminative\nfeatures for tube-level association. Our approach offers flexibility and\nefficiency for both short and long video inputs, as the length of each subclip\ncan be varied according to the needs of datasets or scenarios. Tube-Link\noutperforms existing specialized architectures by a significant margin on five\nvideo segmentation datasets. Specifically, it achieves almost 13% relative\nimprovements on VIPSeg and 4% improvements on KITTI-STEP over the strong\nbaseline Video K-Net. When using a ResNet50 backbone on Youtube-VIS-2019 and\n2021, Tube-Link boosts IDOL by 3% and 4%, respectively.\n","authors":["Xiangtai Li","Haobo Yuan","Wenwei Zhang","Guangliang Cheng","Jiangmiao Pang","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2303.12782v3.pdf","comment":"ICCV-2023, Project page: https://github.com/lxtGH/Tube-Link (fix\n typos and errors, update the results)"},{"id":"http://arxiv.org/abs/2301.04011v3","updated":"2023-08-21T12:30:12Z","published":"2023-01-08T09:27:41Z","title":"Learning Support and Trivial Prototypes for Interpretable Image\n Classification","summary":" Prototypical part network (ProtoPNet) methods have been designed to achieve\ninterpretable classification by associating predictions with a set of training\nprototypes, which we refer to as trivial prototypes because they are trained to\nlie far from the classification boundary in the feature space. Note that it is\npossible to make an analogy between ProtoPNet and support vector machine (SVM)\ngiven that the classification from both methods relies on computing similarity\nwith a set of training points (i.e., trivial prototypes in ProtoPNet, and\nsupport vectors in SVM). 
However, while trivial prototypes are located far from\nthe classification boundary, support vectors are located close to this\nboundary, and we argue that this discrepancy with the well-established SVM\ntheory can result in ProtoPNet models with inferior classification accuracy. In\nthis paper, we aim to improve the classification of ProtoPNet with a new method\nto learn support prototypes that lie near the classification boundary in the\nfeature space, as suggested by the SVM theory. In addition, we target the\nimprovement of classification results with a new model, named ST-ProtoPNet,\nwhich exploits our support prototypes and the trivial prototypes to provide\nmore effective classification. Experimental results on CUB-200-2011, Stanford\nCars, and Stanford Dogs datasets demonstrate that ST-ProtoPNet achieves\nstate-of-the-art classification accuracy and interpretability results. We also\nshow that the proposed support prototypes tend to be better localised in the\nobject of interest rather than in the background region.\n","authors":["Chong Wang","Yuyuan Liu","Yuanhong Chen","Fengbei Liu","Yu Tian","Davis J. McCarthy","Helen Frazer","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2301.04011v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10680v1","updated":"2023-08-21T12:27:18Z","published":"2023-08-21T12:27:18Z","title":"Co-Speech Gesture Detection through Multi-phase Sequence Labeling","summary":" Gestures are integral components of face-to-face communication. They unfold\nover time, often following predictable movement phases of preparation, stroke,\nand retraction. Yet, the prevalent approach to automatic gesture detection\ntreats the problem as binary classification, classifying a segment as either\ncontaining a gesture or not, thus failing to capture its inherently sequential\nand contextual nature. To address this, we introduce a novel framework that\nreframes the task as a multi-phase sequence labeling problem rather than binary\nclassification. Our model processes sequences of skeletal movements over time\nwindows, uses Transformer encoders to learn contextual embeddings, and\nleverages Conditional Random Fields to perform sequence labeling. We evaluate\nour proposal on a large dataset of diverse co-speech gestures in task-oriented\nface-to-face dialogues. The results consistently demonstrate that our method\nsignificantly outperforms strong baseline models in detecting gesture strokes.\nFurthermore, applying Transformer encoders to learn contextual embeddings from\nmovement sequences substantially improves gesture unit detection. These results\nhighlight our framework's capacity to capture the fine-grained dynamics of\nco-speech gesture phases, paving the way for more nuanced and accurate gesture\ndetection and analysis.\n","authors":["Esam Ghaleb","Ilya Burenko","Marlou Rasenberg","Wim Pouw","Peter Uhrig","Judith Holler","Ivan Toni","Aslı Özyürek","Raquel Fernández"],"pdf_url":"https://arxiv.org/pdf/2308.10680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10677v1","updated":"2023-08-21T12:24:20Z","published":"2023-08-21T12:24:20Z","title":"Visual Crowd Analysis: Open Research Problems","summary":" Over the last decade, there has been a remarkable surge in interest in\nautomated crowd monitoring within the computer vision community. Modern\ndeep-learning approaches have made it possible to develop fully-automated\nvision-based crowd-monitoring applications. 
However, despite the magnitude of\nthe issue at hand, the significant technological advancements, and the\nconsistent interest of the research community, there are still numerous\nchallenges that need to be overcome. In this article, we delve into six major\nareas of visual crowd analysis, emphasizing the key developments in each of\nthese areas. We outline the crucial unresolved issues that must be tackled in\nfuture works, in order to ensure that the field of automated crowd monitoring\ncontinues to progress and thrive. Several surveys related to this topic have\nbeen conducted in the past. Nonetheless, this article thoroughly examines and\npresents a more intuitive categorization of works, while also depicting the\nlatest breakthroughs within the field, incorporating more recent studies\ncarried out within the last few years in a concise manner. By carefully\nchoosing prominent works with significant contributions in terms of novelty or\nperformance gains, this paper presents a more comprehensive exposition of\nadvancements in the current state-of-the-art.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2308.10677v1.pdf","comment":"Accepted in AI Magazine published by Wiley Periodicals LLC on behalf\n of the Association for the Advancement of Artificial Intelligence"},{"id":"http://arxiv.org/abs/2306.15706v2","updated":"2023-08-21T12:18:57Z","published":"2023-06-27T05:43:47Z","title":"Approximated Prompt Tuning for Vision-Language Pre-trained Models","summary":" Prompt tuning is a parameter-efficient way to deploy large-scale pre-trained\nmodels to downstream tasks by adding task-specific tokens. In terms of\nvision-language pre-trained (VLP) models, prompt tuning often requires a large\nnumber of learnable tokens to bridge the gap between the pre-training and\ndownstream tasks, which greatly exacerbates the already high computational\noverhead. In this paper, we revisit the principle of prompt tuning for\nTransformer-based VLP models, and reveal that the impact of soft prompt tokens\ncan be actually approximated via independent information diffusion steps,\nthereby avoiding the expensive global attention modeling and reducing the\ncomputational complexity to a large extent. Based on this finding, we propose a\nnovel Approximated Prompt Tuning (APT) approach towards efficient VL transfer\nlearning. To validate APT, we apply it to two representative VLP models, namely\nViLT and METER, and conduct extensive experiments on a bunch of downstream\ntasks. Meanwhile, the generalization of APT is also validated on CLIP for image\nclassification and StableDiffusion for text-to-image generation. 
The\nexperimental results not only show the superior performance gains and\ncomputation efficiency of APT against the conventional prompt tuning methods,\ne.g., +7.01% accuracy and -82.30% additional computation overhead on METER, but\nalso confirm its merits over other parameter-efficient transfer learning\napproaches.\n","authors":["Qiong Wu","Shubin Huang","Yiyi Zhou","Pingyang Dai","Annan Shu","Guannan Jiang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2306.15706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09629v2","updated":"2023-08-21T12:15:42Z","published":"2023-06-16T05:22:25Z","title":"Fusing Structural and Functional Connectivities using Disentangled VAE\n for Detecting MCI","summary":" Brain network analysis is a useful approach to studying human brain disorders\nbecause it can distinguish patients from healthy people by detecting abnormal\nconnections. Due to the complementary information from multiple modal\nneuroimages, multimodal fusion technology has a lot of potential for improving\nprediction performance. However, effective fusion of multimodal medical images\nto achieve complementarity is still a challenging problem. In this paper, a\nnovel hierarchical structural-functional connectivity fusing (HSCF) model is\nproposed to construct brain structural-functional connectivity matrices and\npredict abnormal brain connections based on functional magnetic resonance\nimaging (fMRI) and diffusion tensor imaging (DTI). Specifically, the prior\nknowledge is incorporated into the separators for disentangling each modality\nof information by the graph convolutional networks (GCN). And a disentangled\ncosine distance loss is devised to ensure the disentanglement's effectiveness.\nMoreover, the hierarchical representation fusion module is designed to\neffectively maximize the combination of relevant and effective features between\nmodalities, which makes the generated structural-functional connectivity more\nrobust and discriminative in the cognitive disease analysis. Results from a\nwide range of tests performed on the public Alzheimer's Disease Neuroimaging\nInitiative (ADNI) database show that the proposed model performs better than\ncompeting approaches in terms of classification evaluation. In general, the\nproposed HSCF model is a promising model for generating brain\nstructural-functional connectivities and identifying abnormal brain connections\nas cognitive disease progresses.\n","authors":["Qiankun Zuo","Yanfei Zhu","Libin Lu","Zhi Yang","Yuhui Li","Ning Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09629v2.pdf","comment":"4 figures"},{"id":"http://arxiv.org/abs/2103.00852v2","updated":"2023-08-21T12:08:58Z","published":"2021-03-01T09:03:50Z","title":"CrossMap Transformer: A Crossmodal Masked Path Transformer Using Double\n Back-Translation for Vision-and-Language Navigation","summary":" Navigation guided by natural language instructions is particularly suitable\nfor Domestic Service Robots that interacts naturally with users. This task\ninvolves the prediction of a sequence of actions that leads to a specified\ndestination given a natural language navigation instruction. The task thus\nrequires the understanding of instructions, such as ``Walk out of the bathroom\nand wait on the stairs that are on the right''. The Visual and Language\nNavigation remains challenging, notably because it requires the exploration of\nthe environment and at the accurate following of a path specified by the\ninstructions to model the relationship between language and vision. 
To address\nthis, we propose the CrossMap Transformer network, which encodes the linguistic\nand visual features to sequentially generate a path. The CrossMap transformer\nis tied to a Transformer-based speaker that generates navigation instructions.\nThe two networks share common latent features, for mutual enhancement through a\ndouble back translation model: Generated paths are translated into instructions\nwhile generated instructions are translated into path The experimental results\nshow the benefits of our approach in terms of instruction understanding and\ninstruction generation.\n","authors":["Aly Magassouba","Komei Sugiura","Hisashi Kawai"],"pdf_url":"https://arxiv.org/pdf/2103.00852v2.pdf","comment":"8 pages, 5 figures, 5 tables. Submitted to IEEE Robotics and\n Automation Letters"},{"id":"http://arxiv.org/abs/2308.09357v2","updated":"2023-08-21T11:58:14Z","published":"2023-08-18T07:38:30Z","title":"Multi-scale Target-Aware Framework for Constrained Image Splicing\n Detection and Localization","summary":" Constrained image splicing detection and localization (CISDL) is a\nfundamental task of multimedia forensics, which detects splicing operation\nbetween two suspected images and localizes the spliced region on both images.\nRecent works regard it as a deep matching problem and have made significant\nprogress. However, existing frameworks typically perform feature extraction and\ncorrelation matching as separate processes, which may hinder the model's\nability to learn discriminative features for matching and can be susceptible to\ninterference from ambiguous background pixels. In this work, we propose a\nmulti-scale target-aware framework to couple feature extraction and correlation\nmatching in a unified pipeline. In contrast to previous methods, we design a\ntarget-aware attention mechanism that jointly learns features and performs\ncorrelation matching between the probe and donor images. Our approach can\neffectively promote the collaborative learning of related patches, and perform\nmutual promotion of feature learning and correlation matching. Additionally, in\norder to handle scale transformations, we introduce a multi-scale projection\nmethod, which can be readily integrated into our target-aware framework that\nenables the attention process to be conducted between tokens containing\ninformation of varying scales. Our experiments demonstrate that our model,\nwhich uses a unified pipeline, outperforms state-of-the-art methods on several\nbenchmark datasets and is robust against scale transformations.\n","authors":["Yuxuan Tan","Yuanman Li","Limin Zeng","Jiaxiong Ye","Wei wang","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2308.09357v2.pdf","comment":"accepted by ACMMM2023"},{"id":"http://arxiv.org/abs/2308.10658v1","updated":"2023-08-21T11:51:46Z","published":"2023-08-21T11:51:46Z","title":"Learning Clothing and Pose Invariant 3D Shape Representation for\n Long-Term Person Re-Identification","summary":" Long-Term Person Re-Identification (LT-ReID) has become increasingly crucial\nin computer vision and biometrics. In this work, we aim to extend LT-ReID\nbeyond pedestrian recognition to include a wider range of real-world human\nactivities while still accounting for cloth-changing scenarios over large time\ngaps. This setting poses additional challenges due to the geometric\nmisalignment and appearance ambiguity caused by the diversity of human pose and\nclothing. 
To address these challenges, we propose a new approach 3DInvarReID\nfor (i) disentangling identity from non-identity components (pose, clothing\nshape, and texture) of 3D clothed humans, and (ii) reconstructing accurate 3D\nclothed body shapes and learning discriminative features of naked body shapes\nfor person ReID in a joint manner. To better evaluate our study of LT-ReID, we\ncollect a real-world dataset called CCDA, which contains a wide variety of\nhuman activities and clothing changes. Experimentally, we show the superior\nperformance of our approach for person ReID.\n","authors":["Feng Liu","Minchul Kim","ZiAng Gu","Anil Jian","Xiaoming Liu"],"pdf_url":"https://arxiv.org/pdf/2308.10658v1.pdf","comment":"10 pages, 7 figures, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10648v1","updated":"2023-08-21T11:36:46Z","published":"2023-08-21T11:36:46Z","title":"EVE: Efficient zero-shot text-based Video Editing with Depth Map\n Guidance and Temporal Consistency Constraints","summary":" Motivated by the superior performance of image diffusion models, more and\nmore researchers strive to extend these models to the text-based video editing\ntask. Nevertheless, current video editing tasks mainly suffer from the dilemma\nbetween the high fine-tuning cost and the limited generation capacity. Compared\nwith images, we conjecture that videos necessitate more constraints to preserve\nthe temporal consistency during editing. Towards this end, we propose EVE, a\nrobust and efficient zero-shot video editing method. Under the guidance of\ndepth maps and temporal consistency constraints, EVE derives satisfactory video\nediting results with an affordable computational and time cost. Moreover,\nrecognizing the absence of a publicly available video editing dataset for fair\ncomparisons, we construct a new benchmark ZVE-50 dataset. Through comprehensive\nexperimentation, we validate that EVE could achieve a satisfactory trade-off\nbetween performance and efficiency. We will release our dataset and codebase to\nfacilitate future researchers.\n","authors":["Yutao Chen","Xingning Dong","Tian Gan","Chunluan Zhou","Ming Yang","Qingpei Guo"],"pdf_url":"https://arxiv.org/pdf/2308.10648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10647v1","updated":"2023-08-21T11:35:28Z","published":"2023-08-21T11:35:28Z","title":"bbOCR: An Open-source Multi-domain OCR Pipeline for Bengali Documents","summary":" Despite the existence of numerous Optical Character Recognition (OCR) tools,\nthe lack of comprehensive open-source systems hampers the progress of document\ndigitization in various low resource languages, including Bengali. Low-resource\nlanguages, especially those with an alphasyllabary writing system, suffer from\nthe lack of large-scale datasets for various document OCR components such as\nword-level OCR, document layout extraction, and distortion correction; which\nare available as individual modules in high-resource languages. In this paper,\nwe introduce Bengali.AI-BRACU-OCR (bbOCR): an open-source scalable document OCR\nsystem that can reconstruct Bengali documents into a structured searchable\ndigitized format that leverages a novel Bengali text recognition model and two\nnovel synthetic datasets. We present extensive component-level and system-level\nevaluation: both use a novel diversified evaluation dataset and comprehensive\nevaluation metrics. 
Our extensive evaluation suggests that our proposed\nsolution is preferable over the current state-of-the-art Bengali OCR systems.\nThe source codes and datasets are available here:\nhttps://bengaliai.github.io/bbocr.\n","authors":["Imam Mohammad Zulkarnain","Shayekh Bin Islam","Md. Zami Al Zunaed Farabe","Md. Mehedi Hasan Shawon","Jawaril Munshad Abedin","Beig Rajibul Hasan","Marsia Haque","Istiak Shihab","Syed Mobassir","MD. Nazmuddoha Ansary","Asif Sushmit","Farig Sadeque"],"pdf_url":"https://arxiv.org/pdf/2308.10647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10638v1","updated":"2023-08-21T11:23:25Z","published":"2023-08-21T11:23:25Z","title":"SCULPT: Shape-Conditioned Unpaired Learning of Pose-dependent Clothed\n and Textured Human Meshes","summary":" We present SCULPT, a novel 3D generative model for clothed and textured 3D\nmeshes of humans. Specifically, we devise a deep neural network that learns to\nrepresent the geometry and appearance distribution of clothed human bodies.\nTraining such a model is challenging, as datasets of textured 3D meshes for\nhumans are limited in size and accessibility. Our key observation is that there\nexist medium-sized 3D scan datasets like CAPE, as well as large-scale 2D image\ndatasets of clothed humans and multiple appearances can be mapped to a single\ngeometry. To effectively learn from the two data modalities, we propose an\nunpaired learning procedure for pose-dependent clothed and textured human\nmeshes. Specifically, we learn a pose-dependent geometry space from 3D scan\ndata. We represent this as per vertex displacements w.r.t. the SMPL model.\nNext, we train a geometry conditioned texture generator in an unsupervised way\nusing the 2D image data. We use intermediate activations of the learned\ngeometry model to condition our texture generator. To alleviate entanglement\nbetween pose and clothing type, and pose and clothing appearance, we condition\nboth the texture and geometry generators with attribute labels such as clothing\ntypes for the geometry, and clothing colors for the texture generator. We\nautomatically generated these conditioning labels for the 2D images based on\nthe visual question answering model BLIP and CLIP. We validate our method on\nthe SCULPT dataset, and compare to state-of-the-art 3D generative models for\nclothed human bodies. We will release the codebase for research purposes.\n","authors":["Soubhik Sanyal","Partha Ghosh","Jinlong Yang","Michael J. Black","Justus Thies","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2308.10638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10636v1","updated":"2023-08-21T11:14:49Z","published":"2023-08-21T11:14:49Z","title":"Automated Identification of Failure Cases in Organ at Risk Segmentation\n Using Distance Metrics: A Study on CT Data","summary":" Automated organ at risk (OAR) segmentation is crucial for radiation therapy\nplanning in CT scans, but the generated contours by automated models can be\ninaccurate, potentially leading to treatment planning issues. The reasons for\nthese inaccuracies could be varied, such as unclear organ boundaries or\ninaccurate ground truth due to annotation errors. To improve the model's\nperformance, it is necessary to identify these failure cases during the\ntraining process and to correct them with some potential post-processing\ntechniques. However, this process can be time-consuming, as traditionally it\nrequires manual inspection of the predicted output. 
This paper proposes a\nmethod to automatically identify failure cases by setting a threshold for the\ncombination of Dice and Hausdorff distances. This approach reduces the\ntime-consuming task of visually inspecting predicted outputs, allowing for\nfaster identification of failure case candidates. The method was evaluated on\n20 cases of six different organs in CT images from clinical expert curated\ndatasets. By setting the thresholds for the Dice and Hausdorff distances, the\nstudy was able to differentiate between various states of failure cases and\nevaluate over 12 cases visually. This thresholding approach could be extended\nto other organs, leading to faster identification of failure cases and thereby\nimproving the quality of radiation therapy planning.\n","authors":["Amin Honarmandi Shandiz","Attila Rádics","Rajesh Tamada","Makk Árpád","Karolina Glowacka","Lehel Ferenczi","Sandeep Dutta","Michael Fanariotis"],"pdf_url":"https://arxiv.org/pdf/2308.10636v1.pdf","comment":"11 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.08242v2","updated":"2023-08-21T11:13:59Z","published":"2023-08-16T09:16:05Z","title":"Contrastive Learning for Lane Detection via Cross-Similarity","summary":" Detecting road lanes is challenging due to intricate markings vulnerable to\nunfavorable conditions. Lane markings have strong shape priors, but their\nvisibility is easily compromised. Factors like lighting, weather, vehicles,\npedestrians, and aging colors challenge the detection. A large amount of data\nis required to train a lane detection approach that can withstand natural\nvariations caused by low visibility. This is because there are numerous lane\nshapes and natural variations that exist. Our solution, Contrastive Learning\nfor Lane Detection via cross-similarity (CLLD), is a self-supervised learning\nmethod that tackles this challenge by enhancing lane detection models\nresilience to real-world conditions that cause lane low visibility. CLLD is a\nnovel multitask contrastive learning that trains lane detection approaches to\ndetect lane markings even in low visible situations by integrating local\nfeature contrastive learning (CL) with our new proposed operation\ncross-similarity. Local feature CL focuses on extracting features for small\nimage parts, which is necessary to localize lane segments, while\ncross-similarity captures global features to detect obscured lane segments\nusing their surrounding. We enhance cross-similarity by randomly masking parts\nof input images for augmentation. Evaluated on benchmark datasets, CLLD\noutperforms state-of-the-art contrastive learning, especially in\nvisibility-impairing conditions like shadows. Compared to supervised learning,\nCLLD excels in scenarios like shadows and crowded scenes.\n","authors":["Ali Zoljodi","Sadegh Abadijou","Mina Alibeigi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2308.08242v2.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.10632v1","updated":"2023-08-21T11:07:27Z","published":"2023-08-21T11:07:27Z","title":"Foundation Model-oriented Robustness: Robust Image Model Evaluation with\n Pretrained Models","summary":" Machine learning has demonstrated remarkable performance over finite\ndatasets, yet whether the scores over the fixed benchmarks can sufficiently\nindicate the model's performance in the real world is still in discussion. 
In\nreality, an ideal robust model will probably behave similarly to the oracle\n(e.g., the human users), thus a good evaluation protocol is probably to\nevaluate the models' behaviors in comparison to the oracle. In this paper, we\nintroduce a new robustness measurement that directly measures the image\nclassification model's performance compared with a surrogate oracle (i.e., a\nfoundation model). Besides, we design a simple method that can accomplish the\nevaluation beyond the scope of the benchmarks. Our method extends the image\ndatasets with new samples that are sufficiently perturbed to be distinct from\nthe ones in the original sets, but are still bounded within the same\nimage-label structure the original test image represents, constrained by a\nfoundation model pretrained with a large amount of samples. As a result, our\nnew method will offer us a new way to evaluate the models' robustness\nperformance, free of limitations of fixed benchmarks or constrained\nperturbations, although scoped by the power of the oracle. In addition to the\nevaluation results, we also leverage our generated data to understand the\nbehaviors of the model and our new evaluation strategies.\n","authors":["Peiyan Zhang","Haoyang Liu","Chaozhuo Li","Xing Xie","Sunghun Kim","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10631v1","updated":"2023-08-21T11:06:43Z","published":"2023-08-21T11:06:43Z","title":"PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from\n Gait","summary":" Psychological trait estimation from external factors such as movement and\nappearance is a challenging and long-standing problem in psychology, and is\nprincipally based on the psychological theory of embodiment. To date, attempts\nto tackle this problem have utilized private small-scale datasets with\nintrusive body-attached sensors. Potential applications of an automated system\nfor psychological trait estimation include estimation of occupational fatigue\nand psychology, and marketing and advertisement. In this work, we propose PsyMo\n(Psychological traits from Motion), a novel, multi-purpose and multi-modal\ndataset for exploring psychological cues manifested in walking patterns. We\ngathered walking sequences from 312 subjects in 7 different walking variations\nand 6 camera angles. In conjunction with walking sequences, participants filled\nin 6 psychological questionnaires, totalling 17 psychometric attributes related\nto personality, self-esteem, fatigue, aggressiveness and mental health. We\npropose two evaluation protocols for psychological trait estimation. Alongside\nthe estimation of self-reported psychological traits from gait, the dataset can\nbe used as a drop-in replacement to benchmark methods for gait recognition. We\nanonymize all cues related to the identity of the subjects and publicly release\nonly silhouettes, 2D / 3D human skeletons and 3D SMPL human meshes.\n","authors":["Adrian Cosma","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2308.10631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16572v2","updated":"2023-08-21T11:05:22Z","published":"2023-07-31T11:05:55Z","title":"Transferable Attack for Semantic Segmentation","summary":" We analysis performance of semantic segmentation models wrt. adversarial\nattacks, and observe that the adversarial examples generated from a source\nmodel fail to attack the target models. 
i.e The conventional attack methods,\nsuch as PGD and FGSM, do not transfer well to target models, making it\nnecessary to study the transferable attacks, especially transferable attacks\nfor semantic segmentation. We find two main factors to achieve transferable\nattack. Firstly, the attack should come with effective data augmentation and\ntranslation-invariant features to deal with unseen models. Secondly, stabilized\noptimization strategies are needed to find the optimal attack direction. Based\non the above observations, we propose an ensemble attack for semantic\nsegmentation to achieve more effective attacks with higher transferability. The\nsource code and experimental results are publicly available via our project\npage: https://github.com/anucvers/TASS.\n","authors":["Mengqi He","Jing Zhang","Zhaoyuan Yang","Mingyi He","Nick Barnes","Yuchao Dai"],"pdf_url":"https://arxiv.org/pdf/2307.16572v2.pdf","comment":"Source code is available at: https://github.com/anucvers/TASS"},{"id":"http://arxiv.org/abs/2308.10627v1","updated":"2023-08-21T10:56:00Z","published":"2023-08-21T10:56:00Z","title":"Polarimetric Information for Multi-Modal 6D Pose Estimation of\n Photometrically Challenging Objects with Limited Data","summary":" 6D pose estimation pipelines that rely on RGB-only or RGB-D data show\nlimitations for photometrically challenging objects with e.g. textureless\nsurfaces, reflections or transparency. A supervised learning-based method\nutilising complementary polarisation information as input modality is proposed\nto overcome such limitations. This supervised approach is then extended to a\nself-supervised paradigm by leveraging physical characteristics of polarised\nlight, thus eliminating the need for annotated real data. The methods achieve\nsignificant advancements in pose estimation by leveraging geometric information\nfrom polarised light and incorporating shape priors and invertible physical\nconstraints.\n","authors":["Patrick Ruhkamp","Daoyi Gao","HyunJun Jung","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2308.10627v1.pdf","comment":"Accepted at ICCV 2023 TRICKY Workshop"},{"id":"http://arxiv.org/abs/2308.10623v1","updated":"2023-08-21T10:47:52Z","published":"2023-08-21T10:47:52Z","title":"GaitPT: Skeletons Are All You Need For Gait Recognition","summary":" The analysis of patterns of walking is an important area of research that has\nnumerous applications in security, healthcare, sports and human-computer\ninteraction. Lately, walking patterns have been regarded as a unique\nfingerprinting method for automatic person identification at a distance. In\nthis work, we propose a novel gait recognition architecture called Gait Pyramid\nTransformer (GaitPT) that leverages pose estimation skeletons to capture unique\nwalking patterns, without relying on appearance information. GaitPT adopts a\nhierarchical transformer architecture that effectively extracts both spatial\nand temporal features of movement in an anatomically consistent manner, guided\nby the structure of the human skeleton. Our results show that GaitPT achieves\nstate-of-the-art performance compared to other skeleton-based gait recognition\nworks, in both controlled and in-the-wild scenarios. 
GaitPT obtains 82.6%\naverage accuracy on CASIA-B, surpassing other works by a margin of 6%.\nMoreover, it obtains 52.16% Rank-1 accuracy on GREW, outperforming both\nskeleton-based and appearance-based approaches.\n","authors":["Andy Catruna","Adrian Cosma","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2308.10623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10621v1","updated":"2023-08-21T10:38:32Z","published":"2023-08-21T10:38:32Z","title":"Multi-Modal Dataset Acquisition for Photometrically Challenging Object","summary":" This paper addresses the limitations of current datasets for 3D vision tasks\nin terms of accuracy, size, realism, and suitable imaging modalities for\nphotometrically challenging objects. We propose a novel annotation and\nacquisition pipeline that enhances existing 3D perception and 6D object pose\ndatasets. Our approach integrates robotic forward-kinematics, external infrared\ntrackers, and improved calibration and annotation procedures. We present a\nmulti-modal sensor rig, mounted on a robotic end-effector, and demonstrate how\nit is integrated into the creation of highly accurate datasets. Additionally,\nwe introduce a freehand procedure for wider viewpoint coverage. Both approaches\nyield high-quality 3D data with accurate object and camera pose annotations.\nOur methods overcome the limitations of existing datasets and provide valuable\nresources for 3D vision research.\n","authors":["HyunJun Jung","Patrick Ruhkamp","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2308.10621v1.pdf","comment":"Accepted at ICCV 2023 TRICKY Workshop"},{"id":"http://arxiv.org/abs/2203.05186v2","updated":"2023-08-21T10:31:12Z","published":"2022-03-10T06:41:07Z","title":"Suspected Object Matters: Rethinking Model's Prediction for One-stage\n Visual Grounding","summary":" Recently, one-stage visual grounders attract high attention due to their\ncomparable accuracy but significantly higher efficiency than two-stage\ngrounders. However, inter-object relation modeling has not been well studied\nfor one-stage grounders. Inter-object relationship modeling, though important,\nis not necessarily performed among all objects, as only part of them are\nrelated to the text query and may confuse the model. We call these objects\nsuspected objects. However, exploring their relationships in the one-stage\nparadigm is non-trivial because: First, no object proposals are available as\nthe basis on which to select suspected objects and perform relationship\nmodeling. Second, suspected objects are more confusing than others, as they may\nshare similar semantics, be entangled with certain relationships, etc, and\nthereby more easily mislead the model prediction. Toward this end, we propose a\nSuspected Object Transformation mechanism (SOT), which can be seamlessly\nintegrated into existing CNN and Transformer-based one-stage visual grounders\nto encourage the target object selection among the suspected ones. Suspected\nobjects are dynamically discovered from a learned activation map adapted to the\nmodel current discrimination ability during training. Afterward, on top of\nsuspected objects, a Keyword-Aware Discrimination module (KAD) and an\nExploration by Random Connection strategy (ERC) are concurrently proposed to\nhelp the model rethink its initial prediction. On the one hand, KAD leverages\nkeywords contributing high to suspected object discrimination. 
On the other\nhand, ERC allows the model to seek the correct object instead of being trapped\nin a situation that always exploits the current false prediction. Extensive\nexperiments demonstrate the effectiveness of our proposed method.\n","authors":["Yang Jiao","Zequn Jie","Jingjing Chen","Lin Ma","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2203.05186v2.pdf","comment":"Accepted to ACM MM 23"},{"id":"http://arxiv.org/abs/2308.10610v1","updated":"2023-08-21T10:20:46Z","published":"2023-08-21T10:20:46Z","title":"Ultrafast and Ultralight Network-Based Intelligent System for Real-time\n Diagnosis of Ear diseases in Any Devices","summary":" Traditional ear disease diagnosis heavily depends on experienced specialists\nand specialized equipment, frequently resulting in misdiagnoses, treatment\ndelays, and financial burdens for some patients. Utilizing deep learning models\nfor efficient ear disease diagnosis has proven effective and affordable.\nHowever, existing research overlooked model inference speed and parameter size\nrequired for deployment. To tackle these challenges, we constructed a\nlarge-scale dataset comprising eight ear disease categories and normal ear\ncanal samples from two hospitals. Inspired by ShuffleNetV2, we developed\nBest-EarNet, an ultrafast and ultralight network enabling real-time ear disease\ndiagnosis. Best-EarNet incorporates the novel Local-Global Spatial Feature\nFusion Module which can capture global and local spatial information\nsimultaneously and guide the network to focus on crucial regions within feature\nmaps at various levels, mitigating low accuracy issues. Moreover, our network\nuses multiple auxiliary classification heads for efficient parameter\noptimization. With 0.77M parameters, Best-EarNet achieves an average frames per\nsecond of 80 on CPU. Employing transfer learning and five-fold cross-validation\nwith 22,581 images from Hospital-1, the model achieves an impressive 95.23%\naccuracy. External testing on 1,652 images from Hospital-2 validates its\nperformance, yielding 92.14% accuracy. Compared to state-of-the-art networks,\nBest-EarNet establishes a new state-of-the-art (SOTA) in practical\napplications. Most importantly, we developed an intelligent diagnosis system\ncalled Ear Keeper, which can be deployed on common electronic devices. By\nmanipulating a compact electronic otoscope, users can perform comprehensive\nscanning and diagnosis of the ear canal using real-time video. This study\nprovides a novel paradigm for ear endoscopy and other medical endoscopic image\nrecognition applications.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Haihua Liang","Fan Zhang","Yanmei Chen","Zefeng Xie","Wenrui Wu","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v1.pdf","comment":"This manuscript has been submitted to Neural Networks"},{"id":"http://arxiv.org/abs/2308.10608v1","updated":"2023-08-21T10:16:52Z","published":"2023-08-21T10:16:52Z","title":"FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly","summary":" While text-3D editing has made significant strides in leveraging score\ndistillation sampling, emerging approaches still fall short in delivering\nseparable, precise and consistent outcomes that are vital to content creation.\nIn response, we introduce FocalDreamer, a framework that merges base shape with\neditable parts according to text prompts for fine-grained editing within\ndesired regions. 
Specifically, equipped with geometry union and dual-path\nrendering, FocalDreamer assembles independent 3D parts into a complete object,\ntailored for convenient instance reuse and part-wise control. We propose\ngeometric focal loss and style consistency regularization, which encourage\nfocal fusion and congruent overall appearance. Furthermore, FocalDreamer\ngenerates high-fidelity geometry and PBR textures which are compatible with\nwidely-used graphics engines. Extensive experiments have highlighted the\nsuperior editing capabilities of FocalDreamer in both quantitative and\nqualitative evaluations.\n","authors":["Yuhan Li","Yishun Dou","Yue Shi","Yu Lei","Xuanhong Chen","Yi Zhang","Peng Zhou","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2308.10608v1.pdf","comment":"Project website: https://fantasia3d.github.io"},{"id":"http://arxiv.org/abs/2308.06300v2","updated":"2023-08-21T10:08:11Z","published":"2023-08-11T07:57:12Z","title":"Automatic Classification of Blood Cell Images Using Convolutional Neural\n Network","summary":" Human blood primarily comprises plasma, red blood cells, white blood cells,\nand platelets. It plays a vital role in transporting nutrients to different\norgans, where it stores essential health-related data about the human body.\nBlood cells are utilized to defend the body against diverse infections,\nincluding fungi, viruses, and bacteria. Hence, blood analysis can help\nphysicians assess an individual's physiological condition. Blood cells have\nbeen sub-classified into eight groups: Neutrophils, eosinophils, basophils,\nlymphocytes, monocytes, immature granulocytes (promyelocytes, myelocytes, and\nmetamyelocytes), erythroblasts, and platelets or thrombocytes on the basis of\ntheir nucleus, shape, and cytoplasm. Traditionally, pathologists and\nhematologists in laboratories have examined these blood cells using a\nmicroscope before manually classifying them. The manual approach is slower and\nmore prone to human error. Therefore, it is essential to automate this process.\nIn our paper, transfer learning with CNN pre-trained models. VGG16, VGG19,\nResNet-50, ResNet-101, ResNet-152, InceptionV3, MobileNetV2, and DenseNet-20\napplied to the PBC dataset's normal DIB. The overall accuracy achieved with\nthese models lies between 91.375 and 94.72%. Hence, inspired by these\npre-trained architectures, a model has been proposed to automatically classify\nthe ten types of blood cells with increased accuracy. A novel CNN-based\nframework has been presented to improve accuracy. The proposed CNN model has\nbeen tested on the PBC dataset normal DIB. The outcomes of the experiments\ndemonstrate that our CNN-based framework designed for blood cell classification\nattains an accuracy of 99.91% on the PBC dataset. Our proposed convolutional\nneural network model performs competitively when compared to earlier results\nreported in the literature.\n","authors":["Rabia Asghar","Sanjay Kumar","Paul Hynds","Abeera Mahfooz"],"pdf_url":"https://arxiv.org/pdf/2308.06300v2.pdf","comment":"15"},{"id":"http://arxiv.org/abs/2308.10604v1","updated":"2023-08-21T10:00:59Z","published":"2023-08-21T10:00:59Z","title":"BackTrack: Robust template update via Backward Tracking of candidate\n template","summary":" Variations of target appearance such as deformations, illumination variance,\nocclusion, etc., are the major challenges of visual object tracking that\nnegatively impact the performance of a tracker. 
An effective method to tackle\nthese challenges is template update, which updates the template to reflect the\nchange of appearance in the target object during tracking. However, with\ntemplate updates, inadequate quality of new templates or inappropriate timing\nof updates may induce a model drift problem, which severely degrades the\ntracking performance. Here, we propose BackTrack, a robust and reliable method\nto quantify the confidence of the candidate template by backward tracking it on\nthe past frames. Based on the confidence score of candidates from BackTrack, we\ncan update the template with a reliable candidate at the right time while\nrejecting unreliable candidates. BackTrack is a generic template update scheme\nand is applicable to any template-based trackers. Extensive experiments on\nvarious tracking benchmarks verify the effectiveness of BackTrack over existing\ntemplate update algorithms, as it achieves SOTA performance on various tracking\nbenchmarks.\n","authors":["Dongwook Lee","Wonjun Choi","Seohyung Lee","ByungIn Yoo","Eunho Yang","Seongju Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.10604v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.10603v1","updated":"2023-08-21T10:00:46Z","published":"2023-08-21T10:00:46Z","title":"A step towards understanding why classification helps regression","summary":" A number of computer vision deep regression approaches report improved\nresults when adding a classification loss to the regression loss. Here, we\nexplore why this is useful in practice and when it is beneficial. To do so, we\nstart from precisely controlled dataset variations and data samplings and find\nthat the effect of adding a classification loss is the most pronounced for\nregression with imbalanced data. We explain these empirical findings by\nformalizing the relation between the balanced and imbalanced regression losses.\nFinally, we show that our findings hold on two real imbalanced image datasets\nfor depth estimation (NYUD2-DIR), and age estimation (IMDB-WIKI-DIR), and on\nthe problem of imbalanced video progress prediction (Breakfast). Our main\ntakeaway is: for a regression task, if the data sampling is imbalanced, then\nadd a classification loss.\n","authors":["Silvia L. Pintea","Yancong Lin","Jouke Dijkstra","Jan C. van Gemert"],"pdf_url":"https://arxiv.org/pdf/2308.10603v1.pdf","comment":"Accepted at ICCV-2023"},{"id":"http://arxiv.org/abs/2308.10601v1","updated":"2023-08-21T09:58:13Z","published":"2023-08-21T09:58:13Z","title":"Improving the Transferability of Adversarial Examples with Arbitrary\n Style Transfer","summary":" Deep neural networks are vulnerable to adversarial examples crafted by\napplying human-imperceptible perturbations on clean inputs. Although many\nattack methods can achieve high success rates in the white-box setting, they\nalso exhibit weak transferability in the black-box setting. Recently, various\nmethods have been proposed to improve adversarial transferability, in which the\ninput transformation is one of the most effective methods. In this work, we\nnotice that existing input transformation-based works mainly adopt the\ntransformed data in the same domain for augmentation. Inspired by domain\ngeneralization, we aim to further improve the transferability using the data\naugmented from different domains. Specifically, a style transfer network can\nalter the distribution of low-level visual features in an image while\npreserving semantic content for humans. 
Hence, we propose a novel attack method\nnamed Style Transfer Method (STM) that utilizes a proposed arbitrary style\ntransfer network to transform the images into different domains. To avoid\ninconsistent semantic information of stylized images for the classification\nnetwork, we fine-tune the style transfer network and mix up the generated\nimages added by random noise with the original images to maintain semantic\nconsistency and boost input diversity. Extensive experimental results on the\nImageNet-compatible dataset show that our proposed method can significantly\nimprove the adversarial transferability on either normally trained models or\nadversarially trained models than state-of-the-art input transformation-based\nattacks. Code is available at: https://github.com/Zhijin-Ge/STM.\n","authors":["Zhijin Ge","Fanhua Shang","Hongying Liu","Yuanyuan Liu","Liang Wan","Wei Feng","Xiaosen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10601v1.pdf","comment":"10 pages, 2 figures, accepted by the 31st ACM International\n Conference on Multimedia (MM '23)"},{"id":"http://arxiv.org/abs/2308.10599v1","updated":"2023-08-21T09:56:48Z","published":"2023-08-21T09:56:48Z","title":"Image-free Classifier Injection for Zero-Shot Classification","summary":" Zero-shot learning models achieve remarkable results on image classification\nfor samples from classes that were not seen during training. However, such\nmodels must be trained from scratch with specialised methods: therefore, access\nto a training dataset is required when the need for zero-shot classification\narises. In this paper, we aim to equip pre-trained models with zero-shot\nclassification capabilities without the use of image data. We achieve this with\nour proposed Image-free Classifier Injection with Semantics (ICIS) that injects\nclassifiers for new, unseen classes into pre-trained classification models in a\npost-hoc fashion without relying on image data. Instead, the existing\nclassifier weights and simple class-wise descriptors, such as class names or\nattributes, are used. ICIS has two encoder-decoder networks that learn to\nreconstruct classifier weights from descriptors (and vice versa), exploiting\n(cross-)reconstruction and cosine losses to regularise the decoding process.\nNotably, ICIS can be cheaply trained and applied directly on top of pre-trained\nclassification models. Experiments on benchmark ZSL datasets show that ICIS\nproduces unseen classifier weights that achieve strong (generalised) zero-shot\nclassification performance. Code is available at\nhttps://github.com/ExplainableML/ImageFreeZSL .\n","authors":["Anders Christensen","Massimiliano Mancini","A. Sophia Koepke","Ole Winther","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2308.10599v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.16670v2","updated":"2023-08-21T09:36:07Z","published":"2023-07-31T13:47:33Z","title":"Conditioning Generative Latent Optimization to solve Imaging Inverse\n Problems","summary":" Computed Tomography (CT) is a prominent example of Imaging Inverse Problem\n(IIP), highlighting the unrivalled performances of data-driven methods in\ndegraded measurements setups like sparse X-ray projections. Although a\nsignificant proportion of deep learning approaches benefit from large\nsupervised datasets to directly map experimental measurements to medical scans,\nthey cannot generalize to unknown acquisition setups. 
In contrast, fully\nunsupervised techniques, most notably using score-based generative models, have\nrecently demonstrated similar or better performances compared to supervised\napproaches to solve IIPs while being flexible at test time regarding the\nimaging setup. However, their use cases are limited by two factors: (a) they\nneed considerable amounts of training data to have good generalization\nproperties and (b) they require a backward operator, like\nFiltered-Back-Projection in the case of CT, to condition the learned prior\ndistribution of medical scans to experimental measurements. To overcome these\nissues, we propose an unsupervised conditional approach to the Generative\nLatent Optimization framework (cGLO), in which the parameters of a decoder\nnetwork are initialized on an unsupervised dataset. The decoder is then used\nfor reconstruction purposes, by performing Generative Latent Optimization with\na loss function directly comparing simulated measurements from proposed\nreconstructions to experimental measurements. The resulting approach, tested on\nsparse-view CT using multiple training dataset sizes, demonstrates better\nreconstruction quality compared to state-of-the-art score-based strategies in\nmost data regimes and shows an increasing performance advantage for smaller\ntraining datasets and reduced projection angles. Furthermore, cGLO does not\nrequire any backward operator and could expand use cases even to non-linear\nIIPs.\n","authors":["Thomas Braure","Kévin Ginsburger"],"pdf_url":"https://arxiv.org/pdf/2307.16670v2.pdf","comment":"comments: 20 pages, 9 figures; typos corrected"},{"id":"http://arxiv.org/abs/2306.16527v2","updated":"2023-08-21T09:35:52Z","published":"2023-06-21T14:01:01Z","title":"OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text\n Documents","summary":" Large multimodal models trained on natural documents, which interleave images\nand text, outperform models trained on image-text pairs on various multimodal\nbenchmarks. However, the datasets used to train these models have not been\nreleased, and the collection process has not been fully specified. We introduce\nthe OBELICS dataset, an open web-scale filtered dataset of interleaved\nimage-text documents comprising 141 million web pages extracted from Common\nCrawl, 353 million associated images, and 115 billion text tokens. We describe\nthe dataset creation process, present comprehensive filtering rules, and\nprovide an analysis of the dataset's content. To show the viability of OBELICS,\nwe train vision and language models of 9 and 80 billion parameters named\nIDEFICS, and obtain competitive performance on different multimodal benchmarks.\nWe release our dataset, models and code.\n","authors":["Hugo Laurençon","Lucile Saulnier","Léo Tronchon","Stas Bekman","Amanpreet Singh","Anton Lozhkov","Thomas Wang","Siddharth Karamcheti","Alexander M. Rush","Douwe Kiela","Matthieu Cord","Victor Sanh"],"pdf_url":"https://arxiv.org/pdf/2306.16527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06296v2","updated":"2023-08-21T09:35:24Z","published":"2023-08-11T06:32:25Z","title":"Classification of White Blood Cells Using Machine and Deep Learning\n Models: A Systematic Review","summary":" Machine learning (ML) and deep learning (DL) models have been employed to\nsignificantly improve analyses of medical imagery, with these approaches used\nto enhance the accuracy of prediction and classification. Model predictions and\nclassifications assist diagnoses of various cancers and tumors. 
This review\npresents an in-depth analysis of modern techniques applied within the domain of\nmedical image analysis for white blood cell classification. The methodologies\nthat use blood smear images, magnetic resonance imaging (MRI), X-rays, and\nsimilar medical imaging domains are identified and discussed, with a detailed\nanalysis of ML/DL techniques applied to the classification of white blood cells\n(WBCs) representing the primary focus of the review. The data utilized in this\nresearch has been extracted from a collection of 136 primary papers that were\npublished between the years 2006 and 2023. The most widely used techniques and\nbest-performing white blood cell classification methods are identified. While\nthe use of ML and DL for white blood cell classification has concurrently\nincreased and improved in recent years, significant challenges remain - 1)\nAvailability of appropriate datasets remains the primary challenge, and may be\nresolved using data augmentation techniques. 2) Medical training of researchers\nis recommended to improve current understanding of white blood cell structure\nand subsequent selection of appropriate classification models. 3) Advanced DL\nnetworks including Generative Adversarial Networks, R-CNN, Fast R-CNN, and\nfaster R-CNN will likely be increasingly employed to supplement or replace\ncurrent techniques.\n","authors":["Rabia Asghar","Sanjay Kumar","Paul Hynds","Arslan Shaukat"],"pdf_url":"https://arxiv.org/pdf/2308.06296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16475v3","updated":"2023-08-21T09:30:58Z","published":"2022-03-30T17:12:18Z","title":"Concept Evolution in Deep Learning Training: A Unified Interpretation\n Framework and Discoveries","summary":" We present ConceptEvo, a unified interpretation framework for deep neural\nnetworks (DNNs) that reveals the inception and evolution of learned concepts\nduring training. Our work addresses a critical gap in DNN interpretation\nresearch, as existing methods primarily focus on post-training interpretation.\nConceptEvo introduces two novel technical contributions: (1) an algorithm that\ngenerates a unified semantic space, enabling side-by-side comparison of\ndifferent models during training, and (2) an algorithm that discovers and\nquantifies important concept evolutions for class predictions. Through a\nlarge-scale human evaluation and quantitative experiments, we demonstrate that\nConceptEvo successfully identifies concept evolutions across different models,\nwhich are not only comprehensible to humans but also crucial for class\npredictions. ConceptEvo is applicable to both modern DNN architectures, such as\nConvNeXt, and classic DNNs, such as VGGs and InceptionV3.\n","authors":["Haekyu Park","Seongmin Lee","Benjamin Hoover","Austin P. Wright","Omar Shaikh","Rahul Duggal","Nilaksh Das","Judy Hoffman","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2203.16475v3.pdf","comment":"Accepted at CIKM'23"},{"id":"http://arxiv.org/abs/2305.17934v2","updated":"2023-08-21T09:18:03Z","published":"2023-05-29T07:54:04Z","title":"ZeroPose: CAD-Model-based Zero-Shot Pose Estimation","summary":" In this paper, we present a CAD model-based zero-shot pose estimation\npipeline called ZeroPose. Existing pose estimation methods still require\nexpensive training when applied to an unseen object, which greatly hinders\ntheir scalability in practical industrial applications. 
In contrast, the\nproposed method enables the accurate estimation of pose parameters for\npreviously unseen objects without the need for training. Specifically, we\ndesign a two-step pipeline consisting of CAD model-based zero-shot instance\nsegmentation and a zero-shot pose estimator. For the first step, there is a\nsimple but effective way to leverage CAD models and visual foundation models\nSAM and Imagebind to segment the interest unseen object at the instance level.\nFor the second step, we based on the intensive geometric information in the CAD\nmodel of the rigid object to propose a lightweight hierarchical geometric\nstructure matching mechanism achieving zero-shot pose estimation. Extensive\nexperimental results on the seven core datasets on the BOP challenge show that\nthe proposed zero-shot instance segmentation methods achieve comparable\nperformance with supervised MaskRCNN and the zero-shot pose estimation results\noutperform the SOTA pose estimators with better efficiency.\n","authors":["Jianqiu Chen","Mingshan Sun","Tianpeng Bao","Rui Zhao","Liwei Wu","Zhenyu He"],"pdf_url":"https://arxiv.org/pdf/2305.17934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10574v1","updated":"2023-08-21T09:14:18Z","published":"2023-08-21T09:14:18Z","title":"CHORD: Category-level Hand-held Object Reconstruction via Shape\n Deformation","summary":" In daily life, humans utilize hands to manipulate objects. Modeling the shape\nof objects that are manipulated by the hand is essential for AI to comprehend\ndaily tasks and to learn manipulation skills. However, previous approaches have\nencountered difficulties in reconstructing the precise shapes of hand-held\nobjects, primarily owing to a deficiency in prior shape knowledge and\ninadequate data for training. As illustrated, given a particular type of tool,\nsuch as a mug, despite its infinite variations in shape and appearance, humans\nhave a limited number of 'effective' modes and poses for its manipulation. This\ncan be attributed to the fact that humans have mastered the shape prior of the\n'mug' category, and can quickly establish the corresponding relations between\ndifferent mug instances and the prior, such as where the rim and handle are\nlocated. In light of this, we propose a new method, CHORD, for Category-level\nHand-held Object Reconstruction via shape Deformation. CHORD deforms a\ncategorical shape prior for reconstructing the intra-class objects. To ensure\naccurate reconstruction, we empower CHORD with three types of awareness:\nappearance, shape, and interacting pose. In addition, we have constructed a new\ndataset, COMIC, of category-level hand-object interaction. COMIC contains a\nrich array of object instances, materials, hand interactions, and viewing\ndirections. Extensive evaluation shows that CHORD outperforms state-of-the-art\napproaches in both quantitative and qualitative measures. 
Code, model, and\ndatasets are available at https://kailinli.github.io/CHORD.\n","authors":["Kailin Li","Lixin Yang","Haoyu Zhen","Zenan Lin","Xinyu Zhan","Licheng Zhong","Jian Xu","Kejian Wu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2308.10574v1.pdf","comment":"To be presented at ICCV 2023, Paris"},{"id":"http://arxiv.org/abs/2305.12411v3","updated":"2023-08-21T09:07:07Z","published":"2023-05-21T09:22:24Z","title":"Synthesizing Diverse Human Motions in 3D Indoor Scenes","summary":" We present a novel method for populating 3D indoor scenes with virtual humans\nthat can navigate in the environment and interact with objects in a realistic\nmanner. Existing approaches rely on training sequences that contain captured\nhuman motions and the 3D scenes they interact with. However, such interaction\ndata are costly, difficult to capture, and can hardly cover all plausible\nhuman-scene interactions in complex environments. To address these challenges,\nwe propose a reinforcement learning-based approach that enables virtual humans\nto navigate in 3D scenes and interact with objects realistically and\nautonomously, driven by learned motion control policies. The motion control\npolicies employ latent motion action spaces, which correspond to realistic\nmotion primitives and are learned from large-scale motion capture data using a\npowerful generative motion model. For navigation in a 3D environment, we\npropose a scene-aware policy with novel state and reward designs for collision\navoidance. Combined with navigation mesh-based path-finding algorithms to\ngenerate intermediate waypoints, our approach enables the synthesis of diverse\nhuman motions navigating in 3D indoor scenes and avoiding obstacles. To\ngenerate fine-grained human-object interactions, we carefully curate\ninteraction goal guidance using a marker-based body representation and leverage\nfeatures based on the signed distance field (SDF) to encode human-scene\nproximity relations. Our method can synthesize realistic and diverse\nhuman-object interactions (e.g.,~sitting on a chair and then getting up) even\nfor out-of-distribution test scenarios with different object shapes,\norientations, starting body positions, and poses. Experimental results\ndemonstrate that our approach outperforms state-of-the-art methods in terms of\nboth motion naturalness and diversity. Code and video results are available at:\nhttps://zkf1997.github.io/DIMOS.\n","authors":["Kaifeng Zhao","Yan Zhang","Shaofei Wang","Thabo Beeler","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2305.12411v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10570v1","updated":"2023-08-21T09:01:14Z","published":"2023-08-21T09:01:14Z","title":"Self-Feedback DETR for Temporal Action Detection","summary":" Temporal Action Detection (TAD) is challenging but fundamental for real-world\nvideo applications. Recently, DETR-based models have been devised for TAD but\nhave not performed well yet. In this paper, we point out the problem in the\nself-attention of DETR for TAD; the attention modules focus on a few key\nelements, called temporal collapse problem. It degrades the capability of the\nencoder and decoder since their self-attention modules play no role. To solve\nthe problem, we propose a novel framework, Self-DETR, which utilizes\ncross-attention maps of the decoder to reactivate self-attention modules. We\nrecover the relationship between encoder features by simple matrix\nmultiplication of the cross-attention map and its transpose. 
Likewise, we also\nget the information within decoder queries. By guiding collapsed self-attention\nmaps with the guidance map calculated, we settle down the temporal collapse of\nself-attention modules in the encoder and decoder. Our extensive experiments\ndemonstrate that Self-DETR resolves the temporal collapse problem by keeping\nhigh diversity of attention over all layers.\n","authors":["Jihwan Kim","Miso Lee","Jae-Pil Heo"],"pdf_url":"https://arxiv.org/pdf/2308.10570v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10569v1","updated":"2023-08-21T08:59:59Z","published":"2023-08-21T08:59:59Z","title":"RT-MonoDepth: Real-time Monocular Depth Estimation on Embedded Systems","summary":" Depth sensing is a crucial function of unmanned aerial vehicles and\nautonomous vehicles. Due to the small size and simple structure of monocular\ncameras, there has been a growing interest in depth estimation from a single\nRGB image. However, state-of-the-art monocular CNN-based depth estimation\nmethods using fairly complex deep neural networks are too slow for real-time\ninference on embedded platforms. This paper addresses the problem of real-time\ndepth estimation on embedded systems. We propose two efficient and lightweight\nencoder-decoder network architectures, RT-MonoDepth and RT-MonoDepth-S, to\nreduce computational complexity and latency. Our methodologies demonstrate that\nit is possible to achieve similar accuracy as prior state-of-the-art works on\ndepth estimation at a faster inference speed. Our proposed networks,\nRT-MonoDepth and RT-MonoDepth-S, runs at 18.4\\&30.5 FPS on NVIDIA Jetson Nano\nand 253.0\\&364.1 FPS on NVIDIA Jetson AGX Orin on a single RGB image of\nresolution 640$\\times$192, and achieve relative state-of-the-art accuracy on\nthe KITTI dataset. To the best of the authors' knowledge, this paper achieves\nthe best accuracy and fastest inference speed compared with existing fast\nmonocular depth estimation methods.\n","authors":["Cheng Feng","Zhen Chen","Congxuan Zhang","Weiming Hu","Bing Li","Feng Lu"],"pdf_url":"https://arxiv.org/pdf/2308.10569v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2212.02469v3","updated":"2023-08-21T08:59:06Z","published":"2022-12-05T18:24:06Z","title":"One-shot Implicit Animatable Avatars with Model-based Priors","summary":" Existing neural rendering methods for creating human avatars typically either\nrequire dense input signals such as video or multi-view images, or leverage a\nlearned prior from large-scale specific 3D human datasets such that\nreconstruction can be performed with sparse-view inputs. Most of these methods\nfail to achieve realistic reconstruction when only a single image is available.\nTo enable the data-efficient creation of realistic animatable 3D humans, we\npropose ELICIT, a novel method for learning human-specific neural radiance\nfields from a single image. Inspired by the fact that humans can effortlessly\nestimate the body geometry and imagine full-body clothing from a single image,\nwe leverage two priors in ELICIT: 3D geometry prior and visual semantic prior.\nSpecifically, ELICIT utilizes the 3D body shape geometry prior from a skinned\nvertex-based template model (i.e., SMPL) and implements the visual clothing\nsemantic prior with the CLIP-based pretrained models. Both priors are used to\njointly guide the optimization for creating plausible content in the invisible\nareas. 
Taking advantage of the CLIP models, ELICIT can use text descriptions to\ngenerate text-conditioned unseen regions. In order to further improve visual\ndetails, we propose a segmentation-based sampling strategy that locally refines\ndifferent parts of the avatar. Comprehensive evaluations on multiple popular\nbenchmarks, including ZJU-MoCAP, Human3.6M, and DeepFashion, show that ELICIT\nhas outperformed strong baseline methods of avatar creation when only a single\nimage is available. The code is public for research purposes at\nhttps://huangyangyi.github.io/ELICIT/.\n","authors":["Yangyi Huang","Hongwei Yi","Weiyang Liu","Haofan Wang","Boxi Wu","Wenxiao Wang","Binbin Lin","Debing Zhang","Deng Cai"],"pdf_url":"https://arxiv.org/pdf/2212.02469v3.pdf","comment":"To appear at ICCV 2023. Project website:\n https://huangyangyi.github.io/ELICIT/"},{"id":"http://arxiv.org/abs/2307.00811v2","updated":"2023-08-21T08:45:07Z","published":"2023-07-03T07:51:08Z","title":"Review helps learn better: Temporal Supervised Knowledge Distillation","summary":" Reviewing plays an important role when learning knowledge. The knowledge\nacquisition at a certain time point may be strongly inspired with the help of\nprevious experience. Thus the knowledge growing procedure should show strong\nrelationship along the temporal dimension. In our research, we find that during\nthe network training, the evolution of feature map follows temporal sequence\nproperty. A proper temporal supervision may further improve the network\ntraining performance. Inspired by this observation, we propose Temporal\nSupervised Knowledge Distillation (TSKD). Specifically, we extract the\nspatiotemporal features in the different training phases of student by\nconvolutional Long Short-term memory network (Conv-LSTM). Then, we train the\nstudent net through a dynamic target, rather than static teacher network\nfeatures. This process realizes the refinement of old knowledge in student\nnetwork, and utilizes it to assist current learning. Extensive experiments\nverify the effectiveness and advantages of our method over existing knowledge\ndistillation methods, including various network architectures and different\ntasks (image classification and object detection) .\n","authors":["Dongwei Wang","Zhi Han","Yanmei Wang","Xiai Chen","Baichen Liu","Yandong Tang"],"pdf_url":"https://arxiv.org/pdf/2307.00811v2.pdf","comment":"Under review in AAAI 2024"},{"id":"http://arxiv.org/abs/2308.10562v1","updated":"2023-08-21T08:37:04Z","published":"2023-08-21T08:37:04Z","title":"Seeing the Intangible: Surveying Automatic High-Level Visual\n Understanding from Still Images","summary":" The field of Computer Vision (CV) was born with the single grand goal of\ncomplete image understanding: providing a complete semantic interpretation of\nan input image. What exactly this goal entails is not immediately\nstraightforward, but theoretical hierarchies of visual understanding point\ntowards a top level of full semantics, within which sits the most complex and\nsubjective information humans can detect from visual data. In particular,\nnon-concrete concepts including emotions, social values and ideologies seem to\nbe protagonists of this \"high-level\" visual semantic understanding. 
While such\n\"abstract concepts\" are critical tools for image management and retrieval,\ntheir automatic recognition is still a challenge, exactly because they rest at\nthe top of the \"semantic pyramid\": the well-known semantic gap problem is\nworsened given their lack of unique perceptual referents, and their reliance on\nmore unspecific features than concrete concepts. Given that there seems to be\nvery scarce explicit work within CV on the task of abstract social concept\n(ASC) detection, and that many recent works seem to discuss similar\nnon-concrete entities by using different terminology, in this survey we provide\na systematic review of CV work that explicitly or implicitly approaches the\nproblem of abstract (specifically social) concept detection from still images.\nSpecifically, this survey performs and provides: (1) A study and clustering of\nhigh level visual understanding semantic elements from a multidisciplinary\nperspective (computer science, visual studies, and cognitive perspectives); (2)\nA study and clustering of high level visual understanding computer vision tasks\ndealing with the identified semantic elements, so as to identify current CV\nwork that implicitly deals with AC detection.\n","authors":["Delfina Sol Martinez Pandiani","Valentina Presutti"],"pdf_url":"https://arxiv.org/pdf/2308.10562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10561v1","updated":"2023-08-21T08:36:23Z","published":"2023-08-21T08:36:23Z","title":"Spatial Transform Decoupling for Oriented Object Detection","summary":" Vision Transformers (ViTs) have achieved remarkable success in computer\nvision tasks. However, their potential in rotation-sensitive scenarios has not\nbeen fully explored, and this limitation may be inherently attributed to the\nlack of spatial invariance in the data-forwarding process. In this study, we\npresent a novel approach, termed Spatial Transform Decoupling (STD), providing\na simple-yet-effective solution for oriented object detection with ViTs. Built\nupon stacked ViT blocks, STD utilizes separate network branches to predict the\nposition, size, and angle of bounding boxes, effectively harnessing the spatial\ntransform potential of ViTs in a divide-and-conquer fashion. Moreover, by\naggregating cascaded activation masks (CAMs) computed upon the regressed\nparameters, STD gradually enhances features within regions of interest (RoIs),\nwhich complements the self-attention mechanism. Without bells and whistles, STD\nachieves state-of-the-art performance on the benchmark datasets including\nDOTA-v1.0 (82.24% mAP) and HRSC2016 (98.55% mAP), which demonstrates the\neffectiveness of the proposed method. Source code is available at\nhttps://github.com/yuhongtian17/Spatial-Transform-Decoupling.\n","authors":["Hongtian Yu","Yunjie Tian","Qixiang Ye","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2308.10561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10557v1","updated":"2023-08-21T08:17:42Z","published":"2023-08-21T08:17:42Z","title":"Local Spherical Harmonics Improve Skeleton-Based Hand Action Recognition","summary":" Hand action recognition is essential. Communication, human-robot\ninteractions, and gesture control are dependent on it. Skeleton-based action\nrecognition traditionally includes hands, which belong to the classes which\nremain challenging to correctly recognize to date. 
We propose a method\nspecifically designed for hand action recognition which uses relative angular\nembeddings and local Spherical Harmonics to create novel hand representations.\nThe use of Spherical Harmonics creates rotation-invariant representations which\nmake hand action recognition even more robust against inter-subject differences\nand viewpoint changes. We conduct extensive experiments on the hand joints in\nthe First-Person Hand Action Benchmark with RGB-D Videos and 3D Hand Pose\nAnnotations, and on the NTU RGB+D 120 dataset, demonstrating the benefit of\nusing Local Spherical Harmonics Representations. Our code is available at\nhttps://github.com/KathPra/LSHR_LSHT.\n","authors":["Katharina Prasse","Steffen Jung","Yuxuan Zhou","Margret Keuper"],"pdf_url":"https://arxiv.org/pdf/2308.10557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10554v1","updated":"2023-08-21T08:12:28Z","published":"2023-08-21T08:12:28Z","title":"Improving Diversity in Zero-Shot GAN Adaptation with Semantic Variations","summary":" Training deep generative models usually requires a large amount of data. To\nalleviate the data collection cost, the task of zero-shot GAN adaptation aims\nto reuse well-trained generators to synthesize images of an unseen target\ndomain without any further training samples. Due to the data absence, the\ntextual description of the target domain and the vision-language models, e.g.,\nCLIP, are utilized to effectively guide the generator. However, with only a\nsingle representative text feature instead of real images, the synthesized\nimages gradually lose diversity as the model is optimized, which is also known\nas mode collapse. To tackle the problem, we propose a novel method to find\nsemantic variations of the target text in the CLIP space. Specifically, we\nexplore diverse semantic variations based on the informative text feature of\nthe target domain while regularizing the uncontrolled deviation of the semantic\ninformation. With the obtained variations, we design a novel directional moment\nloss that matches the first and second moments of image and text direction\ndistributions. Moreover, we introduce elastic weight consolidation and a\nrelation consistency loss to effectively preserve valuable content information\nfrom the source domain, e.g., appearances. Through extensive experiments, we\ndemonstrate the efficacy of the proposed methods in ensuring sample diversity\nin various scenarios of zero-shot GAN adaptation. We also conduct ablation\nstudies to validate the effect of each proposed component. Notably, our model\nachieves a new state-of-the-art on zero-shot GAN adaptation in terms of both\ndiversity and quality.\n","authors":["Seogkyu Jeon","Bei Liu","Pilhyeon Lee","Kibeom Hong","Jianlong Fu","Hyeran Byun"],"pdf_url":"https://arxiv.org/pdf/2308.10554v1.pdf","comment":"Accepted to ICCV 2023 (poster)"},{"id":"http://arxiv.org/abs/2212.05853v3","updated":"2023-08-21T08:11:41Z","published":"2022-12-12T12:31:46Z","title":"DeepCut: Unsupervised Segmentation using Graph Neural Networks\n Clustering","summary":" Image segmentation is a fundamental task in computer vision. Data annotation\nfor training supervised methods can be labor-intensive, motivating unsupervised\nmethods. 
Current approaches often rely on extracting deep features from\npre-trained networks to construct a graph, and classical clustering methods\nlike k-means and normalized-cuts are then applied as a post-processing step.\nHowever, this approach reduces the high-dimensional information encoded in the\nfeatures to pair-wise scalar affinities. To address this limitation, this study\nintroduces a lightweight Graph Neural Network (GNN) to replace classical\nclustering methods while optimizing for the same clustering objective function.\nUnlike existing methods, our GNN takes both the pair-wise affinities between\nlocal image features and the raw features as input. This direct connection\nbetween the raw features and the clustering objective enables us to implicitly\nperform classification of the clusters between different graphs, resulting in\npart semantic segmentation without the need for additional post-processing\nsteps. We demonstrate how classical clustering objectives can be formulated as\nself-supervised loss functions for training an image segmentation GNN.\nFurthermore, we employ the Correlation-Clustering (CC) objective to perform\nclustering without defining the number of clusters, allowing for k-less\nclustering. We apply the proposed method for object localization, segmentation,\nand semantic part segmentation tasks, surpassing state-of-the-art performance\non multiple benchmarks.\n","authors":["Amit Aflalo","Shai Bagon","Tamar Kashti","Yonina Eldar"],"pdf_url":"https://arxiv.org/pdf/2212.05853v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10542v1","updated":"2023-08-21T07:52:39Z","published":"2023-08-21T07:52:39Z","title":"Learning Weakly Convex Regularizers for Convergent Image-Reconstruction\n Algorithms","summary":" We propose to learn non-convex regularizers with a prescribed upper bound on\ntheir weak-convexity modulus. Such regularizers give rise to variational\ndenoisers that minimize a convex energy. They rely on few parameters (less than\n15,000) and offer a signal-processing interpretation as they mimic handcrafted\nsparsity-promoting regularizers. Through numerical experiments, we show that\nsuch denoisers outperform convex-regularization methods as well as the popular\nBM3D denoiser. Additionally, the learned regularizer can be deployed to solve\ninverse problems with iterative schemes that provably converge. For both CT and\nMRI reconstruction, the regularizer generalizes well and offers an excellent\ntradeoff between performance, number of parameters, guarantees, and\ninterpretability when compared to other data-driven approaches.\n","authors":["Alexis Goujon","Sebastian Neumayer","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2308.10542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09300v2","updated":"2023-08-21T07:51:00Z","published":"2023-08-18T04:49:38Z","title":"V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by\n Connecting Foundation Models","summary":" Building artificial intelligence (AI) systems on top of a set of foundation\nmodels (FMs) is becoming a new paradigm in AI research. Their representative\nand generative abilities learnt from vast amounts of data can be easily adapted\nand transferred to a wide range of downstream tasks without extra training from\nscratch. However, leveraging FMs in cross-modal generation remains\nunder-researched when audio modality is involved. 
On the other hand,\nautomatically generating semantically-relevant sound from visual input is an\nimportant problem in cross-modal generation studies. To solve this\nvision-to-audio (V2A) generation problem, existing methods tend to design and\nbuild complex systems from scratch using modestly sized datasets. In this\npaper, we propose a lightweight solution to this problem by leveraging\nfoundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate\nthe domain gap between the latent space of the visual CLIP and the auditory\nCLAP models. Then we propose a simple yet effective mapper mechanism\n(V2A-Mapper) to bridge the domain gap by translating the visual input between\nCLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained\naudio generative FM AudioLDM is adopted to produce high-fidelity and\nvisually-aligned sound. Compared to previous approaches, our method only\nrequires a quick training of the V2A-Mapper. We further analyze and conduct\nextensive experiments on the choice of the V2A-Mapper and show that a\ngenerative mapper is better at fidelity and variability (FD) while a regression\nmapper is slightly better at relevance (CS). Both objective and subjective\nevaluation on two V2A datasets demonstrate the superiority of our proposed\nmethod compared to current state-of-the-art approaches - trained with 86% fewer\nparameters but achieving 53% and 19% improvement in FD and CS, respectively.\n","authors":["Heng Wang","Jianbo Ma","Santiago Pascual","Richard Cartwright","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2308.09300v2.pdf","comment":"13 pages, 10 figures. Demo page: https://v2a-mapper.github.io/"},{"id":"http://arxiv.org/abs/2308.09228v2","updated":"2023-08-21T07:39:35Z","published":"2023-08-18T01:20:25Z","title":"Generalized Sum Pooling for Metric Learning","summary":" A common architectural choice for deep metric learning is a convolutional\nneural network followed by global average pooling (GAP). Albeit simple, GAP is\na highly effective way to aggregate information. One possible explanation for\nthe effectiveness of GAP is considering each feature vector as representing a\ndifferent semantic entity and GAP as a convex combination of them. Following\nthis perspective, we generalize GAP and propose a learnable generalized sum\npooling method (GSP). GSP improves GAP with two distinct abilities: i) the\nability to choose a subset of semantic entities, effectively learning to ignore\nnuisance information, and ii) learning the weights corresponding to the\nimportance of each entity. Formally, we propose an entropy-smoothed optimal\ntransport problem and show that it is a strict generalization of GAP, i.e., a\nspecific realization of the problem gives back GAP. We show that this\noptimization problem enjoys analytical gradients enabling us to use it as a\ndirect learnable replacement for GAP. We further propose a zero-shot loss to\nease the learning of GSP. We show the effectiveness of our method with\nextensive evaluations on 4 popular metric learning benchmarks. Code is\navailable at: GSP-DML Framework\n","authors":["Yeti Z. Gurbuz","Ozan Sener","A. 
Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2308.09228v2.pdf","comment":"Accepted as a conference paper at International Conference on\n Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.10533v1","updated":"2023-08-21T07:38:33Z","published":"2023-08-21T07:38:33Z","title":"Joint learning of images and videos with a single Vision Transformer","summary":" In this study, we propose a method for jointly learning of images and videos\nusing a single model. In general, images and videos are often trained by\nseparate models. We propose in this paper a method that takes a batch of images\nas input to Vision Transformer IV-ViT, and also a set of video frames with\ntemporal aggregation by late fusion. Experimental results on two image datasets\nand two action recognition datasets are presented.\n","authors":["Shuki Shimizu","Toru Tamaki"],"pdf_url":"https://arxiv.org/pdf/2308.10533v1.pdf","comment":"MVA2023 (18th International Conference on Machine Vision\n Applications), Hamamatsu, Japan, 23-25 July 2023"},{"id":"http://arxiv.org/abs/2308.03272v2","updated":"2023-08-21T07:37:56Z","published":"2023-08-07T03:27:04Z","title":"Feature-Suppressed Contrast for Self-Supervised Food Pre-training","summary":" Most previous approaches for analyzing food images have relied on extensively\nannotated datasets, resulting in significant human labeling expenses due to the\nvaried and intricate nature of such images. Inspired by the effectiveness of\ncontrastive self-supervised methods in utilizing unlabelled data, weiqing\nexplore leveraging these techniques on unlabelled food images. In contrastive\nself-supervised methods, two views are randomly generated from an image by data\naugmentations. However, regarding food images, the two views tend to contain\nsimilar informative contents, causing large mutual information, which impedes\nthe efficacy of contrastive self-supervised learning. To address this problem,\nwe propose Feature Suppressed Contrast (FeaSC) to reduce mutual information\nbetween views. As the similar contents of the two views are salient or highly\nresponsive in the feature map, the proposed FeaSC uses a response-aware scheme\nto localize salient features in an unsupervised manner. By suppressing some\nsalient features in one view while leaving another contrast view unchanged, the\nmutual information between the two views is reduced, thereby enhancing the\neffectiveness of contrast learning for self-supervised food pre-training. As a\nplug-and-play module, the proposed method consistently improves BYOL and\nSimSiam by 1.70\\% $\\sim$ 6.69\\% classification accuracy on four publicly\navailable food recognition datasets. 
Superior results have also been achieved\non downstream segmentation tasks, demonstrating the effectiveness of the\nproposed method.\n","authors":["Xinda Liu","Yaohui Zhu","Linhu Liu","Jiang Tian","Lili Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03272v2.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.10531v1","updated":"2023-08-21T07:34:31Z","published":"2023-08-21T07:34:31Z","title":"SRFormer: Empowering Regression-Based Text Detection Transformer with\n Segmentation","summary":" Existing techniques for text detection can be broadly classified into two\nprimary groups: segmentation-based methods and regression-based methods.\nSegmentation models offer enhanced robustness to font variations but require\nintricate post-processing, leading to high computational overhead.\nRegression-based methods undertake instance-aware prediction but face\nlimitations in robustness and data efficiency due to their reliance on\nhigh-level representations. In our academic pursuit, we propose SRFormer, a\nunified DETR-based model with amalgamated Segmentation and Regression, aiming\nat the synergistic harnessing of the inherent robustness in segmentation\nrepresentations, along with the straightforward post-processing of\ninstance-level regression. Our empirical analysis indicates that favorable\nsegmentation predictions can be obtained at the initial decoder layers. In\nlight of this, we constrain the incorporation of segmentation branches to the\nfirst few decoder layers and employ progressive regression refinement in\nsubsequent layers, achieving performance gains while minimizing additional\ncomputational load from the mask. Furthermore, we propose a Mask-informed Query\nEnhancement module. We take the segmentation result as a natural soft-ROI to\npool and extract robust pixel representations, which are then employed to\nenhance and diversify instance queries. Extensive experimentation across\nmultiple benchmarks has yielded compelling findings, highlighting our method's\nexceptional robustness, superior training and data efficiency, as well as its\nstate-of-the-art performance.\n","authors":["Qingwen Bu","Sungrae Park","Minsoo Khang","Yichuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.10531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10525v1","updated":"2023-08-21T07:25:58Z","published":"2023-08-21T07:25:58Z","title":"LightDepth: Single-View Depth Self-Supervision from Illumination Decline","summary":" Single-view depth estimation can be remarkably effective if there is enough\nground-truth depth data for supervised training. However, there are scenarios,\nespecially in medicine in the case of endoscopies, where such data cannot be\nobtained. In such cases, multi-view self-supervision and synthetic-to-real\ntransfer serve as alternative approaches, however, with a considerable\nperformance reduction in comparison to supervised case. Instead, we propose a\nsingle-view self-supervised method that achieves a performance similar to the\nsupervised case. In some medical devices, such as endoscopes, the camera and\nlight sources are co-located at a small distance from the target surfaces.\nThus, we can exploit that, for any given albedo and surface orientation, pixel\nbrightness is inversely proportional to the square of the distance to the\nsurface, providing a strong single-view self-supervisory signal. 
In our\nexperiments, our self-supervised models deliver accuracies comparable to those\nof fully supervised ones, while being applicable without depth ground-truth\ndata.\n","authors":["Javier Rodríguez-Puigvert","Víctor M. Batlle","J. M. M. Montiel","Ruben Martinez Cantin","Pascal Fua","Juan D. Tardós","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2308.10525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10524v1","updated":"2023-08-21T07:24:29Z","published":"2023-08-21T07:24:29Z","title":"Dataset Quantization","summary":" State-of-the-art deep neural networks are trained with large amounts\n(millions or even billions) of data. The expensive computation and memory costs\nmake it difficult to train them on limited hardware resources, especially for\nrecent popular large language models (LLM) and computer vision models (CV).\nRecent popular dataset distillation methods are thus developed, aiming to\nreduce the number of training samples via synthesizing small-scale datasets via\ngradient matching. However, as the gradient calculation is coupled with the\nspecific network architecture, the synthesized dataset is biased and performs\npoorly when used for training unseen architectures. To address these\nlimitations, we present dataset quantization (DQ), a new framework to compress\nlarge-scale datasets into small subsets which can be used for training any\nneural network architectures. Extensive experiments demonstrate that DQ is able\nto generate condensed small datasets for training unseen network architectures\nwith state-of-the-art compression ratios for lossless model training. To the\nbest of our knowledge, DQ is the first method that can successfully distill\nlarge-scale datasets such as ImageNet-1k with a state-of-the-art compression\nratio. Notably, with 60% data from ImageNet and 20% data from Alpaca's\ninstruction tuning data, the models can be trained with negligible or no\nperformance drop for both vision tasks (including classification, semantic\nsegmentation, and object detection) as well as language tasks (including\ninstruction tuning tasks such as BBH and DROP).\n","authors":["Daquan Zhou","Kai Wang","Jianyang Gu","Xiangyu Peng","Dongze Lian","Yifan Zhang","Yang You","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2308.10524v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.08530v3","updated":"2023-08-21T07:22:53Z","published":"2023-08-16T17:40:18Z","title":"Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an\n Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction","summary":" Neural Radiance Fields (NeRFs) have revolutionized the field of novel view\nsynthesis, demonstrating remarkable performance. However, the modeling and\nrendering of reflective objects remain challenging problems. Recent methods\nhave shown significant improvements over the baselines in handling reflective\nscenes, albeit at the expense of efficiency. In this work, we aim to strike a\nbalance between efficiency and quality. To this end, we investigate an\nimplicit-explicit approach based on conventional volume rendering to enhance\nthe reconstruction quality and accelerate the training and rendering processes.\nWe adopt an efficient density-based grid representation and reparameterize the\nreflected radiance in our pipeline. Our proposed reflection-aware approach\nachieves a competitive quality efficiency trade-off compared to competing\nmethods. 
Based on our experimental results, we propose and discuss hypotheses\nregarding the factors influencing the results of density-based methods for\nreconstructing reflective objects. The source code is available at\nhttps://github.com/gkouros/ref-dvgo.\n","authors":["Georgios Kouros","Minye Wu","Shubham Shrivastava","Sushruth Nagesh","Punarjay Chakravarty","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2308.08530v3.pdf","comment":"5 pages, 4 figures, 3 tables, ICCV TRICKY 2023 Workshop"},{"id":"http://arxiv.org/abs/2308.10522v1","updated":"2023-08-21T07:19:47Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v1.pdf","comment":"This paper is accepted by the journal Elsevier Neural Networks in\n 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2308.10521v1","updated":"2023-08-21T07:18:51Z","published":"2023-08-21T07:18:51Z","title":"PHE-SICH-CT-IDS: A Benchmark CT Image Dataset for Evaluation Semantic\n Segmentation, Object Detection and Radiomic Feature Extraction of\n Perihematomal Edema in Spontaneous Intracerebral Hemorrhage","summary":" Intracerebral hemorrhage is one of the diseases with the highest mortality\nand poorest prognosis worldwide. Spontaneous intracerebral hemorrhage (SICH)\ntypically presents acutely; prompt and expedited radiological examination is\ncrucial for diagnosis, localization, and quantification of the hemorrhage.\nEarly detection and accurate segmentation of perihematomal edema (PHE) play a\ncritical role in guiding appropriate clinical intervention and enhancing\npatient prognosis. However, the progress and assessment of computer-aided\ndiagnostic methods for PHE segmentation and detection face challenges due to\nthe scarcity of publicly accessible brain CT image datasets. 
This study\nestablishes a publicly available CT dataset named PHE-SICH-CT-IDS for\nperihematomal edema in spontaneous intracerebral hemorrhage. The dataset\ncomprises 120 brain CT scans and 7,022 CT images, along with corresponding\nmedical information of the patients. To demonstrate its effectiveness,\nclassical algorithms for semantic segmentation, object detection, and radiomic\nfeature extraction are evaluated. The experimental results confirm the\nsuitability of PHE-SICH-CT-IDS for assessing the performance of segmentation,\ndetection and radiomic feature extraction methods. To the best of our\nknowledge, this is the first publicly available dataset for PHE in SICH,\ncomprising various data formats suitable for applications across diverse\nmedical scenarios. We believe that PHE-SICH-CT-IDS will allure researchers to\nexplore novel algorithms, providing valuable support for clinicians and\npatients in the clinical setting. PHE-SICH-CT-IDS is freely published for\nnon-commercial purpose at:\nhttps://figshare.com/articles/dataset/PHE-SICH-CT-IDS/23957937.\n","authors":["Deguo Ma","Chen Li","Lin Qiao","Tianming Du","Dechao Tang","Zhiyu Ma","Marcin Grzegorzek Hongzan","Hongzan Sun"],"pdf_url":"https://arxiv.org/pdf/2308.10521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14512v3","updated":"2023-08-21T07:14:04Z","published":"2022-11-26T08:32:28Z","title":"Residual Pattern Learning for Pixel-wise Out-of-Distribution Detection\n in Semantic Segmentation","summary":" Semantic segmentation models classify pixels into a set of known\n(``in-distribution'') visual classes. When deployed in an open world, the\nreliability of these models depends on their ability not only to classify\nin-distribution pixels but also to detect out-of-distribution (OoD) pixels.\nHistorically, the poor OoD detection performance of these models has motivated\nthe design of methods based on model re-training using synthetic training\nimages that include OoD visual objects. Although successful, these re-trained\nmethods have two issues: 1) their in-distribution segmentation accuracy may\ndrop during re-training, and 2) their OoD detection accuracy does not\ngeneralise well to new contexts (e.g., country surroundings) outside the\ntraining set (e.g., city surroundings). In this paper, we mitigate these issues\nwith: (i) a new residual pattern learning (RPL) module that assists the\nsegmentation model to detect OoD pixels without affecting the inlier\nsegmentation performance; and (ii) a novel context-robust contrastive learning\n(CoroCL) that enforces RPL to robustly detect OoD pixels among various\ncontexts. Our approach improves by around 10\\% FPR and 7\\% AuPRC the previous\nstate-of-the-art in Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly\ndatasets. Our code is available at: https://github.com/yyliu01/RPL.\n","authors":["Yuyuan Liu","Choubo Ding","Yu Tian","Guansong Pang","Vasileios Belagiannis","Ian Reid","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2211.14512v3.pdf","comment":"The paper contains 16 pages and it is accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2303.11629v2","updated":"2023-08-21T07:07:43Z","published":"2023-03-21T06:51:31Z","title":"TMA: Temporal Motion Aggregation for Event-based Optical Flow","summary":" Event cameras have the ability to record continuous and detailed trajectories\nof objects with high temporal resolution, thereby providing intuitive motion\ncues for optical flow estimation. 
Nevertheless, most existing learning-based\napproaches for event optical flow estimation directly remould the paradigm of\nconventional images by representing the consecutive event stream as static\nframes, ignoring the inherent temporal continuity of event data. In this paper,\nwe argue that temporal continuity is a vital element of event-based optical\nflow and propose a novel Temporal Motion Aggregation (TMA) approach to unlock\nits potential. Technically, TMA comprises three components: an event splitting\nstrategy to incorporate intermediate motion information underlying the temporal\ncontext, a linear lookup strategy to align temporally fine-grained motion\nfeatures and a novel motion pattern aggregation module to emphasize consistent\npatterns for motion feature enhancement. By incorporating temporally\nfine-grained motion information, TMA can derive better flow estimates than\nexisting methods at early stages, which not only enables TMA to obtain more\naccurate final predictions, but also greatly reduces the demand for a number of\nrefinements. Extensive experiments on DSEC-Flow and MVSEC datasets verify the\neffectiveness and superiority of our TMA. Remarkably, compared to E-RAFT, TMA\nachieves a 6\\% improvement in accuracy and a 40\\% reduction in inference time\non DSEC-Flow. Code will be available at \\url{https://github.com/ispc-lab/TMA}.\n","authors":["Haotian Liu","Guang Chen","Sanqing Qu","Yanping Zhang","Zhijun Li","Alois Knoll","Changjun Jiang"],"pdf_url":"https://arxiv.org/pdf/2303.11629v2.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.10515v1","updated":"2023-08-21T07:06:49Z","published":"2023-08-21T07:06:49Z","title":"QD-BEV : Quantization-aware View-guided Distillation for Multi-view 3D\n Object Detection","summary":" Multi-view 3D detection based on BEV (bird-eye-view) has recently achieved\nsignificant improvements. However, the huge memory consumption of\nstate-of-the-art models makes it hard to deploy them on vehicles, and the\nnon-trivial latency will affect the real-time perception of streaming\napplications. Despite the wide application of quantization to lighten models,\nwe show in our paper that directly applying quantization in BEV tasks will 1)\nmake the training unstable, and 2) lead to intolerable performance degradation.\nTo solve these issues, our method QD-BEV enables a novel view-guided\ndistillation (VGD) objective, which can stabilize the quantization-aware\ntraining (QAT) while enhancing the model performance by leveraging both image\nfeatures and BEV features. Our experiments show that QD-BEV achieves similar or\neven better accuracy than previous methods with significant efficiency gains.\nOn the nuScenes datasets, the 4-bit weight and 6-bit activation quantized\nQD-BEV-Tiny model achieves 37.2% NDS with only 15.8 MB model size,\noutperforming BevFormer-Tiny by 1.8% with an 8x model compression. 
On the Small\nand Base variants, QD-BEV models also perform superbly and achieve 47.9% NDS\n(28.2 MB) and 50.9% NDS (32.9 MB), respectively.\n","authors":["Yifan Zhang","Zhen Dong","Huanrui Yang","Ming Lu","Cheng-Ching Tseng","Yuan Du","Kurt Keutzer","Li Du","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10515v1.pdf","comment":"ICCV 2023 Accepted"},{"id":"http://arxiv.org/abs/2303.08757v4","updated":"2023-08-21T07:02:23Z","published":"2023-03-15T16:53:19Z","title":"CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in\n Patients With Suspected Ischemic Stroke","summary":" Precise and fast prediction methods for ischemic areas comprised of dead\ntissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS)\npatients are of significant clinical interest. They play an essential role in\nimproving diagnosis and treatment planning. Computed Tomography (CT) scan is\none of the primary modalities for early assessment in patients with suspected\nAIS. CT Perfusion (CTP) is often used as a primary assessment to determine\nstroke location, severity, and volume of ischemic lesions. Current automatic\nsegmentation methods for CTP mostly use already processed 3D parametric maps\nconventionally used for clinical interpretation by radiologists as input.\nAlternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time\ninput, where the spatial information over the volume is ignored. In addition,\nthese methods are only interested in segmenting core regions, while predicting\npenumbra can be essential for treatment planning. This paper investigates\ndifferent methods to utilize the entire 4D CTP as input to fully exploit the\nspatio-temporal information, leading us to propose a novel 4D convolution\nlayer. Our comprehensive experiments on a local dataset of 152 patients divided\ninto three groups show that our proposed models generate more precise results\nthan other methods explored. Adopting the proposed 4D mJ-Net, a Dice\nCoefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core\nareas, respectively. The code is available on\nhttps://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git.\n","authors":["Luca Tomasetti","Kjersti Engan","Liv Jorunn Høllesli","Kathinka Dæhli Kurz","Mahdieh Khanmohammadi"],"pdf_url":"https://arxiv.org/pdf/2303.08757v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10511v1","updated":"2023-08-21T06:51:58Z","published":"2023-08-21T06:51:58Z","title":"Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout\n Analysis","summary":" Understanding digital documents is like solving a puzzle, especially\nhistorical ones. Document Layout Analysis (DLA) helps with this puzzle by\ndividing documents into sections like paragraphs, images, and tables. This is\ncrucial for machines to read and understand these documents. In the DL Sprint\n2.0 competition, we worked on understanding Bangla documents. We used a dataset\ncalled BaDLAD with lots of examples. We trained a special model called Mask\nR-CNN to help with this understanding. We made this model better by\nstep-by-step hyperparameter tuning, and we achieved a good dice score of\n0.889. However, not everything went perfectly. We tried using a model trained\nfor English documents, but it didn't fit well with Bangla. This showed us that\neach language has its own challenges. 
Our solution for the DL Sprint 2.0 is\npublicly available at\nhttps://www.kaggle.com/competitions/dlsprint2/discussion/432201 along with\nnotebooks, weights, and inference notebook.\n","authors":["Shrestha Datta","Md Adith Mollah","Raisa Fairooz","Tariful Islam Fahim"],"pdf_url":"https://arxiv.org/pdf/2308.10511v1.pdf","comment":"Contest paper, Conest: DL sprint 2.0 (Link:\n https://www.kaggle.com/competitions/dlsprint2), Solution link:\n https://www.kaggle.com/competitions/dlsprint2/discussion/432201"},{"id":"http://arxiv.org/abs/2308.10510v1","updated":"2023-08-21T06:50:44Z","published":"2023-08-21T06:50:44Z","title":"Frequency Compensated Diffusion Model for Real-scene Dehazing","summary":" Due to distribution shift, deep learning based methods for image dehazing\nsuffer from performance degradation when applied to real-world hazy images. In\nthis paper, we consider a dehazing framework based on conditional diffusion\nmodels for improved generalization to real haze. First, we find that optimizing\nthe training objective of diffusion models, i.e., Gaussian noise vectors, is\nnon-trivial. The spectral bias of deep networks hinders the higher frequency\nmodes in Gaussian vectors from being learned and hence impairs the\nreconstruction of image details. To tackle this issue, we design a network\nunit, named Frequency Compensation block (FCB), with a bank of filters that\njointly emphasize the mid-to-high frequencies of an input signal. We\ndemonstrate that diffusion models with FCB achieve significant gains in both\nperceptual and distortion metrics. Second, to further boost the generalization\nperformance, we propose a novel data synthesis pipeline, HazeAug, to augment\nhaze in terms of degree and diversity. Within the framework, a solid baseline\nfor blind dehazing is set up where models are trained on synthetic hazy-clean\npairs, and directly generalize to real data. Extensive evaluations show that\nthe proposed dehazing diffusion model significantly outperforms\nstate-of-the-art methods on real-world images.\n","authors":["Jing Wang","Songtao Wu","Kuanhong Xu","Zhiqiang Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.10510v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2308.10509v1","updated":"2023-08-21T06:50:29Z","published":"2023-08-21T06:50:29Z","title":"An Examination of the Compositionality of Large Generative\n Vision-Language Models","summary":" With the success of Large Language Models (LLMs), a surge of Generative\nVision-Language Models (GVLMs) have been constructed via multimodal instruction\ntuning. The tuning recipe substantially deviates from the common contrastive\nvision-language learning. However, the performance of GVLMs in multimodal\ncompositional reasoning remains largely unexplored, as existing evaluation\nmetrics and benchmarks focus predominantly on assessing contrastive models like\nCLIP. In this paper, we examine the potential evaluation metrics to assess the\nGVLMs and hypothesize generative score methods are suitable for evaluating\ncompositionality. In addition, current benchmarks tend to prioritize syntactic\ncorrectness over semantics. The presence of morphological bias in these\nbenchmarks can be exploited by GVLMs, leading to ineffective evaluations. To\ncombat this, we define a MorphoBias Score to quantify the morphological bias\nand propose a novel LLM-based strategy to calibrate the bias. Moreover, a\nchallenging task is added to evaluate the robustness of GVLMs against inherent\ninclination toward syntactic correctness. 
We include the calibrated dataset and\nthe task into a new benchmark, namely MOrphologicall De-biased Benchmark\n(MODE). Our study provides the first unbiased benchmark for the\ncompositionality of GVLMs, facilitating future research in this direction. We\nwill release our code and datasets.\n","authors":["Teli Ma","Rong Li","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2308.10509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14770v2","updated":"2023-08-21T06:35:44Z","published":"2023-07-27T11:02:36Z","title":"3DPortraitGAN: Learning One-Quarter Headshot 3D GANs from a Single-View\n Portrait Dataset with Diverse Body Poses","summary":" 3D-aware face generators are typically trained on 2D real-life face image\ndatasets that primarily consist of near-frontal face data, and as such, they\nare unable to construct one-quarter headshot 3D portraits with complete head,\nneck, and shoulder geometry. Two reasons account for this issue: First,\nexisting facial recognition methods struggle with extracting facial data\ncaptured from large camera angles or back views. Second, it is challenging to\nlearn a distribution of 3D portraits covering the one-quarter headshot region\nfrom single-view data due to significant geometric deformation caused by\ndiverse body poses. To this end, we first create the dataset\n360{\\deg}-Portrait-HQ (360{\\deg}PHQ for short) which consists of high-quality\nsingle-view real portraits annotated with a variety of camera parameters (the\nyaw angles span the entire 360{\\deg} range) and body poses. We then propose\n3DPortraitGAN, the first 3D-aware one-quarter headshot portrait generator that\nlearns a canonical 3D avatar distribution from the 360{\\deg}PHQ dataset with\nbody pose self-learning. Our model can generate view-consistent portrait images\nfrom all camera angles with a canonical one-quarter headshot 3D representation.\nOur experiments show that the proposed framework can accurately predict\nportrait body poses and generate view-consistent, realistic portrait images\nwith complete geometry from all camera angles.\n","authors":["Yiqian Wu","Hao Xu","Xiangjun Tang","Hongbo Fu","Xiaogang Jin"],"pdf_url":"https://arxiv.org/pdf/2307.14770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09436v2","updated":"2023-08-21T06:32:29Z","published":"2023-08-18T10:07:38Z","title":"Transformer-based Detection of Microorganisms on High-Resolution Petri\n Dish Images","summary":" Many medical or pharmaceutical processes have strict guidelines regarding\ncontinuous hygiene monitoring. This often involves the labor-intensive task of\nmanually counting microorganisms in Petri dishes by trained personnel.\nAutomation attempts often struggle due to major challenges: significant scaling\ndifferences, low separation, low contrast, etc. To address these challenges, we\nintroduce AttnPAFPN, a high-resolution detection pipeline that leverages a\nnovel transformer variation, the efficient-global self-attention mechanism. Our\nstreamlined approach can be easily integrated in almost any multi-scale object\ndetection pipeline. In a comprehensive evaluation on the publicly available\nAGAR dataset, we demonstrate the superior accuracy of our network over the\ncurrent state-of-the-art. 
In order to demonstrate the task-independent\nperformance of our approach, we perform further experiments on COCO and\nLIVECell datasets.\n","authors":["Nikolas Ebert","Didier Stricker","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2308.09436v2.pdf","comment":"This paper has been accepted at IEEE International Conference on\n Computer Vision Workshops (ICCV workshop), 2023"},{"id":"http://arxiv.org/abs/2308.10493v1","updated":"2023-08-21T06:23:41Z","published":"2023-08-21T06:23:41Z","title":"Semantic Graph Representation Learning for Handwritten Mathematical\n Expression Recognition","summary":" Handwritten mathematical expression recognition (HMER) has attracted\nextensive attention recently. However, current methods cannot explicitly study\nthe interactions between different symbols, and may fail when faced with similar\nsymbols. To alleviate this issue, we propose a simple but efficient method to\nenhance semantic interaction learning (SIL). Specifically, we first construct\na semantic graph based on the statistical symbol co-occurrence probabilities.\nThen we design a semantic aware module (SAM), which projects the visual and\nclassification features into semantic space. The cosine distance between\ndifferent projected vectors indicates the correlation between symbols. Jointly\noptimizing HMER and SIL can explicitly enhance the model's\nunderstanding of symbol relationships. In addition, SAM can be easily plugged\ninto existing attention-based models for HMER and consistently bring\nimprovement. Extensive experiments on public benchmark datasets demonstrate\nthat our proposed module can effectively enhance the recognition performance.\nOur method achieves better recognition performance than prior arts on both\nCROHME and HME100K datasets.\n","authors":["Zhuang Liu","Ye Yuan","Zhilong Ji","Jingfeng Bai","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2308.10493v1.pdf","comment":"12 Pages"},{"id":"http://arxiv.org/abs/2308.10491v1","updated":"2023-08-21T06:22:10Z","published":"2023-08-21T06:22:10Z","title":"SynDrone -- Multi-modal UAV Dataset for Urban Scenarios","summary":" The development of computer vision algorithms for Unmanned Aerial Vehicles\n(UAVs) imagery heavily relies on the availability of annotated high-resolution\naerial data. However, the scarcity of large-scale real datasets with\npixel-level annotations poses a significant challenge to researchers as the\nlimited number of images in existing datasets hinders the effectiveness of deep\nlearning models that require a large amount of training data. In this paper, we\npropose a multimodal synthetic dataset containing both images and 3D data taken\nat multiple flying heights to address these limitations. In addition to\nobject-level annotations, the provided data also include pixel-level labeling\nin 28 classes, enabling exploration of the potential advantages in tasks like\nsemantic segmentation. In total, our dataset contains 72k labeled samples that\nallow for effective training of deep architectures showing promising results in\nsynthetic-to-real adaptation. 
The dataset will be made publicly available to\nsupport the development of novel computer vision methods targeting UAV\napplications.\n","authors":["Giulia Rizzoli","Francesco Barbato","Matteo Caligiuri","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2308.10491v1.pdf","comment":"Accepted at ICCV Workshops, downloadable dataset with CC-BY license,\n 8 pages, 4 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.10490v1","updated":"2023-08-21T06:20:54Z","published":"2023-08-21T06:20:54Z","title":"Texture Generation on 3D Meshes with Point-UV Diffusion","summary":" In this work, we focus on synthesizing high-quality textures on 3D meshes. We\npresent Point-UV diffusion, a coarse-to-fine pipeline that marries the\ndenoising diffusion model with UV mapping to generate 3D consistent and\nhigh-quality texture images in UV space. We start with introducing a point\ndiffusion model to synthesize low-frequency texture components with our\ntailored style guidance to tackle the biased color distribution. The derived\ncoarse texture offers global consistency and serves as a condition for the\nsubsequent UV diffusion stage, aiding in regularizing the model to generate a\n3D consistent UV texture image. Then, a UV diffusion model with hybrid\nconditions is developed to enhance the texture fidelity in the 2D UV space. Our\nmethod can process meshes of any genus, generating diversified,\ngeometry-compatible, and high-fidelity textures. Code is available at\nhttps://cvmi-lab.github.io/Point-UV-Diffusion\n","authors":["Xin Yu","Peng Dai","Wenbo Li","Lan Ma","Zhengzhe Liu","Xiaojuan Qi"],"pdf_url":"https://arxiv.org/pdf/2308.10490v1.pdf","comment":"Accepted to ICCV 2023, Oral"},{"id":"http://arxiv.org/abs/2308.10488v1","updated":"2023-08-21T06:09:00Z","published":"2023-08-21T06:09:00Z","title":"Enhancing Medical Image Segmentation: Optimizing Cross-Entropy Weights\n and Post-Processing with Autoencoders","summary":" The task of medical image segmentation presents unique challenges,\nnecessitating both localized and holistic semantic understanding to accurately\ndelineate areas of interest, such as critical tissues or aberrant features.\nThis complexity is heightened in medical image segmentation due to the high\ndegree of inter-class similarities, intra-class variations, and possible image\nobfuscation. The segmentation task further diversifies when considering the\nstudy of histopathology slides for autoimmune diseases like dermatomyositis.\nThe analysis of cell inflammation and interaction in these cases has been less\nstudied due to constraints in data acquisition pipelines. Despite the\nprogressive strides in medical science, we lack a comprehensive collection of\nautoimmune diseases. As autoimmune diseases globally escalate in prevalence and\nexhibit associations with COVID-19, their study becomes increasingly essential.\nWhile there is existing research that integrates artificial intelligence in the\nanalysis of various autoimmune diseases, the exploration of dermatomyositis\nremains relatively underrepresented. In this paper, we present a deep-learning\napproach tailored for Medical image segmentation. Our proposed method\noutperforms the current state-of-the-art techniques by an average of 12.26% for\nU-Net and 12.04% for U-Net++ across the ResNet family of encoders on the\ndermatomyositis dataset. 
Furthermore, we probe the importance of optimizing\nloss function weights and benchmark our methodology on three challenging\nmedical image segmentation tasks\n","authors":["Pranav Singh","Luoyao Chen","Mei Chen","Jinqian Pan","Raviteja Chukkapalli","Shravan Chaudhari","Jacopo Cirrone"],"pdf_url":"https://arxiv.org/pdf/2308.10488v1.pdf","comment":"Accepted at ICCV CVAMD 2023"},{"id":"http://arxiv.org/abs/2303.17606v2","updated":"2023-08-21T06:04:24Z","published":"2023-03-30T17:59:59Z","title":"AvatarCraft: Transforming Text into Neural Human Avatars with\n Parameterized Shape and Pose Control","summary":" Neural implicit fields are powerful for representing 3D scenes and generating\nhigh-quality novel views, but it remains challenging to use such implicit\nrepresentations for creating a 3D human avatar with a specific identity and\nartistic style that can be easily animated. Our proposed method, AvatarCraft,\naddresses this challenge by using diffusion models to guide the learning of\ngeometry and texture for a neural avatar based on a single text prompt. We\ncarefully design the optimization framework of neural implicit fields,\nincluding a coarse-to-fine multi-bounding box training strategy, shape\nregularization, and diffusion-based constraints, to produce high-quality\ngeometry and texture. Additionally, we make the human avatar animatable by\ndeforming the neural implicit field with an explicit warping field that maps\nthe target human mesh to a template human mesh, both represented using\nparametric human models. This simplifies animation and reshaping of the\ngenerated avatar by controlling pose and shape parameters. Extensive\nexperiments on various text descriptions show that AvatarCraft is effective and\nrobust in creating human avatars and rendering novel views, poses, and shapes.\nOur project page is: https://avatar-craft.github.io/.\n","authors":["Ruixiang Jiang","Can Wang","Jingbo Zhang","Menglei Chai","Mingming He","Dongdong Chen","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2303.17606v2.pdf","comment":"ICCV 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2108.07851v6","updated":"2023-08-21T05:47:52Z","published":"2021-08-17T19:45:28Z","title":"Boosting Salient Object Detection with Transformer-based Asymmetric\n Bilateral U-Net","summary":" Existing salient object detection (SOD) methods mainly rely on U-shaped\nconvolution neural networks (CNNs) with skip connections to combine the global\ncontexts and local spatial details that are crucial for locating salient\nobjects and refining object details, respectively. Despite great successes, the\nability of CNNs in learning global contexts is limited. Recently, the vision\ntransformer has achieved revolutionary progress in computer vision owing to its\npowerful modeling of global dependencies. However, directly applying the\ntransformer to SOD is suboptimal because the transformer lacks the ability to\nlearn local spatial representations. To this end, this paper explores the\ncombination of transformers and CNNs to learn both global and local\nrepresentations for SOD. We propose a transformer-based Asymmetric Bilateral\nU-Net (ABiU-Net). 
The asymmetric bilateral encoder has a transformer path and a\nlightweight CNN path, where the two paths communicate at each encoder stage to\nlearn complementary global contexts and local spatial details, respectively.\nThe asymmetric bilateral decoder also consists of two paths to process features\nfrom the transformer and CNN encoder paths, with communication at each decoder\nstage for decoding coarse salient object locations and fine-grained object\ndetails, respectively. Such communication between the two encoder/decoder paths\nenables ABiU-Net to learn complementary global and local representations,\ntaking advantage of the natural merits of transformers and CNNs, respectively.\nHence, ABiU-Net provides a new perspective for transformer-based SOD. Extensive\nexperiments demonstrate that ABiU-Net performs favorably against previous\nstate-of-the-art SOD methods. The code is available at\nhttps://github.com/yuqiuyuqiu/ABiU-Net.\n","authors":["Yu Qiu","Yun Liu","Le Zhang","Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2108.07851v6.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2308.10481v1","updated":"2023-08-21T05:42:13Z","published":"2023-08-21T05:42:13Z","title":"ADNet: Lane Shape Prediction via Anchor Decomposition","summary":" In this paper, we revisit the limitations of anchor-based lane detection\nmethods, which have predominantly focused on fixed anchors that stem from the\nedges of the image, disregarding their versatility and quality. To overcome the\ninflexibility of anchors, we decompose them into learning the heat map of\nstarting points and their associated directions. This decomposition removes the\nlimitations on the starting point of anchors, making our algorithm adaptable to\ndifferent lane types in various datasets. To enhance the quality of anchors, we\nintroduce the Large Kernel Attention (LKA) for Feature Pyramid Network (FPN).\nThis significantly increases the receptive field, which is crucial in capturing\nsufficient context as lane lines typically run throughout the entire image.\nWe have named our proposed system the Anchor Decomposition Network (ADNet).\nAdditionally, we propose the General Lane IoU (GLIoU) loss, which significantly\nimproves the performance of ADNet in complex scenarios. Experimental results on\nthree widely used lane detection benchmarks, VIL-100, CULane, and TuSimple,\ndemonstrate that our approach outperforms the state-of-the-art methods on\nVIL-100 and exhibits competitive accuracy on CULane and TuSimple. Code and\nmodels will be released on https://github.com/Sephirex-X/ADNet.\n","authors":["Lingyu Xiao","Xiang Li","Sen Yang","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2308.10481v1.pdf","comment":"ICCV2023 accepted"},{"id":"http://arxiv.org/abs/2308.10468v1","updated":"2023-08-21T05:09:07Z","published":"2023-08-21T05:09:07Z","title":"STEERER: Resolving Scale Variations for Counting and Localization via\n Selective Inheritance Learning","summary":" Scale variation is a deep-rooted problem in object counting, which has not\nbeen effectively addressed by existing scale-aware algorithms. An important\nfactor is that they typically involve cooperative learning across\nmulti-resolutions, which could be suboptimal for learning the most\ndiscriminative features from each scale. 
In this paper, we propose a novel\nmethod termed STEERER (\\textbf{S}elec\\textbf{T}iv\\textbf{E}\ninh\\textbf{ER}itance l\\textbf{E}a\\textbf{R}ning) that addresses the issue of\nscale variations in object counting. STEERER selects the most suitable scale\nfor patch objects to boost feature extraction and only inherits discriminative\nfeatures from lower to higher resolution progressively. The main insights of\nSTEERER are a dedicated Feature Selection and Inheritance Adaptor (FSIA), which\nselectively forwards scale-customized features at each scale, and a Masked\nSelection and Inheritance Loss (MSIL) that helps to achieve high-quality\ndensity maps across all scales. Our experimental results on nine datasets with\ncounting and localization tasks demonstrate the unprecedented scale\ngeneralization ability of STEERER. Code is available at\n\\url{https://github.com/taohan10200/STEERER}.\n","authors":["Tao Han","Lei Bai","Lingbo Liu","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2308.10468v1.pdf","comment":"Accepted by ICCV2023, 9 pages"},{"id":"http://arxiv.org/abs/2308.10461v1","updated":"2023-08-21T04:31:02Z","published":"2023-08-21T04:31:02Z","title":"Privacy-Preserving Face Recognition Using Random Frequency Components","summary":" The ubiquitous use of face recognition has sparked increasing privacy\nconcerns, as unauthorized access to sensitive face images could compromise the\ninformation of individuals. This paper presents an in-depth study of the\nprivacy protection of face images' visual information and against recovery.\nDrawing on the perceptual disparity between humans and models, we propose to\nconceal visual information by pruning human-perceivable low-frequency\ncomponents. For impeding recovery, we first elucidate the seeming paradox\nbetween reducing model-exploitable information and retaining high recognition\naccuracy. Based on recent theoretical insights and our observation on model\nattention, we propose a solution to the dilemma, by advocating for the training\nand inference of recognition models on randomly selected frequency components.\nWe distill our findings into a novel privacy-preserving face recognition\nmethod, PartialFace. Extensive experiments demonstrate that PartialFace\neffectively balances privacy protection goals and recognition accuracy. Code is\navailable at: https://github.com/Tencent/TFace.\n","authors":["Yuxi Mi","Yuge Huang","Jiazhen Ji","Minyi Zhao","Jiaxiang Wu","Xingkun Xu","Shouhong Ding","Shuigeng Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.10461v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10453v1","updated":"2023-08-21T03:58:04Z","published":"2023-08-21T03:58:04Z","title":"DOMINO++: Domain-aware Loss Regularization for Deep Learning\n Generalizability","summary":" Out-of-distribution (OOD) generalization poses a serious challenge for modern\ndeep learning (DL). OOD data consists of test data that is significantly\ndifferent from the model's training data. DL models that perform well on\nin-domain test data could struggle on OOD data. Overcoming this discrepancy is\nessential to the reliable deployment of DL. Proper model calibration decreases\nthe number of spurious connections that are made between model features and\nclass outputs. Hence, calibrated DL can improve OOD generalization by only\nlearning features that are truly indicative of the respective classes. Previous\nwork proposed domain-aware model calibration (DOMINO) to improve DL\ncalibration, but it lacks designs for model generalizability to OOD data. 
In\nthis work, we propose DOMINO++, a dual-guidance and dynamic domain-aware loss\nregularization focused on OOD generalizability. DOMINO++ integrates\nexpert-guided and data-guided knowledge in its regularization. Unlike DOMINO\nwhich imposed a fixed scaling and regularization rate, DOMINO++ designs a\ndynamic scaling factor and an adaptive regularization rate. Comprehensive\nevaluations compare DOMINO++ with DOMINO and the baseline model for head tissue\nsegmentation from magnetic resonance images (MRIs) on OOD data. The OOD data\nconsists of synthetic noisy and rotated datasets, as well as real data using a\ndifferent MRI scanner from a separate site. DOMINO++'s superior performance\ndemonstrates its potential to improve the trustworthy deployment of DL on real\nclinical data.\n","authors":["Skylar E. Stolte","Kyle Volle","Aprinda Indahlastari","Alejandro Albizu","Adam J. Woods","Kevin Brink","Matthew Hale","Ruogu Fang"],"pdf_url":"https://arxiv.org/pdf/2308.10453v1.pdf","comment":"12 pages, 5 figures, 5 tables, Accepted by the International\n Conference on Medical Image Computing and Computer Assisted Intervention\n (MICCAI) 2023"},{"id":"http://arxiv.org/abs/2303.16817v2","updated":"2023-08-21T03:55:12Z","published":"2023-03-29T16:07:06Z","title":"Adaptive Superpixel for Active Learning in Semantic Segmentation","summary":" Learning semantic segmentation requires pixel-wise annotations, which can be\ntime-consuming and expensive. To reduce the annotation cost, we propose a\nsuperpixel-based active learning (AL) framework, which collects a dominant\nlabel per superpixel instead. To be specific, it consists of adaptive\nsuperpixel and sieving mechanisms, fully dedicated to AL. At each round of AL,\nwe adaptively merge neighboring pixels of similar learned features into\nsuperpixels. We then query a selected subset of these superpixels using an\nacquisition function assuming no uniform superpixel size. This approach is more\nefficient than existing methods, which rely only on innate features such as RGB\ncolor and assume uniform superpixel sizes. Obtaining a dominant label per\nsuperpixel drastically reduces annotators' burden as it requires fewer clicks.\nHowever, it inevitably introduces noisy annotations due to mismatches between\nsuperpixel and ground truth segmentation. To address this issue, we further\ndevise a sieving mechanism that identifies and excludes potentially noisy\nannotations from learning. Our experiments on both Cityscapes and PASCAL VOC\ndatasets demonstrate the efficacy of adaptive superpixel and sieving\nmechanisms.\n","authors":["Hoyoung Kim","Minhyeon Oh","Sehyun Hwang","Suha Kwak","Jungseul Ok"],"pdf_url":"https://arxiv.org/pdf/2303.16817v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10450v1","updated":"2023-08-21T03:50:19Z","published":"2023-08-21T03:50:19Z","title":"COCA: Classifier-Oriented Calibration for Source-Free Universal Domain\n Adaptation via Textual Prototype","summary":" Universal Domain Adaptation (UniDA) aims to distinguish common and private\nclasses between the source and target domains where domain shift exists.\nRecently, due to more stringent data restrictions, researchers have introduced\nSource-Free UniDA (SF-UniDA) in more realistic scenarios. SF-UniDA methods\neliminate the need for direct access to source samples when performing\nadaptation to the target domain. However, existing SF-UniDA methods still\nrequire an extensive quantity of labeled source samples to train a source\nmodel, resulting in significant labeling costs. 
To tackle this issue, we\npresent a novel Classifier-Oriented Calibration (COCA) method. This method,\nwhich leverages textual prototypes, is formulated for the source model based on\nfew-shot learning. Specifically, we propose studying few-shot learning, usually\nexplored for closed-set scenarios, to identify common and domain-private\nclasses despite a significant domain shift between source and target domains.\nEssentially, we present a novel paradigm based on the vision-language model to\nlearn SF-UniDA and hugely reduce the labeling costs on the source domain.\nExperimental results demonstrate that our approach outperforms state-of-the-art\nUniDA and SF-UniDA models.\n","authors":["Xinghong Liu","Yi Zhou","Tao Zhou","Chun-Mei Feng","Ling Shao"],"pdf_url":"https://arxiv.org/pdf/2308.10450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10449v1","updated":"2023-08-21T03:50:09Z","published":"2023-08-21T03:50:09Z","title":"CVFC: Attention-Based Cross-View Feature Consistency for Weakly\n Supervised Semantic Segmentation of Pathology Images","summary":" Histopathology image segmentation is the gold standard for diagnosing cancer,\nand can indicate cancer prognosis. However, histopathology image segmentation\nrequires high-quality masks, so many studies now use imagelevel labels to\nachieve pixel-level segmentation to reduce the need for fine-grained\nannotation. To solve this problem, we propose an attention-based cross-view\nfeature consistency end-to-end pseudo-mask generation framework named CVFC\nbased on the attention mechanism. Specifically, CVFC is a three-branch joint\nframework composed of two Resnet38 and one Resnet50, and the independent branch\nmulti-scale integrated feature map to generate a class activation map (CAM); in\neach branch, through down-sampling and The expansion method adjusts the size of\nthe CAM; the middle branch projects the feature matrix to the query and key\nfeature spaces, and generates a feature space perception matrix through the\nconnection layer and inner product to adjust and refine the CAM of each branch;\nfinally, through the feature consistency loss and feature cross loss to\noptimize the parameters of CVFC in co-training mode. After a large number of\nexperiments, An IoU of 0.7122 and a fwIoU of 0.7018 are obtained on the\nWSSS4LUAD dataset, which outperforms HistoSegNet, SEAM, C-CAM, WSSS-Tissue, and\nOEEM, respectively.\n","authors":["Liangrui Pan","Lian Wang","Zhichao Feng","Liwen Xu","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2308.10449v1.pdf","comment":"Submitted to BIBM2023"},{"id":"http://arxiv.org/abs/2302.02410v2","updated":"2023-08-21T03:46:50Z","published":"2023-02-05T15:46:57Z","title":"Decoupled Iterative Refinement Framework for Interacting Hands\n Reconstruction from a Single RGB Image","summary":" Reconstructing interacting hands from a single RGB image is a very\nchallenging task. On the one hand, severe mutual occlusion and similar local\nappearance between two hands confuse the extraction of visual features,\nresulting in the misalignment of estimated hand meshes and the image. On the\nother hand, there are complex spatial relationship between interacting hands,\nwhich significantly increases the solution space of hand poses and increases\nthe difficulty of network learning. 
In this paper, we propose a decoupled\niterative refinement framework to achieve pixel-alignment hand reconstruction\nwhile efficiently modeling the spatial relationship between hands.\nSpecifically, we define two feature spaces with different characteristics,\nnamely 2D visual feature space and 3D joint feature space. First, we obtain\njoint-wise features from the visual feature map and utilize a graph convolution\nnetwork and a transformer to perform intra- and inter-hand information\ninteraction in the 3D joint feature space, respectively. Then, we project the\njoint features with global information back into the 2D visual feature space in\nan obfuscation-free manner and utilize the 2D convolution for pixel-wise\nenhancement. By performing multiple alternate enhancements in the two feature\nspaces, our method can achieve an accurate and robust reconstruction of\ninteracting hands. Our method outperforms all existing two-hand reconstruction\nmethods by a large margin on the InterHand2.6M dataset.\n","authors":["Pengfei Ren","Chao Wen","Xiaozheng Zheng","Zhou Xue","Haifeng Sun","Qi Qi","Jingyu Wang","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2302.02410v2.pdf","comment":"Accepted to ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.10447v1","updated":"2023-08-21T03:46:04Z","published":"2023-08-21T03:46:04Z","title":"Explore and Tell: Embodied Visual Captioning in 3D Environments","summary":" While current visual captioning models have achieved impressive performance,\nthey often assume that the image is well-captured and provides a complete view\nof the scene. In real-world scenarios, however, a single image may not offer a\ngood viewpoint, hindering fine-grained scene understanding. To overcome this\nlimitation, we propose a novel task called Embodied Captioning, which equips\nvisual captioning models with navigation capabilities, enabling them to\nactively explore the scene and reduce visual ambiguity from suboptimal\nviewpoints. Specifically, starting at a random viewpoint, an agent must\nnavigate the environment to gather information from different viewpoints and\ngenerate a comprehensive paragraph describing all objects in the scene. To\nsupport this task, we build the ET-Cap dataset with Kubric simulator,\nconsisting of 10K 3D scenes with cluttered objects and three annotated\nparagraphs per scene. We propose a Cascade Embodied Captioning model (CaBOT),\nwhich comprises of a navigator and a captioner, to tackle this task. The\nnavigator predicts which actions to take in the environment, while the\ncaptioner generates a paragraph description based on the whole navigation\ntrajectory. Extensive experiments demonstrate that our model outperforms other\ncarefully designed baselines. Our dataset, codes and models are available at\nhttps://aim3-ruc.github.io/ExploreAndTell.\n","authors":["Anwen Hu","Shizhe Chen","Liang Zhang","Qin Jin"],"pdf_url":"https://arxiv.org/pdf/2308.10447v1.pdf","comment":"12 pages; 10 figures; ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10446v1","updated":"2023-08-21T03:44:54Z","published":"2023-08-21T03:44:54Z","title":"LDCSF: Local depth convolution-based Swim framework for classifying\n multi-label histopathology images","summary":" Histopathological images are the gold standard for diagnosing liver cancer.\nHowever, the accuracy of fully digital diagnosis in computational pathology\nneeds to be improved. 
In this paper, in order to solve the problem of\nmulti-label and low classification accuracy of histopathology images, we\npropose a locally deep convolutional Swim framework (LDCSF) to classify\nmulti-label histopathology images. In order to be able to provide local field\nof view diagnostic results, we propose the LDCSF model, which consists of a\nSwin transformer module, a local depth convolution (LDC) module, a feature\nreconstruction (FR) module, and a ResNet module. The Swin transformer module\nreduces the amount of computation generated by the attention mechanism by\nlimiting the attention to each window. The LDC then reconstructs the attention\nmap and performs convolution operations in multiple channels, passing the\nresulting feature map to the next layer. The FR module uses the corresponding\nweight coefficient vectors obtained from the channels to dot product with the\noriginal feature map vector matrix to generate representative feature maps.\nFinally, the residual network undertakes the final classification task. As a\nresult, the classification accuracy of LDCSF for interstitial area, necrosis,\nnon-tumor and tumor reached 0.9460, 0.9960, 0.9808, 0.9847, respectively.\nFinally, we use the results of multi-label pathological image classification to\ncalculate the tumor-to-stromal ratio, which lays the foundation for the\nanalysis of the microenvironment of liver cancer histopathological images.\nSecond, we released a multilabel histopathology image of liver cancer, our code\nand data are available at https://github.com/panliangrui/LSF.\n","authors":["Liangrui Pan","Yutao Dou","Zhichao Feng","Liwen Xu","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2308.10446v1.pdf","comment":"Submitted to BIBM2023"},{"id":"http://arxiv.org/abs/2308.10445v1","updated":"2023-08-21T03:33:21Z","published":"2023-08-21T03:33:21Z","title":"When Prompt-based Incremental Learning Does Not Meet Strong Pretraining","summary":" Incremental learning aims to overcome catastrophic forgetting when learning\ndeep networks from sequential tasks. With impressive learning efficiency and\nperformance, prompt-based methods adopt a fixed backbone to sequential tasks by\nlearning task-specific prompts. However, existing prompt-based methods heavily\nrely on strong pretraining (typically trained on ImageNet-21k), and we find\nthat their models could be trapped if the potential gap between the pretraining\ntask and unknown future tasks is large. In this work, we develop a learnable\nAdaptive Prompt Generator (APG). The key is to unify the prompt retrieval and\nprompt learning processes into a learnable prompt generator. Hence, the whole\nprompting process can be optimized to reduce the negative effects of the gap\nbetween tasks effectively. To make our APG avoid learning ineffective\nknowledge, we maintain a knowledge pool to regularize APG with the feature\ndistribution of each class. Extensive experiments show that our method\nsignificantly outperforms advanced methods in exemplar-free incremental\nlearning without (strong) pretraining. Besides, under strong retraining, our\nmethod also has comparable performance to existing prompt-based models, showing\nthat our method can still benefit from pretraining. 
Codes can be found at\nhttps://github.com/TOM-tym/APG\n","authors":["Yu-Ming Tang","Yi-Xing Peng","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10445v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08197v2","updated":"2023-08-21T03:31:09Z","published":"2023-08-16T07:57:35Z","title":"Self-Reference Deep Adaptive Curve Estimation for Low-Light Image\n Enhancement","summary":" In this paper, we propose a 2-stage low-light image enhancement method called\nSelf-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage,\nwe present an intuitive, lightweight, fast, and unsupervised luminance\nenhancement algorithm. The algorithm is based on a novel low-light enhancement\ncurve that can be used to locally boost image brightness. We also propose a new\nloss function with a simplified physical model designed to preserve natural\nimages' color, structure, and fidelity. We use a vanilla CNN to map each pixel\nthrough deep Adaptive Adjustment Curves (AAC) while preserving the local image\nstructure. Secondly, we introduce the corresponding denoising scheme to remove\nthe latent noise in the darkness. We approximately model the noise in the dark\nand deploy a Denoising-Net to estimate and remove the noise after the first\nstage. Exhaustive qualitative and quantitative analysis shows that our method\noutperforms existing state-of-the-art algorithms on multiple real-world\ndatasets.\n","authors":["Jianyu Wen","Chenhao Wu","Tong Zhang","Yixuan Yu","Piotr Swierczynski"],"pdf_url":"https://arxiv.org/pdf/2308.08197v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10441v1","updated":"2023-08-21T03:28:23Z","published":"2023-08-21T03:28:23Z","title":"X-VoE: Measuring eXplanatory Violation of Expectation in Physical Events","summary":" Intuitive physics is pivotal for human understanding of the physical world,\nenabling prediction and interpretation of events even in infancy. Nonetheless,\nreplicating this level of intuitive physics in artificial intelligence (AI)\nremains a formidable challenge. This study introduces X-VoE, a comprehensive\nbenchmark dataset, to assess AI agents' grasp of intuitive physics. Built on\nthe developmental psychology-rooted Violation of Expectation (VoE) paradigm,\nX-VoE establishes a higher bar for the explanatory capacities of intuitive\nphysics models. Each VoE scenario within X-VoE encompasses three distinct\nsettings, probing models' comprehension of events and their underlying\nexplanations. Beyond model evaluation, we present an explanation-based learning\nsystem that captures physics dynamics and infers occluded object states solely\nfrom visual sequences, without explicit occlusion labels. Experimental outcomes\nhighlight our model's alignment with human commonsense when tested against\nX-VoE. A remarkable feature is our model's ability to visually expound VoE\nevents by reconstructing concealed scenes. Concluding, we discuss the findings'\nimplications and outline future research directions. 
Through X-VoE, we catalyze\nthe advancement of AI endowed with human-like intuitive physics capabilities.\n","authors":["Bo Dai","Linge Wang","Baoxiong Jia","Zeyu Zhang","Song-Chun Zhu","Chi Zhang","Yixin Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.10441v1.pdf","comment":"19 pages, 16 figures, selected for an Oral presentation at ICCV 2023.\n Project link: https://pku.ai/publication/intuitive2023iccv/"},{"id":"http://arxiv.org/abs/2308.10438v1","updated":"2023-08-21T03:22:47Z","published":"2023-08-21T03:22:47Z","title":"Efficient Joint Optimization of Layer-Adaptive Weight Pruning in Deep\n Neural Networks","summary":" In this paper, we propose a novel layer-adaptive weight-pruning approach for\nDeep Neural Networks (DNNs) that addresses the challenge of optimizing the\noutput distortion minimization while adhering to a target pruning ratio\nconstraint. Our approach takes into account the collective influence of all\nlayers to design a layer-adaptive pruning scheme. We discover and utilize a\nvery important additivity property of output distortion caused by pruning\nweights on multiple layers. This property enables us to formulate the pruning\nas a combinatorial optimization problem and efficiently solve it through\ndynamic programming. By decomposing the problem into sub-problems, we achieve\nlinear time complexity, making our optimization algorithm fast and feasible to\nrun on CPUs. Our extensive experiments demonstrate the superiority of our\napproach over existing methods on the ImageNet and CIFAR-10 datasets. On\nCIFAR-10, our method achieves remarkable improvements, outperforming others by\nup to 1.0% for ResNet-32, 0.5% for VGG-16, and 0.7% for DenseNet-121 in terms\nof top-1 accuracy. On ImageNet, we achieve up to 4.7% and 4.6% higher top-1\naccuracy compared to other methods for VGG-16 and ResNet-50, respectively.\nThese results highlight the effectiveness and practicality of our approach for\nenhancing DNN performance through layer-adaptive weight pruning. Code will be\navailable on https://github.com/Akimoto-Cris/RD_VIT_PRUNE.\n","authors":["Kaixin Xu","Zhe Wang","Xue Geng","Min Wu","Xiaoli Li","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2308.10438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14609v6","updated":"2023-08-21T03:15:35Z","published":"2022-09-29T07:58:32Z","title":"Dataset Distillation Using Parameter Pruning","summary":" In this study, we propose a novel dataset distillation method based on\nparameter pruning. The proposed method can synthesize more robust distilled\ndatasets and improve distillation performance by pruning difficult-to-match\nparameters during the distillation process. Experimental results on two\nbenchmark datasets show the superiority of the proposed method.\n","authors":["Guang Li","Ren Togo","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2209.14609v6.pdf","comment":"Published as a journal paper at IEICE Trans. Fund"},{"id":"http://arxiv.org/abs/2111.14422v3","updated":"2023-08-21T03:13:12Z","published":"2021-11-29T10:06:31Z","title":"Agent-Centric Relation Graph for Object Visual Navigation","summary":" Object visual navigation aims to steer an agent toward a target object based\non visual observations. It is highly desirable to reasonably perceive the\nenvironment and accurately control the agent. In the navigation task, we\nintroduce an Agent-Centric Relation Graph (ACRG) for learning the visual\nrepresentation based on the relationships in the environment. 
ACRG is a highly\neffective structure that consists of two relationships, i.e., the horizontal\nrelationship among objects and the distance relationship between the agent and\nobjects . On the one hand, we design the Object Horizontal Relationship Graph\n(OHRG) that stores the relative horizontal location among objects. On the other\nhand, we propose the Agent-Target Distance Relationship Graph (ATDRG) that\nenables the agent to perceive the distance between the target and objects. For\nATDRG, we utilize image depth to obtain the target distance and imply the\nvertical location to capture the distance relationship among objects in the\nvertical direction. With the above graphs, the agent can perceive the\nenvironment and output navigation actions. Experimental results in the\nartificial environment AI2-THOR demonstrate that ACRG significantly outperforms\nother state-of-the-art methods in unseen testing environments.\n","authors":["Xiaobo Hu","Youfang Lin","Shuo Wang","Zhihao Wu","Kai Lv"],"pdf_url":"https://arxiv.org/pdf/2111.14422v3.pdf","comment":"16 pages, 13 figures, 7 tables"},{"id":"http://arxiv.org/abs/2308.09658v2","updated":"2023-08-21T03:08:52Z","published":"2023-08-18T16:21:40Z","title":"Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop\n Visual Reasoning","summary":" There emerges a promising trend of using large language models (LLMs) to\ngenerate code-like plans for complex inference tasks such as visual reasoning.\nThis paradigm, known as LLM-based planning, provides flexibility in problem\nsolving and endows better interpretability. However, current research is mostly\nlimited to basic scenarios of simple questions that can be straightforward\nanswered in a few inference steps. Planning for the more challenging multi-hop\nvisual reasoning tasks remains under-explored. Specifically, under multi-hop\nreasoning situations, the trade-off between accuracy and the complexity of\nplan-searching becomes prominent. The prevailing algorithms either address the\nefficiency issue by employing the fast one-stop generation or adopt a complex\niterative generation method to improve accuracy. Both fail to balance the need\nfor efficiency and performance. Drawing inspiration from the dual system of\ncognition in the human brain, the fast and the slow think processes, we propose\na hierarchical plan-searching algorithm that integrates the one-stop reasoning\n(fast) and the Tree-of-thought (slow). Our approach succeeds in performance\nwhile significantly saving inference steps. Moreover, we repurpose the PTR and\nthe CLEVER datasets, developing a systematic framework for evaluating the\nperformance and efficiency of LLMs-based plan-search algorithms under reasoning\ntasks at different levels of difficulty. Extensive experiments demonstrate the\nsuperiority of our proposed algorithm in terms of performance and efficiency.\nThe dataset and code will be release soon.\n","authors":["Pengbo Hu","Ji Qi","Xingyu Li","Hong Li","Xinqi Wang","Bing Quan","Ruiyu Wang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.09658v2.pdf","comment":"16 pages,1 figures, under review"},{"id":"http://arxiv.org/abs/2308.08806v2","updated":"2023-08-21T02:50:42Z","published":"2023-08-17T06:32:57Z","title":"Self-distillation Regularized Connectionist Temporal Classification Loss\n for Text Recognition: A Simple Yet Effective Approach","summary":" Text recognition methods are gaining rapid development. 
Some advanced\ntechniques, e.g., powerful modules, language models, and un- and\nsemi-supervised learning schemes, consecutively push the performance on public\nbenchmarks forward. However, the problem of how to better optimize a text\nrecognition model from the perspective of loss functions is largely overlooked.\nCTC-based methods, widely used in practice due to their good balance between\nperformance and inference speed, still grapple with accuracy degradation. This\nis because CTC loss emphasizes the optimization of the entire sequence target\nwhile neglecting to learn individual characters. We propose a self-distillation\nscheme for CTC-based model to address this issue. It incorporates a framewise\nregularization term in CTC loss to emphasize individual supervision, and\nleverages the maximizing-a-posteriori of latent alignment to solve the\ninconsistency problem that arises in distillation between CTC-based models. We\nrefer to the regularized CTC loss as Distillation Connectionist Temporal\nClassification (DCTC) loss. DCTC loss is module-free, requiring no extra\nparameters, longer inference lag, or additional training data or phases.\nExtensive experiments on public benchmarks demonstrate that DCTC can boost text\nrecognition model accuracy by up to 2.6%, without any of these drawbacks.\n","authors":["Ziyin Zhang","Ning Lu","Minghui Liao","Yongshuai Huang","Cheng Li","Min Wang","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2308.08806v2.pdf","comment":"Ziyin Zhang and Ning Lu are co-first authors"},{"id":"http://arxiv.org/abs/2305.00795v3","updated":"2023-08-21T02:14:41Z","published":"2023-05-01T12:47:55Z","title":"SelfDocSeg: A Self-Supervised vision-based Approach towards Document\n Segmentation","summary":" Document layout analysis is a known problem to the documents research\ncommunity and has been vastly explored yielding a multitude of solutions\nranging from text mining, and recognition to graph-based representation, visual\nfeature extraction, etc. However, most of the existing works have ignored the\ncrucial fact regarding the scarcity of labeled data. With growing internet\nconnectivity to personal life, an enormous amount of documents had been\navailable in the public domain and thus making data annotation a tedious task.\nWe address this challenge using self-supervision and unlike, the few existing\nself-supervised document segmentation approaches which use text mining and\ntextual labels, we use a complete vision-based approach in pre-training without\nany ground-truth label or its derivative. Instead, we generate pseudo-layouts\nfrom the document images to pre-train an image encoder to learn the document\nobject representation and localization in a self-supervised framework before\nfine-tuning it with an object detection model. We show that our pipeline sets a\nnew benchmark in this context and performs at par with the existing methods and\nthe supervised counterparts, if not outperforms. 
The code is made publicly\navailable at: https://github.com/MaitySubhajit/SelfDocSeg\n","authors":["Subhajit Maity","Sanket Biswas","Siladittya Manna","Ayan Banerjee","Josep Lladós","Saumik Bhattacharya","Umapada Pal"],"pdf_url":"https://arxiv.org/pdf/2305.00795v3.pdf","comment":"Accepted at The 17th International Conference on Document Analysis\n and Recognition (ICDAR 2023)"},{"id":"http://arxiv.org/abs/2308.10421v1","updated":"2023-08-21T02:13:40Z","published":"2023-08-21T02:13:40Z","title":"UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D\n Representation for 3D Perception in Autonomous Driving","summary":" Masked Autoencoders (MAE) play a pivotal role in learning potent\nrepresentations, delivering outstanding results across various 3D perception\ntasks essential for autonomous driving. In real-world driving scenarios, it's\ncommonplace to deploy multiple sensors for comprehensive environment\nperception. While integrating multi-modal features from these sensors can\nproduce rich and powerful features, there is a noticeable gap in MAE methods\naddressing this integration. This research delves into multi-modal Masked\nAutoencoders tailored for a unified representation space in autonomous driving,\naiming to pioneer a more efficient fusion of two distinct modalities. To\nintricately marry the semantics inherent in images with the geometric\nintricacies of LiDAR point clouds, the UniM$^2$AE is proposed. This model\nstands as a potent yet straightforward, multi-modal self-supervised\npre-training framework, mainly consisting of two designs. First, it projects\nthe features from both modalities into a cohesive 3D volume space, ingeniously\nexpanded from the bird's eye view (BEV) to include the height dimension. The\nextension makes it possible to back-project the informative features, obtained\nby fusing features from both modalities, into their native modalities to\nreconstruct the multiple masked inputs. Second, the Multi-modal 3D Interactive\nModule (MMIM) is invoked to facilitate the efficient inter-modal interaction\nduring the interaction process. Extensive experiments conducted on the nuScenes\nDataset attest to the efficacy of UniM$^2$AE, indicating enhancements in 3D\nobject detection and BEV map segmentation by 1.2\\%(NDS) and 6.5\\% (mIoU),\nrespectively. Code is available at https://github.com/hollow-503/UniM2AE.\n","authors":["Jian Zou","Tianyu Huang","Guanglei Yang","Zhenhua Guo","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.10421v1.pdf","comment":"Code available at https://github.com/hollow-503/UniM2AE"},{"id":"http://arxiv.org/abs/2308.10417v1","updated":"2023-08-21T01:59:45Z","published":"2023-08-21T01:59:45Z","title":"The Change You Want to See (Now in 3D)","summary":" The goal of this paper is to detect what has changed, if anything, between\ntwo \"in the wild\" images of the same 3D scene acquired from different camera\npositions and at different temporal instances. The open-set nature of this\nproblem, occlusions/dis-occlusions due to the shift in viewpoint, and the lack\nof suitable training datasets, presents substantial challenges in devising a\nsolution.\n To address this problem, we contribute a change detection model that is\ntrained entirely on synthetic data and is class-agnostic, yet it is performant\nout-of-the-box on real world images without requiring fine-tuning. 
Our solution\nentails a \"register and difference\" approach that leverages self-supervised\nfrozen embeddings and feature differences, which allows the model to generalise\nto a wide variety of scenes and domains. The model is able to operate directly\non two RGB images, without requiring access to ground truth camera intrinsics,\nextrinsics, depth maps, point clouds, or additional before-after images.\nFinally, we collect and release a new evaluation dataset consisting of\nreal-world image pairs with human-annotated differences and demonstrate the\nefficacy of our method. The code, datasets and pre-trained model can be found\nat: https://github.com/ragavsachdeva/CYWS-3D\n","authors":["Ragav Sachdeva","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2308.10417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15046v5","updated":"2023-08-21T01:50:49Z","published":"2022-11-28T04:08:55Z","title":"PCT-CycleGAN: Paired Complementary Temporal Cycle-Consistent Adversarial\n Networks for Radar-Based Precipitation Nowcasting","summary":" The precipitation nowcasting methods have been elaborated over the centuries\nbecause rain has a crucial impact on human life. Not only quantitative\nprecipitation forecast (QPF) models and convolutional long short-term memory\n(ConvLSTM), but also various sophisticated methods such as the latest MetNet-2\nare emerging. In this paper, we propose a paired complementary temporal\ncycle-consistent adversarial networks (PCT-CycleGAN) for radar-based\nprecipitation nowcasting, inspired by cycle-consistent adversarial networks\n(CycleGAN), which shows strong performance in image-to-image translation.\nPCT-CycleGAN generates temporal causality using two generator networks with\nforward and backward temporal dynamics in paired complementary cycles. Each\ngenerator network learns a huge number of one-to-one mappings about\ntime-dependent radar-based precipitation data to approximate a mapping function\nrepresenting the temporal dynamics in each direction. To create robust temporal\ncausality between paired complementary cycles, novel connection loss is\nproposed. And torrential loss to cover exceptional heavy rain events is also\nproposed. The generator network learning forward temporal dynamics in\nPCT-CycleGAN generates radar-based precipitation data 10 minutes from the\ncurrent time. Also, it provides a reliable prediction of up to 2 hours with\niterative forecasting. The superiority of PCT-CycleGAN is demonstrated through\nqualitative and quantitative comparisons with several previous methods.\n","authors":["Jaeho Choi","Yura Kim","Kwang-Ho Kim","Sung-Hwa Jung","Ikhyun Cho"],"pdf_url":"https://arxiv.org/pdf/2211.15046v5.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.10411v1","updated":"2023-08-21T01:35:06Z","published":"2023-08-21T01:35:06Z","title":"In-Rack Test Tube Pose Estimation Using RGB-D Data","summary":" Accurate robotic manipulation of test tubes in biology and medical industries\nis becoming increasingly important to address workforce shortages and improve\nworker safety. The detection and localization of test tubes are essential for\nthe robots to successfully manipulate test tubes. In this paper, we present a\nframework to detect and estimate poses for the in-rack test tubes using color\nand depth data. The methodology involves the utilization of a YOLO object\ndetector to effectively classify and localize both the test tubes and the tube\nracks within the provided image data. 
Subsequently, the pose of the tube rack\nis estimated through point cloud registration techniques. During the process of\nestimating the poses of the test tubes, we capitalize on constraints derived\nfrom the arrangement of rack slots. By employing an optimization-based\nalgorithm, we effectively evaluate and refine the pose of the test tubes. This\nstrategic approach ensures the robustness of pose estimation, even when\nconfronted with noisy and incomplete point cloud data.\n","authors":["Hao Chen","Weiwei Wan","Masaki Matsushita","Takeyuki Kotaka","Kensuke Harada"],"pdf_url":"https://arxiv.org/pdf/2308.10411v1.pdf","comment":"Submit to IEEE ROBIO 2023"},{"id":"http://arxiv.org/abs/2308.10408v1","updated":"2023-08-21T01:25:48Z","published":"2023-08-21T01:25:48Z","title":"Turning a CLIP Model into a Scene Text Spotter","summary":" We exploit the potential of the large-scale Contrastive Language-Image\nPretraining (CLIP) model to enhance scene text detection and spotting tasks,\ntransforming it into a robust backbone, FastTCM-CR50. This backbone utilizes\nvisual prompt learning and cross-attention in CLIP to extract image and\ntext-based prior knowledge. Using predefined and learnable prompts,\nFastTCM-CR50 introduces an instance-language matching process to enhance the\nsynergy between image and text embeddings, thereby refining text regions. Our\nBimodal Similarity Matching (BSM) module facilitates dynamic language prompt\ngeneration, enabling offline computations and improving performance.\nFastTCM-CR50 offers several advantages: 1) It can enhance existing text\ndetectors and spotters, improving performance by an average of 1.7% and 1.5%,\nrespectively. 2) It outperforms the previous TCM-CR50 backbone, yielding an\naverage improvement of 0.2% and 0.56% in text detection and spotting tasks,\nalong with a 48.5% increase in inference speed. 3) It showcases robust few-shot\ntraining capabilities. Utilizing only 10% of the supervised data, FastTCM-CR50\nimproves performance by an average of 26.5% and 5.5% for text detection and\nspotting tasks, respectively. 4) It consistently enhances performance on\nout-of-distribution text detection and spotting datasets, particularly the\nNightTime-ArT subset from ICDAR2019-ArT and the DOTA dataset for oriented\nobject detection. The code is available at https://github.com/wenwenyu/TCM.\n","authors":["Wenwen Yu","Yuliang Liu","Xingkui Zhu","Haoyu Cao","Xing Sun","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2308.10408v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2302.14338"},{"id":"http://arxiv.org/abs/2308.03282v2","updated":"2023-08-21T01:19:33Z","published":"2023-08-07T03:56:15Z","title":"Environment-Invariant Curriculum Relation Learning for Fine-Grained\n Scene Graph Generation","summary":" The scene graph generation (SGG) task is designed to identify the predicates\nbased on the subject-object pairs. However, existing datasets generally include\ntwo imbalance cases: one is the class imbalance from the predicted predicates\nand the other is the context imbalance from the given subject-object pairs, which\npresents significant challenges for SGG. Most existing methods focus on the\nimbalance of the predicted predicate while ignoring the imbalance of the\nsubject-object pairs, which cannot achieve satisfactory results. To address\nthe two imbalance cases, we propose a novel Environment Invariant Curriculum\nRelation learning (EICR) method, which can be applied in a plug-and-play\nfashion to existing SGG methods. 
Concretely, to remove the imbalance of the\nsubject-object pairs, we first construct different distribution environments\nfor the subject-object pairs and learn a model invariant to the environment\nchanges. Then, we construct a class-balanced curriculum learning strategy to\nbalance the different environments to remove the predicate imbalance.\nComprehensive experiments conducted on VG and GQA datasets demonstrate that our\nEICR framework can be taken as a general strategy for various SGG models, and\nachieve significant improvements.\n","authors":["Yukuan Min","Aming Wu","Cheng Deng"],"pdf_url":"https://arxiv.org/pdf/2308.03282v2.pdf","comment":"ICCV2023. arXiv admin note: text overlap with arXiv:2203.11654 by\n other authors"},{"id":"http://arxiv.org/abs/2303.08998v2","updated":"2023-08-21T01:04:50Z","published":"2023-03-16T00:06:28Z","title":"Unified Visual Relationship Detection with Vision and Language Models","summary":" This work focuses on training a single visual relationship detector\npredicting over the union of label spaces from multiple datasets. Merging\nlabels spanning different datasets could be challenging due to inconsistent\ntaxonomies. The issue is exacerbated in visual relationship detection when\nsecond-order visual semantics are introduced between pairs of objects. To\naddress this challenge, we propose UniVRD, a novel bottom-up method for Unified\nVisual Relationship Detection by leveraging vision and language models (VLMs).\nVLMs provide well-aligned image and text embeddings, where similar\nrelationships are optimized to be close to each other for semantic unification.\nOur bottom-up design enables the model to enjoy the benefit of training with\nboth object detection and visual relationship datasets. Empirical results on\nboth human-object interaction detection and scene-graph generation demonstrate\nthe competitive performance of our model. UniVRD achieves 38.07 mAP on\nHICO-DET, outperforming the current best bottom-up HOI detector by 14.26 mAP.\nMore importantly, we show that our unified detector performs as well as\ndataset-specific models in mAP, and achieves further improvements when we scale\nup the model. Our code will be made publicly available on GitHub.\n","authors":["Long Zhao","Liangzhe Yuan","Boqing Gong","Yin Cui","Florian Schroff","Ming-Hsuan Yang","Hartwig Adam","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2303.08998v2.pdf","comment":"Accepted to ICCV 2023. Code is available at\n https://github.com/google-research/scenic/tree/main/scenic/projects/univrd"},{"id":"http://arxiv.org/abs/2304.04909v2","updated":"2023-08-21T00:37:57Z","published":"2023-04-11T00:43:16Z","title":"SATR: Zero-Shot Semantic Segmentation of 3D Shapes","summary":" We explore the task of zero-shot semantic segmentation of 3D shapes by using\nlarge-scale off-the-shelf 2D image recognition models. Surprisingly, we find\nthat modern zero-shot 2D object detectors are better suited for this task than\ncontemporary text/image similarity predictors or even zero-shot 2D segmentation\nnetworks. Our key finding is that it is possible to extract accurate 3D\nsegmentation maps from multi-view bounding box predictions by using the\ntopological properties of the underlying surface. For this, we develop the\nSegmentation Assignment with Topological Reweighting (SATR) algorithm and\nevaluate it on ShapeNetPart and our proposed FAUST benchmarks. 
SATR achieves\nstate-of-the-art performance and outperforms a baseline algorithm by 1.3% and\n4% average mIoU on the FAUST coarse and fine-grained benchmarks, respectively,\nand by 5.2% average mIoU on the ShapeNetPart benchmark. Our source code and\ndata will be publicly released. Project webpage:\nhttps://samir55.github.io/SATR/.\n","authors":["Ahmed Abdelreheem","Ivan Skorokhodov","Maks Ovsjanikov","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2304.04909v2.pdf","comment":"Project webpage: https://samir55.github.io/SATR/"},{"id":"http://arxiv.org/abs/2308.10402v1","updated":"2023-08-21T00:32:19Z","published":"2023-08-21T00:32:19Z","title":"Simple Baselines for Interactive Video Retrieval with Questions and\n Answers","summary":" To date, the majority of video retrieval systems have been optimized for a\n\"single-shot\" scenario in which the user submits a query in isolation, ignoring\nprevious interactions with the system. Recently, there has been renewed\ninterest in interactive systems to enhance retrieval, but existing approaches\nare complex and deliver limited gains in performance. In this work, we revisit\nthis topic and propose several simple yet effective baselines for interactive\nvideo retrieval via question-answering. We employ a VideoQA model to simulate\nuser interactions and show that this enables the productive study of the\ninteractive retrieval task without access to ground truth dialogue data.\nExperiments on MSR-VTT, MSVD, and AVSD show that our framework using\nquestion-based interaction significantly improves the performance of text-based\nvideo retrieval systems.\n","authors":["Kaiqu Liang","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2308.10402v1.pdf","comment":"ICCV 2023, project page:\n https://github.com/kevinliang888/IVR-QA-baselines"},{"id":"http://arxiv.org/abs/2307.03275v2","updated":"2023-08-21T23:56:44Z","published":"2023-07-06T20:23:39Z","title":"To pretrain or not to pretrain? A case study of domain-specific\n pretraining for semantic segmentation in histopathology","summary":" Annotating medical imaging datasets is costly, so fine-tuning (or transfer\nlearning) is the most effective method for digital pathology vision\napplications such as disease classification and semantic segmentation. However,\ndue to texture bias in models trained on real-world images, transfer learning\nfor histopathology applications might result in underperforming models, which\nnecessitates the need for using unlabeled histopathology data and\nself-supervised methods to discover domain-specific characteristics. Here, we\ntested the premise that histopathology-specific pretrained models provide\nbetter initializations for pathology vision tasks, i.e., gland and cell\nsegmentation. In this study, we compare the performance of gland and cell\nsegmentation tasks with histopathology domain-specific and non-domain-specific\n(real-world images) pretrained weights. Moreover, we investigate the dataset\nsize at which domain-specific pretraining produces significant gains in\nperformance. In addition, we investigated whether domain-specific\ninitialization improves the effectiveness of out-of-distribution testing on\ndistinct datasets but the same task. The results indicate that performance gain\nusing domain-specific pretrained weights depends on both the task and the size\nof the training dataset. 
In instances with limited dataset sizes, a significant\nimprovement in gland segmentation performance was also observed, whereas models\ntrained on cell segmentation datasets exhibit no improvement.\n","authors":["Tushar Kataria","Beatrice Knudsen","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2307.03275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11079v1","updated":"2023-08-21T23:16:58Z","published":"2023-08-21T23:16:58Z","title":"Long-Term Prediction of Natural Video Sequences with Robust Video\n Predictors","summary":" Predicting high dimensional video sequences is a curiously difficult problem.\nThe number of possible futures for a given video sequence grows exponentially\nover time due to uncertainty. This is especially evident when trying to predict\ncomplicated natural video scenes from a limited snapshot of the world. The\ninherent uncertainty accumulates the further into the future you predict making\nlong-term prediction very difficult. In this work we introduce a number of\nimprovements to existing work that aid in creating Robust Video Predictors\n(RoViPs). We show that with a combination of deep Perceptual and\nuncertainty-based reconstruction losses we are able to create high quality\nshort-term predictions. Attention-based skip connections are utilised to allow\nfor long range spatial movement of input features to further improve\nperformance. Finally, we show that by simply making the predictor robust to its\nown prediction errors, it is possible to produce very long, realistic natural\nvideo sequences using an iterated single-step prediction task.\n","authors":["Luke Ditria","Tom Drummond"],"pdf_url":"https://arxiv.org/pdf/2308.11079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11073v1","updated":"2023-08-21T22:43:47Z","published":"2023-08-21T22:43:47Z","title":"Audio-Visual Class-Incremental Learning","summary":" In this paper, we introduce audio-visual class-incremental learning, a\nclass-incremental learning scenario for audio-visual video recognition. We\ndemonstrate that joint audio-visual modeling can improve class-incremental\nlearning, but current methods fail to preserve semantic similarity between\naudio and visual features as incremental step grows. Furthermore, we observe\nthat audio-visual correlations learned in previous tasks can be forgotten as\nincremental steps progress, leading to poor performance. To overcome these\nchallenges, we propose AV-CIL, which incorporates Dual-Audio-Visual Similarity\nConstraint (D-AVSC) to maintain both instance-aware and class-aware semantic\nsimilarity between audio-visual modalities and Visual Attention Distillation\n(VAD) to retain previously learned audio-guided visual attentive ability. We\ncreate three audio-visual class-incremental datasets, AVE-Class-Incremental\n(AVE-CI), Kinetics-Sounds-Class-Incremental (K-S-CI), and\nVGGSound100-Class-Incremental (VS100-CI) based on the AVE, Kinetics-Sounds, and\nVGGSound datasets, respectively. 
Our experiments on AVE-CI, K-S-CI, and\nVS100-CI demonstrate that AV-CIL significantly outperforms existing\nclass-incremental learning methods in audio-visual class-incremental learning.\nCode and data are available at: https://github.com/weiguoPian/AV-CIL_ICCV2023.\n","authors":["Weiguo Pian","Shentong Mo","Yunhui Guo","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2308.11073v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11072v1","updated":"2023-08-21T22:42:55Z","published":"2023-08-21T22:42:55Z","title":"TeD-SPAD: Temporal Distinctiveness for Self-supervised\n Privacy-preservation for video Anomaly Detection","summary":" Video anomaly detection (VAD) without human monitoring is a complex computer\nvision task that can have a positive impact on society if implemented\nsuccessfully. While recent advances have made significant progress in solving\nthis task, most existing approaches overlook a critical real-world concern:\nprivacy. With the increasing popularity of artificial intelligence\ntechnologies, it becomes crucial to implement proper AI ethics into their\ndevelopment. Privacy leakage in VAD allows models to pick up and amplify\nunnecessary biases related to people's personal information, which may lead to\nundesirable decision making. In this paper, we propose TeD-SPAD, a\nprivacy-aware video anomaly detection framework that destroys visual private\ninformation in a self-supervised manner. In particular, we propose the use of a\ntemporally-distinct triplet loss to promote temporally discriminative features,\nwhich complements current weakly-supervised VAD methods. Using TeD-SPAD, we\nachieve a positive trade-off between privacy protection and utility anomaly\ndetection performance on three popular weakly supervised VAD datasets:\nUCF-Crime, XD-Violence, and ShanghaiTech. Our proposed anonymization model\nreduces private attribute prediction by 32.25% while only reducing frame-level\nROC AUC on the UCF-Crime anomaly detection dataset by 3.69%. Project Page:\nhttps://joefioresi718.github.io/TeD-SPAD_webpage/\n","authors":["Joseph Fioresi","Ishan Rajendrakumar Dave","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2308.11072v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2208.07174v3","updated":"2023-08-21T22:42:48Z","published":"2022-08-15T13:21:41Z","title":"A Man-in-the-Middle Attack against Object Detection Systems","summary":" Object detection systems using deep learning models have become increasingly\npopular in robotics thanks to the rising power of CPUs and GPUs in embedded\nsystems. However, these models are susceptible to adversarial attacks. While\nsome attacks are limited by strict assumptions on access to the detection\nsystem, we propose a novel hardware attack inspired by Man-in-the-Middle\nattacks in cryptography. This attack generates a Universal Adversarial\nPerturbation (UAP) and then injects the perturbation between the USB camera and\nthe detection system via a hardware attack. Moreover, prior research is misled\nby an evaluation metric that measures the model accuracy rather than the attack\nperformance. In combination with our proposed evaluation metrics, we\nsignificantly increase the strength of adversarial perturbations. 
These\nfindings raise serious concerns for applications of deep learning models in\nsafety-critical systems, such as autonomous driving.\n","authors":["Han Wu","Sareh Rowlands","Johan Wahlstrom"],"pdf_url":"https://arxiv.org/pdf/2208.07174v3.pdf","comment":"6 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.03273v3","updated":"2023-08-21T22:39:42Z","published":"2023-07-06T20:21:12Z","title":"ADASSM: Adversarial Data Augmentation in Statistical Shape Models From\n Images","summary":" Statistical shape models (SSM) have been well-established as an excellent\ntool for identifying variations in the morphology of anatomy across the\nunderlying population. Shape models use consistent shape representation across\nall the samples in a given cohort, which helps to compare shapes and identify\nthe variations that can detect pathologies and help in formulating treatment\nplans. In medical imaging, computing these shape representations from CT/MRI\nscans requires time-intensive preprocessing operations, including but not\nlimited to anatomy segmentation annotations, registration, and texture\ndenoising. Deep learning models have demonstrated exceptional capabilities in\nlearning shape representations directly from volumetric images, giving rise to\nhighly effective and efficient Image-to-SSM networks. Nevertheless, these\nmodels are data-hungry and due to the limited availability of medical data,\ndeep learning models tend to overfit. Offline data augmentation techniques,\nthat use kernel density estimation based (KDE) methods for generating\nshape-augmented samples, have successfully aided Image-to-SSM networks in\nachieving comparable accuracy to traditional SSM methods. However, these\naugmentation methods focus on shape augmentation, whereas deep learning models\nexhibit image-based texture bias resulting in sub-optimal models. This paper\nintroduces a novel strategy for on-the-fly data augmentation for the\nImage-to-SSM framework by leveraging data-dependent noise generation or texture\naugmentation. The proposed framework is trained as an adversary to the\nImage-to-SSM network, augmenting diverse and challenging noisy samples. Our\napproach achieves improved accuracy by encouraging the model to focus on the\nunderlying geometry rather than relying solely on pixel values.\n","authors":["Mokshagna Sai Teja Karanam","Tushar Kataria","Krithika Iyer","Shireen Elhabian"],"pdf_url":"https://arxiv.org/pdf/2307.03273v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11070v1","updated":"2023-08-21T22:31:54Z","published":"2023-08-21T22:31:54Z","title":"Temporal-Distributed Backdoor Attack Against Video Based Action\n Recognition","summary":" Deep neural networks (DNNs) have achieved tremendous success in various\napplications including video action recognition, yet remain vulnerable to\nbackdoor attacks (Trojans). The backdoor-compromised model will mis-classify to\nthe target class chosen by the attacker when a test instance (from a non-target\nclass) is embedded with a specific trigger, while maintaining high accuracy on\nattack-free instances. Although there are extensive studies on backdoor attacks\nagainst image data, the susceptibility of video-based systems under backdoor\nattacks remains largely unexplored. Current studies are direct extensions of\napproaches proposed for image data, e.g., the triggers are\n\\textbf{independently} embedded within the frames, which tend to be detectable\nby existing defenses. 
In this paper, we introduce a \\textit{simple} yet\n\\textit{effective} backdoor attack against video data. Our proposed attack,\nadding perturbations in a transformed domain, plants an \\textbf{imperceptible,\ntemporally distributed} trigger across the video frames, and is shown to be\nresilient to existing defensive strategies. The effectiveness of the proposed\nattack is demonstrated by extensive experiments with various well-known models\non two video recognition benchmarks, UCF101 and HMDB51, and a sign language\nrecognition benchmark, Greek Sign Language (GSL) dataset. We delve into the\nimpact of several influential factors on our proposed attack and identify an\nintriguing effect termed \"collateral damage\" through extensive studies.\n","authors":["Xi Li","Songhe Wang","Ruiquan Huang","Mahanth Gowda","George Kesidis"],"pdf_url":"https://arxiv.org/pdf/2308.11070v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11063v1","updated":"2023-08-21T22:16:49Z","published":"2023-08-21T22:16:49Z","title":"MetaGCD: Learning to Continually Learn in Generalized Category Discovery","summary":" In this paper, we consider a real-world scenario where a model that is\ntrained on pre-defined classes continually encounters unlabeled data that\ncontains both known and novel classes. The goal is to continually discover\nnovel classes while maintaining the performance in known classes. We name the\nsetting Continual Generalized Category Discovery (C-GCD). Existing methods for\nnovel class discovery cannot directly handle the C-GCD setting due to some\nunrealistic assumptions, such as the unlabeled data only containing novel\nclasses. Furthermore, they fail to discover novel classes in a continual\nfashion. In this work, we lift all these assumptions and propose an approach,\ncalled MetaGCD, to learn how to incrementally discover with less forgetting.\nOur proposed method uses a meta-learning framework and leverages the offline\nlabeled data to simulate the testing incremental learning process. A\nmeta-objective is defined to revolve around two conflicting learning objectives\nto achieve novel class discovery without forgetting. Furthermore, a soft\nneighborhood-based contrastive network is proposed to discriminate uncorrelated\nimages while attracting correlated images. We build strong baselines and\nconduct extensive experiments on three widely used benchmarks to demonstrate\nthe superiority of our method.\n","authors":["Yanan Wu","Zhixiang Chi","Yang Wang","Songhe Feng"],"pdf_url":"https://arxiv.org/pdf/2308.11063v1.pdf","comment":"This paper has been accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.11062v1","updated":"2023-08-21T22:15:20Z","published":"2023-08-21T22:15:20Z","title":"UnLoc: A Unified Framework for Video Localization Tasks","summary":" While large-scale image-text pretrained models such as CLIP have been used\nfor multiple video-level tasks on trimmed videos, their use for temporal\nlocalization in untrimmed videos is still a relatively unexplored task. We\ndesign a new approach for this called UnLoc, which uses pretrained image and\ntext towers, and feeds tokens to a video-text fusion model. The output of the\nfusion module are then used to construct a feature pyramid in which each level\nconnects to a head to predict a per-frame relevancy score and start/end time\ndisplacements. 
Unlike previous works, our architecture enables Moment\nRetrieval, Temporal Localization, and Action Segmentation with a single stage\nmodel, without the need for action proposals, motion based pretrained features\nor representation masking. Unlike specialized models, we achieve state of the\nart results on all three different localization tasks with a unified approach.\nCode will be available at: \\url{https://github.com/google-research/scenic}.\n","authors":["Shen Yan","Xuehan Xiong","Arsha Nagrani","Anurag Arnab","Zhonghao Wang","Weina Ge","David Ross","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2308.11062v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2210.01249v2","updated":"2023-08-21T21:47:31Z","published":"2022-10-03T22:04:00Z","title":"LOPR: Latent Occupancy PRediction using Generative Models","summary":" Environment prediction frameworks are integral for autonomous vehicles,\nenabling safe navigation in dynamic environments. LiDAR generated occupancy\ngrid maps (L-OGMs) offer a robust bird's eye-view scene representation that\nfacilitates joint scene predictions without relying on manual labeling unlike\ncommonly used trajectory prediction frameworks. Prior approaches have optimized\ndeterministic L-OGM prediction architectures directly in grid cell space. While\nthese methods have achieved some degree of success in prediction, they\noccasionally grapple with unrealistic and incorrect predictions. We claim that\nthe quality and realism of the forecasted occupancy grids can be enhanced with\nthe use of generative models. We propose a framework that decouples occupancy\nprediction into: representation learning and stochastic prediction within the\nlearned latent space. Our approach allows for conditioning the model on other\navailable sensor modalities such as RGB-cameras and high definition maps. We\ndemonstrate that our approach achieves state-of-the-art performance and is\nreadily transferable between different robotic platforms on the real-world\nNuScenes, Waymo Open, and a custom dataset we collected on an experimental\nvehicle platform.\n","authors":["Bernard Lange","Masha Itkina","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2210.01249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11052v1","updated":"2023-08-21T21:30:48Z","published":"2023-08-21T21:30:48Z","title":"Beyond Discriminative Regions: Saliency Maps as Alternatives to CAMs for\n Weakly Supervised Semantic Segmentation","summary":" In recent years, several Weakly Supervised Semantic Segmentation (WS3)\nmethods have been proposed that use class activation maps (CAMs) generated by a\nclassifier to produce pseudo-ground truths for training segmentation models.\nWhile CAMs are good at highlighting discriminative regions (DR) of an image,\nthey are known to disregard regions of the object that do not contribute to the\nclassifier's prediction, termed non-discriminative regions (NDR). In contrast,\nattribution methods such as saliency maps provide an alternative approach for\nassigning a score to every pixel based on its contribution to the\nclassification prediction. This paper provides a comprehensive comparison\nbetween saliencies and CAMs for WS3. Our study includes multiple perspectives\non understanding their similarities and dissimilarities. Moreover, we provide\nnew evaluation metrics that perform a comprehensive assessment of WS3\nperformance of alternative methods w.r.t. CAMs. 
We demonstrate the\neffectiveness of saliencies in addressing the limitation of CAMs through our\nempirical studies on benchmark datasets. Furthermore, we propose random\ncropping as a stochastic aggregation technique that improves the performance of\nsaliency, making it a strong alternative to CAM for WS3.\n","authors":["M. Maruf","Arka Daw","Amartya Dutta","Jie Bu","Anuj Karpatne"],"pdf_url":"https://arxiv.org/pdf/2308.11052v1.pdf","comment":"24 pages, 13 figures, 4 tables"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.10837v1","updated":"2023-08-21T16:39:11Z","published":"2023-08-21T16:39:11Z","title":"Leveraging Large Language Models for Pre-trained Recommender Systems","summary":" Recent advancements in recommendation systems have shifted towards more\ncomprehensive and personalized recommendations by utilizing large language\nmodels (LLM). However, effectively integrating LLM's commonsense knowledge and\nreasoning abilities into recommendation systems remains a challenging problem.\nIn this paper, we propose RecSysLLM, a novel pre-trained recommendation model\nbased on LLMs. RecSysLLM retains LLM reasoning and knowledge while integrating\nrecommendation domain knowledge through unique designs of data, training, and\ninference. This allows RecSysLLM to leverage LLMs' capabilities for\nrecommendation tasks in an efficient, unified framework. We demonstrate the\neffectiveness of RecSysLLM on benchmarks and real-world scenarios. RecSysLLM\nprovides a promising approach to developing unified recommendation systems by\nfully exploiting the power of pre-trained language models.\n","authors":["Zhixuan Chu","Hongyan Hao","Xin Ouyang","Simeng Wang","Yan Wang","Yue Shen","Jinjie Gu","Qing Cui","Longfei Li","Siqiao Xue","James Y Zhang","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.10837v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.10835v1","updated":"2023-08-21T16:35:19Z","published":"2023-08-21T16:35:19Z","title":"Enhancing Recommender Systems with Large Language Model Reasoning Graphs","summary":" Recommendation systems aim to provide users with relevant suggestions, but\noften lack interpretability and fail to capture higher-level semantic\nrelationships between user behaviors and profiles. In this paper, we propose a\nnovel approach that leverages large language models (LLMs) to construct\npersonalized reasoning graphs. These graphs link a user's profile and\nbehavioral sequences through causal and logical inferences, representing the\nuser's interests in an interpretable way. Our approach, LLM reasoning graphs\n(LLMRG), has four components: chained graph reasoning, divergent extension,\nself-verification and scoring, and knowledge base self-improvement. The\nresulting reasoning graph is encoded using graph neural networks, which serves\nas additional input to improve conventional recommender systems, without\nrequiring extra user or item information. Our approach demonstrates how LLMs\ncan enable more logical and interpretable recommender systems through\npersonalized reasoning graphs. LLMRG allows recommendations to benefit from\nboth engineered recommendation systems and LLM-derived reasoning graphs. 
We\ndemonstrate the effectiveness of LLMRG on benchmarks and real-world scenarios\nin enhancing base recommendation models.\n","authors":["Yan Wang","Zhixuan Chu","Xin Ouyang","Simeng Wang","Hongyan Hao","Yue Shen","Jinjie Gu","Siqiao Xue","James Y Zhang","Qing Cui","Longfei Li","Jun Zhou","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2308.10835v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.10807v1","updated":"2023-08-21T15:56:05Z","published":"2023-08-21T15:56:05Z","title":"DynED: Dynamic Ensemble Diversification in Data Stream Classification","summary":" Ensemble methods are commonly used in classification due to their remarkable\nperformance. Achieving high accuracy in a data stream environment is a\nchallenging task considering disruptive changes in the data distribution, also\nknown as concept drift. A greater diversity of ensemble components is known to\nenhance prediction accuracy in such settings. Despite the diversity of\ncomponents within an ensemble, not all contribute as expected to its overall\nperformance. This necessitates a method for selecting components that exhibit\nhigh performance and diversity. We present a novel ensemble construction and\nmaintenance approach based on MMR (Maximal Marginal Relevance) that dynamically\ncombines the diversity and prediction accuracy of components during the process\nof structuring an ensemble. The experimental results on both four real and 11\nsynthetic datasets demonstrate that the proposed approach (DynED) provides a\nhigher average mean accuracy compared to the five state-of-the-art baselines.\n","authors":["Soheil Abadifard","Sepehr Bakhshi","Sanaz Gheibuni","Fazli Can"],"pdf_url":"https://arxiv.org/pdf/2308.10807v1.pdf","comment":"Proceedings of the 32nd ACM International Conference on Information\n and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United\n Kingdom"},{"id":"http://arxiv.org/abs/2308.10801v1","updated":"2023-08-21T15:48:38Z","published":"2023-08-21T15:48:38Z","title":"LSCPM: communities in massive real-world Link Streams by Clique\n Percolation Method","summary":" Community detection is a popular approach to understand the organization of\ninteractions in static networks. For that purpose, the Clique Percolation\nMethod (CPM), which involves the percolation of k-cliques, is a well-studied\ntechnique that offers several advantages. Besides, studying interactions that\noccur over time is useful in various contexts, which can be modeled by the link\nstream formalism. The Dynamic Clique Percolation Method (DCPM) has been\nproposed for extending CPM to temporal networks.\n However, existing implementations are unable to handle massive datasets. We\npresent a novel algorithm that adapts CPM to link streams, which has the\nadvantage that it allows us to speed up the computation time with respect to\nthe existing DCPM method. We evaluate it experimentally on real datasets and\nshow that it scales to massive link streams. For example, it allows to obtain a\ncomplete set of communities in under twenty-five minutes for a dataset with\nthirty million links, what the state of the art fails to achieve even after a\nweek of computation. We further show that our method provides communities\nsimilar to DCPM, but slightly more aggregated. 
We exhibit the relevance of the\nobtained communities in real world cases, and show that they provide\ninformation on the importance of vertices in the link streams.\n","authors":["Alexis Baudin","Lionel Tabourier","Clémence Magnien"],"pdf_url":"https://arxiv.org/pdf/2308.10801v1.pdf","comment":"18 pages, 7 figures, to be published in 30th International Symposium\n on Temporal Representation and Reasoning (TIME 2023)"},{"id":"http://arxiv.org/abs/2308.10778v1","updated":"2023-08-21T15:09:19Z","published":"2023-08-21T15:09:19Z","title":"A Topology-aware Analysis of Graph Collaborative Filtering","summary":" The successful integration of graph neural networks into recommender systems\n(RSs) has led to a novel paradigm in collaborative filtering (CF), graph\ncollaborative filtering (graph CF). By representing user-item data as an\nundirected, bipartite graph, graph CF utilizes short- and long-range\nconnections to extract collaborative signals that yield more accurate user\npreferences than traditional CF methods. Although the recent literature\nhighlights the efficacy of various algorithmic strategies in graph CF, the\nimpact of datasets and their topological features on recommendation performance\nis yet to be studied. To fill this gap, we propose a topology-aware analysis of\ngraph CF. In this study, we (i) take some widely-adopted recommendation\ndatasets and use them to generate a large set of synthetic sub-datasets through\ntwo state-of-the-art graph sampling methods, (ii) measure eleven of their\nclassical and topological characteristics, and (iii) estimate the accuracy\ncalculated on the generated sub-datasets considering four popular and recent\ngraph-based RSs (i.e., LightGCN, DGCF, UltraGCN, and SVD-GCN). Finally, the\ninvestigation presents an explanatory framework that reveals the linear\nrelationships between characteristics and accuracy measures. The results,\nstatistically validated under different graph sampling settings, confirm the\nexistence of solid dependencies between topological characteristics and\naccuracy in the graph-based recommendation, offering a new perspective on how\nto interpret graph CF.\n","authors":["Daniele Malitesta","Claudio Pomo","Vito Walter Anelli","Alberto Carlo Maria Mancino","Eugenio Di Sciascio","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2308.10778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10758v1","updated":"2023-08-21T14:44:31Z","published":"2023-08-21T14:44:31Z","title":"DepreSym: A Depression Symptom Annotated Corpus and the Role of LLMs as\n Assessors of Psychological Markers","summary":" Computational methods for depression detection aim to mine traces of\ndepression from online publications posted by Internet users. However,\nsolutions trained on existing collections exhibit limited generalisation and\ninterpretability. To tackle these issues, recent studies have shown that\nidentifying depressive symptoms can lead to more robust models. The eRisk\ninitiative fosters research on this area and has recently proposed a new\nranking task focused on developing search methods to find sentences related to\ndepressive symptoms. This search challenge relies on the symptoms specified by\nthe Beck Depression Inventory-II (BDI-II), a questionnaire widely used in\nclinical practice. Based on the participant systems' results, we present the\nDepreSym dataset, consisting of 21580 sentences annotated according to their\nrelevance to the 21 BDI-II symptoms. 
The labelled sentences come from a pool of\ndiverse ranking methods, and the final dataset serves as a valuable resource\nfor advancing the development of models that incorporate depressive markers\nsuch as clinical symptoms. Due to the complex nature of this relevance\nannotation, we designed a robust assessment methodology carried out by three\nexpert assessors (including an expert psychologist). Additionally, we explore\nhere the feasibility of employing recent Large Language Models (ChatGPT and\nGPT4) as potential assessors in this complex task. We undertake a comprehensive\nexamination of their performance, determine their main limitations and analyze\ntheir role as a complement or replacement for human annotators.\n","authors":["Anxo Pérez","Marcos Fernández-Pichel","Javier Parapar","David E. Losada"],"pdf_url":"https://arxiv.org/pdf/2308.10758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10685v1","updated":"2023-08-21T12:38:53Z","published":"2023-08-21T12:38:53Z","title":"Contrastive Graph Prompt-tuning for Cross-domain Recommendation","summary":" Recommender systems are frequently challenged by the data sparsity problem.\nOne approach to mitigate this issue is through cross-domain recommendation\ntechniques. In a cross-domain context, sharing knowledge between domains can\nenhance the effectiveness in the target domain. Recent cross-domain methods\nhave employed a pre-training approach, but we argue that these methods often\nresult in suboptimal fine-tuning, especially with large neural models. Modern\nlanguage models utilize prompts for efficient model tuning. Such prompts act as\na tunable latent vector, allowing for the freezing of the main model\nparameters. In our research, we introduce the Personalised Graph Prompt-based\nRecommendation (PGPRec) framework. This leverages the advantages of\nprompt-tuning. Within this framework, we formulate personalized graph prompts\nitem-wise, rooted in items that a user has previously engaged with.\nSpecifically, we employ Contrastive Learning (CL) to produce pre-trained\nembeddings that offer greater generalizability in the pre-training phase,\nensuring robust training during the tuning phase. Our evaluation of PGPRec in\ncross-domain scenarios involves comprehensive testing with the top-k\nrecommendation tasks and a cold-start analysis. Our empirical findings, based\non four Amazon Review datasets, reveal that the PGPRec framework can decrease\nthe tuned parameters by as much as 74%, maintaining competitive performance.\nRemarkably, there's an 11.41% enhancement in performance against the leading\nbaseline in cold-start situations.\n","authors":["Zixuan Yi","Iadh Ounis","Craig Macdonald"],"pdf_url":"https://arxiv.org/pdf/2308.10685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05016v2","updated":"2023-08-21T12:00:47Z","published":"2022-09-12T04:13:49Z","title":"FiBiNet++: Reducing Model Size by Low Rank Feature Interaction Layer for\n CTR Prediction","summary":" Click-Through Rate (CTR) estimation has become one of the most fundamental\ntasks in many real-world applications and various deep models have been\nproposed. Some research has proved that FiBiNet is one of the best performance\nmodels and outperforms all other models on Avazu dataset. However, the large\nmodel size of FiBiNet hinders its wider application. In this paper, we propose\na novel FiBiNet++ model to redesign FiBiNet's model structure, which greatly\nreduces model size while further improves its performance. 
One of the primary\ntechniques involves our proposed \"Low Rank Layer\" focused on feature\ninteraction, which serves as a crucial driver of achieving a superior\ncompression ratio for models. Extensive experiments on three public datasets\nshow that FiBiNet++ effectively reduces non-embedding model parameters of\nFiBiNet by 12x to 16x on three datasets. On the other hand, FiBiNet++ leads to\nsignificant performance improvements compared to state-of-the-art CTR methods,\nincluding FiBiNet.\n","authors":["Pengtao Zhang","Zheng Zheng","Junlin Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.05016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16527v2","updated":"2023-08-21T09:35:52Z","published":"2023-06-21T14:01:01Z","title":"OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text\n Documents","summary":" Large multimodal models trained on natural documents, which interleave images\nand text, outperform models trained on image-text pairs on various multimodal\nbenchmarks. However, the datasets used to train these models have not been\nreleased, and the collection process has not been fully specified. We introduce\nthe OBELICS dataset, an open web-scale filtered dataset of interleaved\nimage-text documents comprising 141 million web pages extracted from Common\nCrawl, 353 million associated images, and 115 billion text tokens. We describe\nthe dataset creation process, present comprehensive filtering rules, and\nprovide an analysis of the dataset's content. To show the viability of OBELICS,\nwe train vision and language models of 9 and 80 billion parameters named\nIDEFICS, and obtain competitive performance on different multimodal benchmarks.\nWe release our dataset, models and code.\n","authors":["Hugo Laurençon","Lucile Saulnier","Léo Tronchon","Stas Bekman","Amanpreet Singh","Anton Lozhkov","Thomas Wang","Siddharth Karamcheti","Alexander M. Rush","Douwe Kiela","Matthieu Cord","Victor Sanh"],"pdf_url":"https://arxiv.org/pdf/2306.16527v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14640v2","updated":"2023-08-21T08:37:37Z","published":"2023-02-28T15:18:42Z","title":"Meta-Learning with Adaptive Weighted Loss for Imbalanced Cold-Start\n Recommendation","summary":" Sequential recommenders have made great strides in capturing a user's\npreferences. Nevertheless, the cold-start recommendation remains a fundamental\nchallenge as they typically involve limited user-item interactions for\npersonalization. Recently, gradient-based meta-learning approaches have emerged\nin the sequential recommendation field due to their fast adaptation and\neasy-to-integrate abilities. The meta-learning algorithms formulate the\ncold-start recommendation as a few-shot learning problem, where each user is\nrepresented as a task to be adapted. While meta-learning algorithms generally\nassume that task-wise samples are evenly distributed over classes or values,\nuser-item interactions in real-world applications do not conform to such a\ndistribution (e.g., watching favorite videos multiple times, leaving only\npositive ratings without any negative ones). Consequently, imbalanced user\nfeedback, which accounts for the majority of task training data, may dominate\nthe user adaptation process and prevent meta-learning algorithms from learning\nmeaningful meta-knowledge for personalized recommendations. 
To alleviate this\nlimitation, we propose a novel sequential recommendation framework based on\ngradient-based meta-learning that captures the imbalanced rating distribution\nof each user and computes adaptive loss for user-specific learning. Our work is\nthe first to tackle the impact of imbalanced ratings in cold-start sequential\nrecommendation scenarios. Through extensive experiments conducted on real-world\ndatasets, we demonstrate the effectiveness of our framework.\n","authors":["Minchang Kim","Yongjin Yang","Jung Hyun Ryu","Taesup Kim"],"pdf_url":"https://arxiv.org/pdf/2302.14640v2.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2308.10549v1","updated":"2023-08-21T08:05:16Z","published":"2023-08-21T08:05:16Z","title":"Evaluating Temporal Persistence Using Replicability Measures","summary":" In real-world Information Retrieval (IR) experiments, the Evaluation\nEnvironment (EE) is exposed to constant change. Documents are added, removed,\nor updated, and the information need and the search behavior of users is\nevolving. Simultaneously, IR systems are expected to retain a consistent\nquality. The LongEval Lab seeks to investigate the longitudinal persistence of\nIR systems, and in this work, we describe our participation. We submitted runs\nof five advanced retrieval systems, namely a Reciprocal Rank Fusion (RRF)\napproach, ColBERT, monoT5, Doc2Query, and E5, to both sub-tasks. Further, we\ncast the longitudinal evaluation as a replicability study to better understand\nthe temporal change observed. As a result, we quantify the persistence of the\nsubmitted runs and see great potential in this evaluation method.\n","authors":["Jüri Keller","Timo Breuer","Philipp Schaer"],"pdf_url":"https://arxiv.org/pdf/2308.10549v1.pdf","comment":"To be published in Proceedings of the Working Notes of CLEF 2023 -\n Conference and Labs of the Evaluation Forum, Thessaloniki, Greece 18 - 21,\n 2023"},{"id":"http://arxiv.org/abs/2308.10527v1","updated":"2023-08-21T07:26:09Z","published":"2023-08-21T07:26:09Z","title":"DPAN: Dynamic Preference-based and Attribute-aware Network for Relevant\n Recommendations","summary":" In e-commerce platforms, the relevant recommendation is a unique scenario\nproviding related items for a trigger item that users are interested in.\nHowever, users' preferences for the similarity and diversity of recommendation\nresults are dynamic and vary under different conditions. Moreover, individual\nitem-level diversity is too coarse-grained since all recommended items are\nrelated to the trigger item. Thus, the two main challenges are to learn\nfine-grained representations of similarity and diversity and capture users'\ndynamic preferences for them under different conditions. To address these\nchallenges, we propose a novel method called the Dynamic Preference-based and\nAttribute-aware Network (DPAN) for predicting Click-Through Rate (CTR) in\nrelevant recommendations. Specifically, based on Attribute-aware Activation\nValues Generation (AAVG), Bi-dimensional Compression-based Re-expression (BCR)\nis designed to obtain similarity and diversity representations of user\ninterests and item information. Then Shallow and Deep Union-based Fusion (SDUF)\nis proposed to capture users' dynamic preferences for the diverse degree of\nrecommendation results according to various conditions. DPAN has demonstrated\nits effectiveness through extensive offline experiments and online A/B testing,\nresulting in a significant 7.62% improvement in CTR. 
Currently, DPAN has been\nsuccessfully deployed on our e-commerce platform serving the primary traffic\nfor relevant recommendations. The code of DPAN has been made publicly\navailable.\n","authors":["Wei Dai","Yingmin Su","Xiaofeng Pan"],"pdf_url":"https://arxiv.org/pdf/2308.10527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.01160v2","updated":"2023-08-21T06:40:41Z","published":"2022-10-31T01:45:42Z","title":"A Profit-Maximizing Strategy for Advertising on the e-Commerce Platforms","summary":" The online advertising management platform has become increasingly popular\namong e-commerce vendors/advertisers, offering a streamlined approach to reach\ntarget customers. Despite its advantages, configuring advertising strategies\ncorrectly remains a challenge for online vendors, particularly those with\nlimited resources. Ineffective strategies often result in a surge of\nunproductive ``just looking'' clicks, leading to disproportionately high\nadvertising expenses comparing to the growth of sales. In this paper, we\npresent a novel profit-maximing strategy for targeting options of online\nadvertising. The proposed model aims to find the optimal set of features to\nmaximize the probability of converting targeted audiences into actual buyers.\nWe address the optimization challenge by reformulating it as a multiple-choice\nknapsack problem (MCKP). We conduct an empirical study featuring real-world\ndata from Tmall to show that our proposed method can effectively optimize the\nadvertising strategy with budgetary constraints.\n","authors":["Lianghai Xiao","Yixing Zhao","Jiwei Chen"],"pdf_url":"https://arxiv.org/pdf/2211.01160v2.pdf","comment":"Online advertising campaigns"},{"id":"http://arxiv.org/abs/2301.00280v2","updated":"2023-08-21T05:46:48Z","published":"2022-12-31T20:04:31Z","title":"RECOMED: A Comprehensive Pharmaceutical Recommendation System","summary":" A comprehensive pharmaceutical recommendation system was designed based on\nthe patients and drugs features extracted from Drugs.com and Druglib.com.\nFirst, data from these databases were combined, and a dataset of patients and\ndrug information was built. Secondly, the patients and drugs were clustered,\nand then the recommendation was performed using different ratings provided by\npatients, and importantly by the knowledge obtained from patients and drug\nspecifications, and considering drug interactions. To the best of our\nknowledge, we are the first group to consider patients conditions and history\nin the proposed approach for selecting a specific medicine appropriate for that\nparticular user. Our approach applies artificial intelligence (AI) models for\nthe implementation. Sentiment analysis using natural language processing\napproaches is employed in pre-processing along with neural network-based\nmethods and recommender system algorithms for modeling the system. In our work,\npatients conditions and drugs features are used for making two models based on\nmatrix factorization. Then we used drug interaction to filter drugs with severe\nor mild interactions with other drugs. We developed a deep learning model for\nrecommending drugs by using data from 2304 patients as a training set, and then\nwe used data from 660 patients as our validation set. After that, we used\nknowledge from critical information about drugs and combined the outcome of the\nmodel into a knowledge-based system with the rules obtained from constraints on\ntaking medicine.\n","authors":["Mariam Zomorodi","Ismail Ghodsollahee","Jennifer H. 
Martin","Nicholas J. Talley","Vahid Salari","Pawel Plawiak","Kazem Rahimi","U. Rajendra Acharya"],"pdf_url":"https://arxiv.org/pdf/2301.00280v2.pdf","comment":"39 pages, 14 figures, 13 tables"},{"id":"http://arxiv.org/abs/2308.10467v1","updated":"2023-08-21T05:08:20Z","published":"2023-08-21T05:08:20Z","title":"Single-User Injection for Invisible Shilling Attack against Recommender\n Systems","summary":" Recommendation systems (RS) are crucial for alleviating the information\noverload problem. Due to its pivotal role in guiding users to make decisions,\nunscrupulous parties are lured to launch attacks against RS to affect the\ndecisions of normal users and gain illegal profits. Among various types of\nattacks, shilling attack is one of the most subsistent and profitable attacks.\nIn shilling attack, an adversarial party injects a number of well-designed fake\nuser profiles into the system to mislead RS so that the attack goal can be\nachieved. Although existing shilling attack methods have achieved promising\nresults, they all adopt the attack paradigm of multi-user injection, where some\nfake user profiles are required. This paper provides the first study of\nshilling attack in an extremely limited scenario: only one fake user profile is\ninjected into the victim RS to launch shilling attacks (i.e., single-user\ninjection). We propose a novel single-user injection method SUI-Attack for\ninvisible shilling attack. SUI-Attack is a graph based attack method that\nmodels shilling attack as a node generation task over the user-item bipartite\ngraph of the victim RS, and it constructs the fake user profile by generating\nuser features and edges that link the fake user to items. Extensive experiments\ndemonstrate that SUI-Attack can achieve promising attack results in single-user\ninjection. In addition to its attack power, SUI-Attack increases the\nstealthiness of shilling attack and reduces the risk of being detected. We\nprovide our implementation at: https://github.com/KDEGroup/SUI-Attack.\n","authors":["Chengzhi Huang","Hui Li"],"pdf_url":"https://arxiv.org/pdf/2308.10467v1.pdf","comment":"CIKM 2023. 10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.09308v2","updated":"2023-08-21T03:43:56Z","published":"2023-08-18T05:05:35Z","title":"Differentiable Retrieval Augmentation via Generative Language Modeling\n for E-commerce Query Intent Classification","summary":" Retrieval augmentation, which enhances downstream models by a knowledge\nretriever and an external corpus instead of by merely increasing the number of\nmodel parameters, has been successfully applied to many natural language\nprocessing (NLP) tasks such as text classification, question answering and so\non. However, existing methods that separately or asynchronously train the\nretriever and downstream model mainly due to the non-differentiability between\nthe two parts, usually lead to degraded performance compared to end-to-end\njoint training. In this paper, we propose Differentiable Retrieval Augmentation\nvia Generative lANguage modeling(Dragan), to address this problem by a novel\ndifferentiable reformulation. We demonstrate the effectiveness of our proposed\nmethod on a challenging NLP task in e-commerce search, namely query intent\nclassification. 
Both the experimental results and ablation study show that the\nproposed method significantly and reasonably improves the state-of-the-art\nbaselines on both offline evaluation and online A/B test.\n","authors":["Chenyu Zhao","Yunjiang Jiang","Yiming Qiu","Han Zhang","Wen-Yun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.09308v2.pdf","comment":"5 pages, 2 figures; accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2307.03206v2","updated":"2023-08-21T03:41:25Z","published":"2023-07-06T04:10:12Z","title":"Optimal Bandwidth Selection for DENCLUE Algorithm","summary":" In modern day industry, clustering algorithms are daily routines of algorithm\nengineers. Although clustering algorithms experienced rapid growth before 2010.\nInnovation related to the research topic has stagnated after deep learning\nbecame the de facto industrial standard for machine learning applications. In\n2007, a density-based clustering algorithm named DENCLUE was invented to solve\nclustering problem for nonlinear data structures. However, its parameter\nselection problem was largely neglected until 2011. In this paper, we propose a\nnew approach to compute the optimal parameters for the DENCLUE algorithm, and\ndiscuss its performance in the experiment section.\n","authors":["Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.03206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02442v2","updated":"2023-08-21T03:11:28Z","published":"2023-08-04T16:14:43Z","title":"Adaptive Preferential Attached kNN Graph with Distribution-Awareness","summary":" Graph-based kNN algorithms have garnered widespread popularity for machine\nlearning tasks due to their simplicity and effectiveness. However, as factual\ndata often inherit complex distributions, the conventional kNN graph's reliance\non a unified k-value can hinder its performance. A crucial factor behind this\nchallenge is the presence of ambiguous samples along decision boundaries that\nare inevitably more prone to incorrect classifications. To address the\nsituation, we propose the Preferential Attached k-Nearest Neighbors Graph\n(paNNG), which adopts distribution-aware adaptive-k into graph construction. By\nincorporating distribution information as a cohesive entity, paNNG can\nsignificantly improve performance on ambiguous samples by \"pulling\" them\ntowards their original classes and hence enhance overall generalization\ncapability. Through rigorous evaluations on diverse datasets, paNNG outperforms\nstate-of-the-art algorithms, showcasing its adaptability and efficacy across\nvarious real-world scenarios.\n","authors":["Shaojie Min","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02442v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01792v2","updated":"2023-08-21T01:07:53Z","published":"2023-06-01T08:10:03Z","title":"Task Relation-aware Continual User Representation Learning","summary":" User modeling, which learns to represent users into a low-dimensional\nrepresentation space based on their past behaviors, got a surge of interest\nfrom the industry for providing personalized services to users. Previous\nefforts in user modeling mainly focus on learning a task-specific user\nrepresentation that is designed for a single task. 
However, since learning\ntask-specific user representations for every task is infeasible, recent studies\nintroduce the concept of universal user representation, which is a more\ngeneralized representation of a user that is relevant to a variety of tasks.\nDespite their effectiveness, existing approaches for learning universal user\nrepresentations are impractical in real-world applications due to the data\nrequirement, catastrophic forgetting and the limited learning capability for\ncontinually added tasks. In this paper, we propose a novel continual user\nrepresentation learning method, called TERACON, whose learning capability is\nnot limited as the number of learned tasks increases while capturing the\nrelationship between the tasks. The main idea is to introduce an embedding for\neach task, i.e., task embedding, which is utilized to generate task-specific\nsoft masks that not only allow the entire model parameters to be updated until\nthe end of training sequence, but also facilitate the relationship between the\ntasks to be captured. Moreover, we introduce a novel knowledge retention module\nwith pseudo-labeling strategy that successfully alleviates the long-standing\nproblem of continual learning, i.e., catastrophic forgetting. Extensive\nexperiments on public and proprietary real-world datasets demonstrate the\nsuperiority and practicality of TERACON. Our code is available at\nhttps://github.com/Sein-Kim/TERACON.\n","authors":["Sein Kim","Namkyeong Lee","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2306.01792v2.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2211.01494v2","updated":"2023-08-21T19:57:41Z","published":"2022-11-02T21:47:11Z","title":"Regression Compatible Listwise Objectives for Calibrated Ranking with\n Binary Relevance","summary":" As Learning-to-Rank (LTR) approaches primarily seek to improve ranking\nquality, their output scores are not scale-calibrated by design. This\nfundamentally limits LTR usage in score-sensitive applications. Though a simple\nmulti-objective approach that combines a regression and a ranking objective can\neffectively learn scale-calibrated scores, we argue that the two objectives are\nnot necessarily compatible, which makes the trade-off less ideal for either of\nthem. In this paper, we propose a practical regression compatible ranking (RCR)\napproach that achieves a better trade-off, where the two ranking and regression\ncomponents are proved to be mutually aligned. Although the same idea applies to\nranking with both binary and graded relevance, we mainly focus on binary labels\nin this paper. We evaluate the proposed approach on several public LTR\nbenchmarks and show that it consistently achieves either best or competitive\nresult in terms of both regression and ranking metrics, and significantly\nimproves the Pareto frontiers in the context of multi-objective optimization.\nFurthermore, we evaluated the proposed approach on YouTube Search and found\nthat it not only improved the ranking quality of the production pCTR model, but\nalso brought gains to the click prediction accuracy. 
The proposed approach has\nbeen successfully deployed in the YouTube production system.\n","authors":["Aijun Bai","Rolf Jagerman","Zhen Qin","Le Yan","Pratyush Kar","Bing-Rong Lin","Xuanhui Wang","Michael Bendersky","Marc Najork"],"pdf_url":"https://arxiv.org/pdf/2211.01494v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.10901v1","updated":"2023-08-21T17:59:32Z","published":"2023-08-21T17:59:32Z","title":"Structured World Models from Human Videos","summary":" We tackle the problem of learning complex, general behaviors directly in the\nreal world. We propose an approach for robots to efficiently learn manipulation\nskills using only a handful of real-world interaction trajectories from many\ndifferent settings. Inspired by the success of learning from large-scale\ndatasets in the fields of computer vision and natural language, our belief is\nthat in order to efficiently learn, a robot must be able to leverage\ninternet-scale, human video data. Humans interact with the world in many\ninteresting ways, which can allow a robot to not only build an understanding of\nuseful actions and affordances but also how these actions affect the world for\nmanipulation. Our approach builds a structured, human-centric action space\ngrounded in visual affordances learned from human videos. Further, we train a\nworld model on human videos and fine-tune on a small amount of robot\ninteraction data without any task supervision. We show that this approach of\naffordance-space world models enables different robots to learn various\nmanipulation skills in complex settings, in under 30 minutes of interaction.\nVideos can be found at https://human-world-model.github.io\n","authors":["Russell Mendonca","Shikhar Bahl","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2308.10901v1.pdf","comment":"RSS 2023. Website at https://human-world-model.github.io"},{"id":"http://arxiv.org/abs/2308.10888v1","updated":"2023-08-21T17:42:33Z","published":"2023-08-21T17:42:33Z","title":"Unlocking Accuracy and Fairness in Differentially Private Image\n Classification","summary":" Privacy-preserving machine learning aims to train models on private data\nwithout leaking sensitive information. Differential privacy (DP) is considered\nthe gold standard framework for privacy-preserving training, as it provides\nformal privacy guarantees. However, compared to their non-private counterparts,\nmodels trained with DP often have significantly reduced accuracy. Private\nclassifiers are also believed to exhibit larger performance disparities across\nsubpopulations, raising fairness concerns. The poor performance of classifiers\ntrained with DP has prevented the widespread adoption of privacy preserving\nmachine learning in industry. Here we show that pre-trained foundation models\nfine-tuned with DP can achieve similar accuracy to non-private classifiers,\neven in the presence of significant distribution shifts between pre-training\ndata and downstream tasks. We achieve private accuracies within a few percent\nof the non-private state of the art across four datasets, including two medical\nimaging benchmarks. Furthermore, our private medical classifiers do not exhibit\nlarger performance disparities across demographic groups than non-private\nmodels. 
This milestone to make DP training a practical and reliable technology\nhas the potential to widely enable machine learning practitioners to train\nsafely on sensitive datasets while protecting individuals' privacy.\n","authors":["Leonard Berrada","Soham De","Judy Hanwen Shen","Jamie Hayes","Robert Stanforth","David Stutz","Pushmeet Kohli","Samuel L. Smith","Borja Balle"],"pdf_url":"https://arxiv.org/pdf/2308.10888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10977v2","updated":"2023-08-21T17:36:36Z","published":"2023-02-17T17:00:12Z","title":"HLSDataset: Open-Source Dataset for ML-Assisted FPGA Design using High\n Level Synthesis","summary":" Machine Learning (ML) has been widely adopted in design exploration using\nhigh level synthesis (HLS) to give a better and faster performance, and\nresource and power estimation at very early stages for FPGA-based design. To\nperform prediction accurately, high-quality and large-volume datasets are\nrequired for training ML models.This paper presents a dataset for ML-assisted\nFPGA design using HLS, called HLSDataset. The dataset is generated from widely\nused HLS C benchmarks including Polybench, Machsuite, CHStone and Rossetta. The\nVerilog samples are generated with a variety of directives including loop\nunroll, loop pipeline and array partition to make sure optimized and realistic\ndesigns are covered. The total number of generated Verilog samples is nearly\n9,000 per FPGA type. To demonstrate the effectiveness of our dataset, we\nundertake case studies to perform power estimation and resource usage\nestimation with ML models trained with our dataset. All the codes and dataset\nare public at the github repo.We believe that HLSDataset can save valuable time\nfor researchers by avoiding the tedious process of running tools, scripting and\nparsing files to generate the dataset, and enable them to spend more time where\nit counts, that is, in training ML models.\n","authors":["Zhigang Wei","Aman Arora","Ruihao Li","Lizy K. John"],"pdf_url":"https://arxiv.org/pdf/2302.10977v2.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2210.00953v3","updated":"2023-08-21T17:23:31Z","published":"2022-10-03T14:11:03Z","title":"Bias and Extrapolation in Markovian Linear Stochastic Approximation with\n Constant Stepsizes","summary":" We consider Linear Stochastic Approximation (LSA) with a constant stepsize\nand Markovian data. Viewing the joint process of the data and LSA iterate as a\ntime-homogeneous Markov chain, we prove its convergence to a unique limiting\nand stationary distribution in Wasserstein distance and establish\nnon-asymptotic, geometric convergence rates. Furthermore, we show that the bias\nvector of this limit admits an infinite series expansion with respect to the\nstepsize. Consequently, the bias is proportional to the stepsize up to higher\norder terms. This result stands in contrast with LSA under i.i.d. data, for\nwhich the bias vanishes. In the reversible chain setting, we provide a general\ncharacterization of the relationship between the bias and the mixing time of\nthe Markovian data, establishing that they are roughly proportional to each\nother.\n While Polyak-Ruppert tail-averaging reduces the variance of the LSA iterates,\nit does not affect the bias. The above characterization allows us to show that\nthe bias can be reduced using Richardson-Romberg extrapolation with $m\\ge 2$\nstepsizes, which eliminates the $m-1$ leading terms in the bias expansion. 
This\nextrapolation scheme leads to an exponentially smaller bias and an improved\nmean squared error, both in theory and empirically. Our results immediately\napply to the Temporal Difference learning algorithm with linear function\napproximation, Markovian data, and constant stepsizes.\n","authors":["Dongyan Huo","Yudong Chen","Qiaomin Xie"],"pdf_url":"https://arxiv.org/pdf/2210.00953v3.pdf","comment":"SIGMETRICS 2023"},{"id":"http://arxiv.org/abs/2308.10874v1","updated":"2023-08-21T17:21:23Z","published":"2023-08-21T17:21:23Z","title":"Analyzing Transformer Dynamics as Movement through Embedding Space","summary":" Transformer language models exhibit intelligent behaviors such as\nunderstanding natural language, recognizing patterns, acquiring knowledge,\nreasoning, planning, reflecting and using tools. This paper explores how their\nunderlying mechanics give rise to intelligent behaviors. We adopt a systems\napproach to analyze Transformers in detail and develop a mathematical framework\nthat frames their dynamics as movement through embedding space. This novel\nperspective provides a principled way of thinking about the problem and reveals\nimportant insights related to the emergence of intelligence:\n 1. At its core the Transformer is a Embedding Space walker, mapping\nintelligent behavior to trajectories in this vector space.\n 2. At each step of the walk, it composes context into a single composite\nvector whose location in Embedding Space defines the next step.\n 3. No learning actually occurs during decoding; in-context learning and\ngeneralization are simply the result of different contexts composing into\ndifferent vectors.\n 4. Ultimately the knowledge, intelligence and skills exhibited by the model\nare embodied in the organization of vectors in Embedding Space rather than in\nspecific neurons or layers. These abilities are properties of this\norganization.\n 5. Attention's contribution boils down to the association-bias it lends to\nvector composition and which influences the aforementioned organization.\nHowever, more investigation is needed to ascertain its significance.\n 6. The entire model is composed from two principal operations: data\nindependent filtering and data dependent aggregation. This generalization\nunifies Transformers with other sequence models and across modalities.\n Building upon this foundation we formalize and test a semantic space theory\nwhich posits that embedding vectors represent semantic concepts and find some\nevidence of its validity.\n","authors":["Sumeet S. Singh"],"pdf_url":"https://arxiv.org/pdf/2308.10874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07282v4","updated":"2023-08-21T17:17:26Z","published":"2022-12-14T15:30:56Z","title":"Low-Variance Forward Gradients using Direct Feedback Alignment and\n Momentum","summary":" Supervised learning in deep neural networks is commonly performed using error\nbackpropagation. However, the sequential propagation of errors during the\nbackward pass limits its scalability and applicability to low-powered\nneuromorphic hardware. Therefore, there is growing interest in finding local\nalternatives to backpropagation. Recently proposed methods based on\nforward-mode automatic differentiation suffer from high variance in large deep\nneural networks, which affects convergence. In this paper, we propose the\nForward Direct Feedback Alignment algorithm that combines Activity-Perturbed\nForward Gradients with Direct Feedback Alignment and momentum. 
We provide both\ntheoretical proofs and empirical evidence that our proposed method achieves\nlower variance than forward gradient techniques. In this way, our approach\nenables faster convergence and better performance when compared to other local\nalternatives to backpropagation and opens a new perspective for the development\nof online learning algorithms compatible with neuromorphic systems.\n","authors":["Florian Bacho","Dominique Chu"],"pdf_url":"https://arxiv.org/pdf/2212.07282v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01538v5","updated":"2023-08-21T17:00:13Z","published":"2023-02-03T04:24:49Z","title":"A deep complementary energy method for solid mechanics using minimum\n complementary energy principle","summary":" In recent years, the rapid advancement of deep learning has significantly\nimpacted various fields, particularly in solving partial differential equations\n(PDEs) in the realm of solid mechanics, benefiting greatly from the remarkable\napproximation capabilities of neural networks. In solving PDEs,\nPhysics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have\ngarnered substantial attention. The principle of minimum potential energy and\ncomplementary energy are two important variational principles in solid\nmechanics. However, the well-known Deep Energy Method (DEM) is based on the\nprinciple of minimum potential energy, but there lacks the important form of\nminimum complementary energy. To bridge this gap, we propose the deep\ncomplementary energy method (DCEM) based on the principle of minimum\ncomplementary energy. The output function of DCEM is the stress function, which\ninherently satisfies the equilibrium equation. We present numerical results\nusing the Prandtl and Airy stress functions, and compare DCEM with existing\nPINNs and DEM algorithms when modeling representative mechanical problems. The\nresults demonstrate that DCEM outperforms DEM in terms of stress accuracy and\nefficiency and has an advantage in dealing with complex displacement boundary\nconditions, which is supported by theoretical analyses and numerical\nsimulations. We extend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy\npartial differential equations. Furthermore, we propose a deep complementary\nenergy operator method (DCEM-O) by combining operator learning with physical\nequations. Initially, we train DCEM-O using high-fidelity numerical results and\nthen incorporate complementary energy. DCEM-P and DCEM-O further enhance the\naccuracy and efficiency of DCEM.\n","authors":["Yizheng Wang","Jia Sun","Timon Rabczuk","Yinghua Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01538v5.pdf","comment":"58 pages, 30 figures"},{"id":"http://arxiv.org/abs/2306.07180v2","updated":"2023-08-21T16:55:58Z","published":"2023-06-12T15:26:44Z","title":"Diffusion Models for Black-Box Optimization","summary":" The goal of offline black-box optimization (BBO) is to optimize an expensive\nblack-box function using a fixed dataset of function evaluations. Prior works\nconsider forward approaches that learn surrogates to the black-box function and\ninverse approaches that directly map function values to corresponding points in\nthe input domain of the black-box function. These approaches are limited by the\nquality of the offline dataset and the difficulty in learning one-to-many\nmappings in high dimensions, respectively. We propose Denoising Diffusion\nOptimization Models (DDOM), a new inverse approach for offline black-box\noptimization based on diffusion models. 
Given an offline dataset, DDOM learns a\nconditional generative model over the domain of the black-box function\nconditioned on the function values. We investigate several design choices in\nDDOM, such as re-weighting the dataset to focus on high function values and the\nuse of classifier-free guidance at test-time to enable generalization to\nfunction values that can even exceed the dataset maxima. Empirically, we\nconduct experiments on the Design-Bench benchmark and show that DDOM achieves\nresults competitive with state-of-the-art baselines.\n","authors":["Siddarth Krishnamoorthy","Satvik Mehul Mashkaria","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2306.07180v2.pdf","comment":"International Conference on Machine Learning 2023"},{"id":"http://arxiv.org/abs/2308.10856v1","updated":"2023-08-21T16:50:59Z","published":"2023-08-21T16:50:59Z","title":"Majorana Demonstrator Data Release for AI/ML Applications","summary":" The enclosed data release consists of a subset of the calibration data from\nthe Majorana Demonstrator experiment. Each Majorana event is accompanied by raw\nGermanium detector waveforms, pulse shape discrimination cuts, and calibrated\nfinal energies, all shared in an HDF5 file format along with relevant metadata.\nThis release is specifically designed to support the training and testing of\nArtificial Intelligence (AI) and Machine Learning (ML) algorithms upon our\ndata. This document is structured as follows. Section I provides an overview of\nthe dataset's content and format; Section II outlines the location of this\ndataset and the method for accessing it; Section III presents the NPML Machine\nLearning Challenge associated with this dataset; Section IV contains a\ndisclaimer from the Majorana collaboration regarding the use of this dataset;\nAppendix A contains technical details of this data release. Please direct\nquestions about the material provided within this release to liaobo77@ucsd.edu\n(A. Li).\n","authors":["I. J. Arnquist","F. T. Avignone III","A. S. Barabash","C. J. Barton","K. H. Bhimani","E. Blalock","B. Bos","M. Busch","M. Buuck","T. S. Caldwell","Y. -D. Chan","C. D. Christofferson","P. -H. Chu","M. L. Clark","C. Cuesta","J. A. Detwiler","Yu. Efremenko","H. Ejiri","S. R. Elliott","N. Fuad","G. K. Giovanetti","M. P. Green","J. Gruszko","I. S. Guinn","V. E. Guiseppe","C. R. Haufe","R. Henning","D. Hervas Aguilar","E. W. Hoppe","A. Hostiuc","M. F. Kidd","I. Kim","R. T. Kouzes","T. E. Lannen V","A. Li","J. M. Lopez-Castano","R. D. Martin","R. Massarczyk","S. J. Meijer","S. Mertens","T. K. Oli","L. S. Paudel","W. Pettus","A. W. P. Poon","B. Quenallata","D. C. Radford","A. L. Reine","K. Rielage","N. W. Ruof","D. C. Schaper","S. J. Schleich","D. Tedeschi","R. L. Varner","S. Vasilyev","S. L. Watkins","J. F. Wilkerson","C. Wiseman","W. Xu","C. -H. Yu","B. X. Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.10856v1.pdf","comment":"Zenodo DOI: 10.5281/zenodo.8257027"},{"id":"http://arxiv.org/abs/2206.10786v4","updated":"2023-08-21T16:49:14Z","published":"2022-06-22T00:54:30Z","title":"Generative Pretraining for Black-Box Optimization","summary":" Many problems in science and engineering involve optimizing an expensive\nblack-box function over a high-dimensional space. For such black-box\noptimization (BBO) problems, we typically assume a small budget for online\nfunction evaluations, but also often have access to a fixed, offline dataset\nfor pretraining. 
Prior approaches seek to utilize the offline data to\napproximate the function or its inverse but are not sufficiently accurate far\nfrom the data distribution. We propose BONET, a generative framework for\npretraining a novel black-box optimizer using offline datasets. In BONET, we\ntrain an autoregressive model on fixed-length trajectories derived from an\noffline dataset. We design a sampling strategy to synthesize trajectories from\noffline data using a simple heuristic of rolling out monotonic transitions from\nlow-fidelity to high-fidelity samples. Empirically, we instantiate BONET using\na causally masked Transformer and evaluate it on Design-Bench, where we rank\nthe best on average, outperforming state-of-the-art baselines.\n","authors":["Siddarth Krishnamoorthy","Satvik Mehul Mashkaria","Aditya Grover"],"pdf_url":"https://arxiv.org/pdf/2206.10786v4.pdf","comment":"International Conference for Machine Learning 2023 NeurIPS Workshop\n for Foundational Models for Decision Making (Oral) 2022"},{"id":"http://arxiv.org/abs/2308.10847v1","updated":"2023-08-21T16:46:36Z","published":"2023-08-21T16:46:36Z","title":"Evaluating quantum generative models via imbalanced data classification\n benchmarks","summary":" A limited set of tools exist for assessing whether the behavior of quantum\nmachine learning models diverges from conventional models, outside of abstract\nor theoretical settings. We present a systematic application of explainable\nartificial intelligence techniques to analyze synthetic data generated from a\nhybrid quantum-classical neural network adapted from twenty different\nreal-world data sets, including solar flares, cardiac arrhythmia, and speech\ndata. Each of these data sets exhibits varying degrees of complexity and class\nimbalance. We benchmark the quantum-generated data relative to state-of-the-art\nmethods for mitigating class imbalance for associated classification tasks. We\nleverage this approach to elucidate the qualities of a problem that make it\nmore or less likely to be amenable to a hybrid quantum-classical generative\nmodel.\n","authors":["Graham R. Enos","Matthew J. Reagor","Eric Hulburd"],"pdf_url":"https://arxiv.org/pdf/2308.10847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10846v1","updated":"2023-08-21T16:44:56Z","published":"2023-08-21T16:44:56Z","title":"Real World Time Series Benchmark Datasets with Distribution Shifts:\n Global Crude Oil Price and Volatility","summary":" The scarcity of task-labeled time-series benchmarks in the financial domain\nhinders progress in continual learning. Addressing this deficit would foster\ninnovation in this area. Therefore, we present COB, Crude Oil Benchmark\ndatasets. COB includes 30 years of asset prices that exhibit significant\ndistribution shifts and optimally generates corresponding task (i.e., regime)\nlabels based on these distribution shifts for the three most important crude\noils in the world. Our contributions include creating real-world benchmark\ndatasets by transforming asset price data into volatility proxies, fitting\nmodels using expectation-maximization (EM), generating contextual task labels\nthat align with real-world events, and providing these labels as well as the\ngeneral algorithm to the public. We show that the inclusion of these task\nlabels universally improves performance on four continual learning algorithms,\nsome state-of-the-art, over multiple forecasting horizons. 
We hope these\nbenchmarks accelerate research in handling distribution shifts in real-world\ndata, especially due to the global importance of the assets considered. We've\nmade the (1) raw price data, (2) task labels generated by our approach, (3) and\ncode for our algorithm available at https://oilpricebenchmarks.github.io.\n","authors":["Pranay Pasula"],"pdf_url":"https://arxiv.org/pdf/2308.10846v1.pdf","comment":"7 pages, 5 figures. Awarded Best Paper Runner Up / Honorable Mention\n and presented as Contributed Talk at IJCAI 2023, the 32nd International Joint\n Conference on Artificial Intelligence (AI4TS)"},{"id":"http://arxiv.org/abs/2306.13592v2","updated":"2023-08-21T16:37:46Z","published":"2023-06-23T16:28:12Z","title":"TACOformer:Token-channel compounded Cross Attention for Multimodal\n Emotion Recognition","summary":" Recently, emotion recognition based on physiological signals has emerged as a\nfield with intensive research. The utilization of multi-modal, multi-channel\nphysiological signals has significantly improved the performance of emotion\nrecognition systems, due to their complementarity. However, effectively\nintegrating emotion-related semantic information from different modalities and\ncapturing inter-modal dependencies remains a challenging issue. Many existing\nmultimodal fusion methods ignore either token-to-token or channel-to-channel\ncorrelations of multichannel signals from different modalities, which limits\nthe classification capability of the models to some extent. In this paper, we\npropose a comprehensive perspective of multimodal fusion that integrates\nchannel-level and token-level cross-modal interactions. Specifically, we\nintroduce a unified cross attention module called Token-chAnnel COmpound (TACO)\nCross Attention to perform multimodal fusion, which simultaneously models\nchannel-level and token-level dependencies between modalities. Additionally, we\npropose a 2D position encoding method to preserve information about the spatial\ndistribution of EEG signal channels, then we use two transformer encoders ahead\nof the fusion module to capture long-term temporal dependencies from the EEG\nsignal and the peripheral physiological signal, respectively.\nSubject-independent experiments on emotional dataset DEAP and Dreamer\ndemonstrate that the proposed model achieves state-of-the-art performance.\n","authors":["Xinda Li"],"pdf_url":"https://arxiv.org/pdf/2306.13592v2.pdf","comment":"Accepted by IJCAI 2023- AI4TS workshop"},{"id":"http://arxiv.org/abs/2205.02654v3","updated":"2023-08-21T16:31:24Z","published":"2022-05-05T13:56:13Z","title":"Polynomial-Time Algorithms for Counting and Sampling Markov Equivalent\n DAGs with Applications","summary":" Counting and sampling directed acyclic graphs from a Markov equivalence class\nare fundamental tasks in graphical causal analysis. In this paper we show that\nthese tasks can be performed in polynomial time, solving a long-standing open\nproblem in this area. Our algorithms are effective and easily implementable. 
As\nwe show in experiments, these breakthroughs make thought-to-be-infeasible\nstrategies in active learning of causal structures and causal effect\nidentification with regard to a Markov equivalence class practically\napplicable.\n","authors":["Marcel Wienöbst","Max Bannach","Maciej Liśkiewicz"],"pdf_url":"https://arxiv.org/pdf/2205.02654v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2012.09679"},{"id":"http://arxiv.org/abs/2308.06058v2","updated":"2023-08-21T16:28:13Z","published":"2023-08-11T10:17:29Z","title":"Adaptive SGD with Polyak stepsize and Line-search: Robust Convergence\n and Variance Reduction","summary":" The recently proposed stochastic Polyak stepsize (SPS) and stochastic\nline-search (SLS) for SGD have shown remarkable effectiveness when training\nover-parameterized models. However, in non-interpolation settings, both\nalgorithms only guarantee convergence to a neighborhood of a solution which may\nresult in a worse output than the initial guess. While artificially decreasing\nthe adaptive stepsize has been proposed to address this issue (Orvieto et al.\n[2022]), this approach results in slower convergence rates for convex and\nover-parameterized models. In this work, we make two contributions: Firstly, we\npropose two new variants of SPS and SLS, called AdaSPS and AdaSLS, which\nguarantee convergence in non-interpolation settings and maintain sub-linear and\nlinear convergence rates for convex and strongly convex functions when training\nover-parameterized models. AdaSLS requires no knowledge of problem-dependent\nparameters, and AdaSPS requires only a lower bound of the optimal function\nvalue as input. Secondly, we equip AdaSPS and AdaSLS with a novel variance\nreduction technique and obtain algorithms that require\n$\\smash{\\widetilde{\\mathcal{O}}}(n+1/\\epsilon)$ gradient evaluations to achieve\nan $\\mathcal{O}(\\epsilon)$-suboptimality for convex functions, which improves\nupon the slower $\\mathcal{O}(1/\\epsilon^2)$ rates of AdaSPS and AdaSLS without\nvariance reduction in the non-interpolation regimes. Moreover, our result\nmatches the fast rates of AdaSVRG but removes the inner-outer-loop structure,\nwhich is easier to implement and analyze. Finally, numerical experiments on\nsynthetic and real datasets validate our theory and demonstrate the\neffectiveness and robustness of our algorithms.\n","authors":["Xiaowen Jiang","Sebastian U. Stich"],"pdf_url":"https://arxiv.org/pdf/2308.06058v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12463v2","updated":"2023-08-21T16:16:59Z","published":"2023-07-24T00:53:46Z","title":"Rethinking Data Distillation: Do Not Overlook Calibration","summary":" Neural networks trained on distilled data often produce over-confident output\nand require correction by calibration methods. Existing calibration methods\nsuch as temperature scaling and mixup work well for networks trained on\noriginal large-scale data. However, we find that these methods fail to\ncalibrate networks trained on data distilled from large source datasets. In\nthis paper, we show that distilled data lead to networks that are not\ncalibratable due to (i) a more concentrated distribution of the maximum logits\nand (ii) the loss of information that is semantically meaningful but unrelated\nto classification tasks. 
To address this problem, we propose Masked Temperature\nScaling (MTS) and Masked Distillation Training (MDT) which mitigate the\nlimitations of distilled data and achieve better calibration results while\nmaintaining the efficiency of dataset distillation.\n","authors":["Dongyao Zhu","Bowen Lei","Jie Zhang","Yanbo Fang","Ruqi Zhang","Yiqun Xie","Dongkuan Xu"],"pdf_url":"https://arxiv.org/pdf/2307.12463v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10821v1","updated":"2023-08-21T16:13:23Z","published":"2023-08-21T16:13:23Z","title":"Neural Networks Optimizations Against Concept and Data Drift in Malware\n Detection","summary":" Despite the promising results of machine learning models in malware\ndetection, they face the problem of concept drift due to malware constant\nevolution. This leads to a decline in performance over time, as the data\ndistribution of the new files differs from the training one, requiring regular\nmodel update. In this work, we propose a model-agnostic protocol to improve a\nbaseline neural network to handle with the drift problem. We show the\nimportance of feature reduction and training with the most recent validation\nset possible, and propose a loss function named Drift-Resilient Binary\nCross-Entropy, an improvement to the classical Binary Cross-Entropy more\neffective against drift. We train our model on the EMBER dataset (2018) and\nevaluate it on a dataset of recent malicious files, collected between 2020 and\n2023. Our improved model shows promising results, detecting 15.2% more malware\nthan a baseline model.\n","authors":["William Maillet","Benjamin Marais"],"pdf_url":"https://arxiv.org/pdf/2308.10821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10808v1","updated":"2023-08-21T15:57:57Z","published":"2023-08-21T15:57:57Z","title":"Graph Neural Bandits","summary":" Contextual bandits algorithms aim to choose the optimal arm with the highest\nreward out of a set of candidates based on the contextual information. Various\nbandit algorithms have been applied to real-world applications due to their\nability of tackling the exploitation-exploration dilemma. Motivated by online\nrecommendation scenarios, in this paper, we propose a framework named Graph\nNeural Bandits (GNB) to leverage the collaborative nature among users empowered\nby graph neural networks (GNNs). Instead of estimating rigid user clusters as\nin existing works, we model the \"fine-grained\" collaborative effects through\nestimated user graphs in terms of exploitation and exploration respectively.\nThen, to refine the recommendation strategy, we utilize separate GNN-based\nmodels on estimated user graphs for exploitation and adaptive exploration.\nTheoretical analysis and experimental results on multiple real data sets in\ncomparison with state-of-the-art baselines are provided to demonstrate the\neffectiveness of our proposed framework.\n","authors":["Yunzhe Qi","Yikun Ban","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2308.10808v1.pdf","comment":"Accepted to SIGKDD 2023"},{"id":"http://arxiv.org/abs/2308.10807v1","updated":"2023-08-21T15:56:05Z","published":"2023-08-21T15:56:05Z","title":"DynED: Dynamic Ensemble Diversification in Data Stream Classification","summary":" Ensemble methods are commonly used in classification due to their remarkable\nperformance. Achieving high accuracy in a data stream environment is a\nchallenging task considering disruptive changes in the data distribution, also\nknown as concept drift. 
A greater diversity of ensemble components is known to\nenhance prediction accuracy in such settings. Despite the diversity of\ncomponents within an ensemble, not all contribute as expected to its overall\nperformance. This necessitates a method for selecting components that exhibit\nhigh performance and diversity. We present a novel ensemble construction and\nmaintenance approach based on MMR (Maximal Marginal Relevance) that dynamically\ncombines the diversity and prediction accuracy of components during the process\nof structuring an ensemble. The experimental results on both four real and 11\nsynthetic datasets demonstrate that the proposed approach (DynED) provides a\nhigher average mean accuracy compared to the five state-of-the-art baselines.\n","authors":["Soheil Abadifard","Sepehr Bakhshi","Sanaz Gheibuni","Fazli Can"],"pdf_url":"https://arxiv.org/pdf/2308.10807v1.pdf","comment":"Proceedings of the 32nd ACM International Conference on Information\n and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United\n Kingdom"},{"id":"http://arxiv.org/abs/2308.10806v1","updated":"2023-08-21T15:53:38Z","published":"2023-08-21T15:53:38Z","title":"Differentiable Frank-Wolfe Optimization Layer","summary":" Differentiable optimization has received a significant amount of attention\ndue to its foundational role in the domain of machine learning based on neural\nnetworks. The existing methods leverages the optimality conditions and implicit\nfunction theorem to obtain the Jacobian matrix of the output, which increases\nthe computational cost and limits the application of differentiable\noptimization. In addition, some non-differentiable constraints lead to more\nchallenges when using prior differentiable optimization layers. This paper\nproposes a differentiable layer, named Differentiable Frank-Wolfe Layer\n(DFWLayer), by rolling out the Frank-Wolfe method, a well-known optimization\nalgorithm which can solve constrained optimization problems without projections\nand Hessian matrix computations, thus leading to a efficient way of dealing\nwith large-scale problems. Theoretically, we establish a bound on the\nsuboptimality gap of the DFWLayer in the context of l1-norm constraints.\nExperimental assessments demonstrate that the DFWLayer not only attains\ncompetitive accuracy in solutions and gradients but also consistently adheres\nto constraints. Moreover, it surpasses the baselines in both forward and\nbackward computational speeds.\n","authors":["Zixuan Liu","Liu Liu","Xueqian Wang","Peilin Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.10806v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10797v1","updated":"2023-08-21T15:42:56Z","published":"2023-08-21T15:42:56Z","title":"Stabilizing Unsupervised Environment Design with a Learned Adversary","summary":" A key challenge in training generally-capable agents is the design of\ntraining tasks that facilitate broad generalization and robustness to\nenvironment variations. This challenge motivates the problem setting of\nUnsupervised Environment Design (UED), whereby a student agent trains on an\nadaptive distribution of tasks proposed by a teacher agent. A pioneering\napproach for UED is PAIRED, which uses reinforcement learning (RL) to train a\nteacher policy to design tasks from scratch, making it possible to directly\ngenerate tasks that are adapted to the agent's current capabilities. Despite\nits strong theoretical backing, PAIRED suffers from a variety of challenges\nthat hinder its practical performance. 
Thus, state-of-the-art methods currently\nrely on curation and mutation rather than generation of new tasks. In this\nwork, we investigate several key shortcomings of PAIRED and propose solutions\nfor each shortcoming. As a result, we make it possible for PAIRED to match or\nexceed state-of-the-art methods, producing robust agents in several established\nchallenging procedurally-generated environments, including a partially-observed\nmaze navigation task and a continuous-control car racing environment. We\nbelieve this work motivates a renewed emphasis on UED methods based on learned\nmodels that directly generate challenging environments, potentially unlocking\nmore open-ended RL training and, as a result, more general agents.\n","authors":["Ishita Mediratta","Minqi Jiang","Jack Parker-Holder","Michael Dennis","Eugene Vinitsky","Tim Rocktäschel"],"pdf_url":"https://arxiv.org/pdf/2308.10797v1.pdf","comment":"CoLLAs 2023 - Oral; Minqi and Jack contributed equally"},{"id":"http://arxiv.org/abs/2308.10794v1","updated":"2023-08-21T15:39:41Z","published":"2023-08-21T15:39:41Z","title":"MGMAE: Motion Guided Masking for Video Masked Autoencoding","summary":" Masked autoencoding has shown excellent performance on self-supervised video\nrepresentation learning. Temporal redundancy has led to a high masking ratio\nand customized masking strategy in VideoMAE. In this paper, we aim to further\nimprove the performance of video masked autoencoding by introducing a motion\nguided masking strategy. Our key insight is that motion is a general and unique\nprior in video, which should be taken into account during masked pre-training.\nOur motion guided masking explicitly incorporates motion information to build\ntemporal consistent masking volume. Based on this masking volume, we can track\nthe unmasked tokens in time and sample a set of temporal consistent cubes from\nvideos. These temporal aligned unmasked tokens will further relieve the\ninformation leakage issue in time and encourage the MGMAE to learn more useful\nstructure information. We implement our MGMAE with an online efficient optical\nflow estimator and backward masking map warping strategy. We perform\nexperiments on the datasets of Something-Something V2 and Kinetics-400,\ndemonstrating the superior performance of our MGMAE to the original VideoMAE.\nIn addition, we provide the visualization analysis to illustrate that our MGMAE\ncan sample temporal consistent cubes in a motion-adaptive manner for more\neffective video pre-training.\n","authors":["Bingkun Huang","Zhiyu Zhao","Guozhen Zhang","Yu Qiao","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10794v1.pdf","comment":"ICCV 2023 camera-ready version"},{"id":"http://arxiv.org/abs/2308.10792v1","updated":"2023-08-21T15:35:16Z","published":"2023-08-21T15:35:16Z","title":"Instruction Tuning for Large Language Models: A Survey","summary":" This paper surveys research works in the quickly advancing field of\ninstruction tuning (IT), a crucial technique to enhance the capabilities and\ncontrollability of large language models (LLMs). Instruction tuning refers to\nthe process of further training LLMs on a dataset consisting of\n\\textsc{(instruction, output)} pairs in a supervised fashion, which bridges the\ngap between the next-word prediction objective of LLMs and the users' objective\nof having LLMs adhere to human instructions. 
In this work, we make a systematic\nreview of the literature, including the general methodology of IT, the\nconstruction of IT datasets, the training of IT models, and applications to\ndifferent modalities, domains and applications, along with an analysis on\naspects that influence the outcome of IT (e.g., generation of instruction\noutputs, size of the instruction dataset, etc). We also review the potential\npitfalls of IT along with criticism against it, along with efforts pointing out\ncurrent deficiencies of existing strategies and suggest some avenues for\nfruitful research.\n","authors":["Shengyu Zhang","Linfeng Dong","Xiaoya Li","Sen Zhang","Xiaofei Sun","Shuhe Wang","Jiwei Li","Runyi Hu","Tianwei Zhang","Fei Wu","Guoyin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10792v1.pdf","comment":"A Survey paper, Pre-print"},{"id":"http://arxiv.org/abs/2012.04841v4","updated":"2023-08-21T15:23:10Z","published":"2020-12-09T03:20:06Z","title":"One-Vote Veto: Semi-Supervised Learning for Low-Shot Glaucoma Diagnosis","summary":" Convolutional neural networks (CNNs) are a promising technique for automated\nglaucoma diagnosis from images of the fundus, and these images are routinely\nacquired as part of an ophthalmic exam. Nevertheless, CNNs typically require a\nlarge amount of well-labeled data for training, which may not be available in\nmany biomedical image classification applications, especially when diseases are\nrare and where labeling by experts is costly. This article makes two\ncontributions to address this issue: (1) It extends the conventional Siamese\nnetwork and introduces a training method for low-shot learning when labeled\ndata are limited and imbalanced, and (2) it introduces a novel semi-supervised\nlearning strategy that uses additional unlabeled training data to achieve\ngreater accuracy. Our proposed multi-task Siamese network (MTSN) can employ any\nbackbone CNN, and we demonstrate with four backbone CNNs that its accuracy with\nlimited training data approaches the accuracy of backbone CNNs trained with a\ndataset that is 50 times larger. We also introduce One-Vote Veto (OVV)\nself-training, a semi-supervised learning strategy that is designed\nspecifically for MTSNs. By taking both self-predictions and contrastive\npredictions of the unlabeled training data into account, OVV self-training\nprovides additional pseudo labels for fine-tuning a pre-trained MTSN. Using a\nlarge (imbalanced) dataset with 66,715 fundus photographs acquired over 15\nyears, extensive experimental results demonstrate the effectiveness of low-shot\nlearning with MTSN and semi-supervised learning with OVV self-training. Three\nadditional, smaller clinical datasets of fundus images acquired under different\nconditions (cameras, instruments, locations, populations) are used to\ndemonstrate the generalizability of the proposed methods.\n","authors":["Rui Fan","Christopher Bowd","Nicole Brye","Mark Christopher","Robert N. Weinreb","David Kriegman","Linda M. Zangwill"],"pdf_url":"https://arxiv.org/pdf/2012.04841v4.pdf","comment":"accepted by IEEE Transactions on Medical Imaging (T-MI). DOI:\n 10.1109/TMI.2023.3307689"},{"id":"http://arxiv.org/abs/2110.15073v4","updated":"2023-08-21T15:20:37Z","published":"2021-10-28T12:47:49Z","title":"MMD Aggregated Two-Sample Test","summary":" We propose two novel nonparametric two-sample kernel tests based on the\nMaximum Mean Discrepancy (MMD). 
First, for a fixed kernel, we construct an MMD\ntest using either permutations or a wild bootstrap, two popular numerical\nprocedures to determine the test threshold. We prove that this test controls\nthe probability of type I error non-asymptotically. Hence, it can be used\nreliably even in settings with small sample sizes as it remains\nwell-calibrated, which differs from previous MMD tests which only guarantee\ncorrect test level asymptotically. When the difference in densities lies in a\nSobolev ball, we prove minimax optimality of our MMD test with a specific\nkernel depending on the smoothness parameter of the Sobolev ball. In practice,\nthis parameter is unknown and, hence, the optimal MMD test with this particular\nkernel cannot be used. To overcome this issue, we construct an aggregated test,\ncalled MMDAgg, which is adaptive to the smoothness parameter. The test power is\nmaximised over the collection of kernels used, without requiring held-out data\nfor kernel selection (which results in a loss of test power), or arbitrary\nkernel choices such as the median heuristic. We prove that MMDAgg still\ncontrols the level non-asymptotically, and achieves the minimax rate over\nSobolev balls, up to an iterated logarithmic term. Our guarantees are not\nrestricted to a specific type of kernel, but hold for any product of\none-dimensional translation invariant characteristic kernels. We provide a\nuser-friendly parameter-free implementation of MMDAgg using an adaptive\ncollection of bandwidths. We demonstrate that MMDAgg significantly outperforms\nalternative state-of-the-art MMD-based two-sample tests on synthetic data\nsatisfying the Sobolev smoothness assumption, and that, on real-world image\ndata, MMDAgg closely matches the power of tests leveraging the use of models\nsuch as neural networks.\n","authors":["Antonin Schrab","Ilmun Kim","Mélisande Albert","Béatrice Laurent","Benjamin Guedj","Arthur Gretton"],"pdf_url":"https://arxiv.org/pdf/2110.15073v4.pdf","comment":"81 pages"},{"id":"http://arxiv.org/abs/2308.10783v1","updated":"2023-08-21T15:19:10Z","published":"2023-08-21T15:19:10Z","title":"Zero- and Few-Shot Prompting with LLMs: A Comparative Study with\n Fine-tuned Models for Bangla Sentiment Analysis","summary":" The rapid expansion of the digital world has propelled sentiment analysis\ninto a critical tool across diverse sectors such as marketing, politics,\ncustomer service, and healthcare. While there have been significant\nadvancements in sentiment analysis for widely spoken languages, low-resource\nlanguages, such as Bangla, remain largely under-researched due to resource\nconstraints. Furthermore, the recent unprecedented performance of Large\nLanguage Models (LLMs) in various applications highlights the need to evaluate\nthem in the context of low-resource languages. In this study, we present a\nsizeable manually annotated dataset encompassing 33,605 Bangla news tweets and\nFacebook comments. We also investigate zero- and few-shot in-context learning\nwith several language models, including Flan-T5, GPT-4, and Bloomz, offering a\ncomparative analysis against fine-tuned models. Our findings suggest that\nmonolingual transformer-based models consistently outperform other models, even\nin zero and few-shot scenarios. To foster continued exploration, we intend to\nmake this dataset and our research tools publicly available to the broader\nresearch community. 
In the spirit of further research, we plan to make this\ndataset and our experimental resources publicly accessible to the wider\nresearch community.\n","authors":["Md. Arid Hasan","Shudipta Das","Afiyat Anjum","Firoj Alam","Anika Anjum","Avijit Sarker","Sheak Rashed Haider Noori"],"pdf_url":"https://arxiv.org/pdf/2308.10783v1.pdf","comment":"Zero-Shot Prompting, Few-Shot Prompting, LLMs, Comparative Study,\n Fine-tuned Models, Bangla, Sentiment Analysis"},{"id":"http://arxiv.org/abs/2308.10782v1","updated":"2023-08-21T15:16:19Z","published":"2023-08-21T15:16:19Z","title":"Sparse Linear Concept Discovery Models","summary":" The recent mass adoption of DNNs, even in safety-critical scenarios, has\nshifted the focus of the research community towards the creation of inherently\nintrepretable models. Concept Bottleneck Models (CBMs) constitute a popular\napproach where hidden layers are tied to human understandable concepts allowing\nfor investigation and correction of the network's decisions. However, CBMs\nusually suffer from: (i) performance degradation and (ii) lower\ninterpretability than intended due to the sheer amount of concepts contributing\nto each decision. In this work, we propose a simple yet highly intuitive\ninterpretable framework based on Contrastive Language Image models and a single\nsparse linear layer. In stark contrast to related approaches, the sparsity in\nour framework is achieved via principled Bayesian arguments by inferring\nconcept presence via a data-driven Bernoulli distribution. As we experimentally\nshow, our framework not only outperforms recent CBM approaches accuracy-wise,\nbut it also yields high per example concept sparsity, facilitating the\nindividual investigation of the emerging concepts.\n","authors":["Konstantinos P. Panousis","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2308.10782v1.pdf","comment":"Accepted @ ICCVW CLVL 2023"},{"id":"http://arxiv.org/abs/2308.10781v1","updated":"2023-08-21T15:14:49Z","published":"2023-08-21T15:14:49Z","title":"Mixed-Integer Projections for Automated Data Correction of EMRs Improve\n Predictions of Sepsis among Hospitalized Patients","summary":" Machine learning (ML) models are increasingly pivotal in automating clinical\ndecisions. Yet, a glaring oversight in prior research has been the lack of\nproper processing of Electronic Medical Record (EMR) data in the clinical\ncontext for errors and outliers. Addressing this oversight, we introduce an\ninnovative projections-based method that seamlessly integrates clinical\nexpertise as domain constraints, generating important meta-data that can be\nused in ML workflows. In particular, by using high-dimensional mixed-integer\nprograms that capture physiological and biological constraints on patient\nvitals and lab values, we can harness the power of mathematical \"projections\"\nfor the EMR data to correct patient data. Consequently, we measure the distance\nof corrected data from the constraints defining a healthy range of patient\ndata, resulting in a unique predictive metric we term as \"trust-scores\". These\nscores provide insight into the patient's health status and significantly boost\nthe performance of ML classifiers in real-life clinical settings. We validate\nthe impact of our framework in the context of early detection of sepsis using\nML. We show an AUROC of 0.865 and a precision of 0.922, that surpasses\nconventional ML models without such projections.\n","authors":["Mehak Arora","Hassan Mortagy","Nathan Dwarshius","Swati Gupta","Andre L. 
Holder","Rishikesan Kamaleswaran"],"pdf_url":"https://arxiv.org/pdf/2308.10781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10779v1","updated":"2023-08-21T15:09:51Z","published":"2023-08-21T15:09:51Z","title":"Spear and Shield: Adversarial Attacks and Defense Methods for\n Model-Based Link Prediction on Continuous-Time Dynamic Graphs","summary":" Real-world graphs are dynamic, constantly evolving with new interactions,\nsuch as financial transactions in financial networks. Temporal Graph Neural\nNetworks (TGNNs) have been developed to effectively capture the evolving\npatterns in dynamic graphs. While these models have demonstrated their\nsuperiority, being widely adopted in various important fields, their\nvulnerabilities against adversarial attacks remain largely unexplored. In this\npaper, we propose T-SPEAR, a simple and effective adversarial attack method for\nlink prediction on continuous-time dynamic graphs, focusing on investigating\nthe vulnerabilities of TGNNs. Specifically, before the training procedure of a\nvictim model, which is a TGNN for link prediction, we inject edge perturbations\nto the data that are unnoticeable in terms of the four constraints we propose,\nand yet effective enough to cause malfunction of the victim model. Moreover, we\npropose a robust training approach T-SHIELD to mitigate the impact of\nadversarial attacks. By using edge filtering and enforcing temporal smoothness\nto node embeddings, we enhance the robustness of the victim model. Our\nexperimental study shows that T-SPEAR significantly degrades the victim model's\nperformance on link prediction tasks, and even more, our attacks are\ntransferable to other TGNNs, which differ from the victim model assumed by the\nattacker. Moreover, we demonstrate that T-SHIELD effectively filters out\nadversarial edges and exhibits robustness against adversarial attacks,\nsurpassing the link prediction performance of the naive TGNN by up to 11.2%\nunder T-SPEAR.\n","authors":["Dongjin Lee","Juho Lee","Kijung Shin"],"pdf_url":"https://arxiv.org/pdf/2308.10779v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10776v1","updated":"2023-08-21T15:06:02Z","published":"2023-08-21T15:06:02Z","title":"A Modular and Adaptive System for Business Email Compromise Detection","summary":" The growing sophistication of Business Email Compromise (BEC) and spear\nphishing attacks poses significant challenges to organizations worldwide. The\ntechniques featured in traditional spam and phishing detection are insufficient\ndue to the tailored nature of modern BEC attacks as they often blend in with\nthe regular benign traffic. Recent advances in machine learning, particularly\nin Natural Language Understanding (NLU), offer a promising avenue for combating\nsuch attacks but in a practical system, due to limitations such as data\navailability, operational costs, verdict explainability requirements or a need\nto robustly evolve the system, it is essential to combine multiple approaches\ntogether. We present CAPE, a comprehensive and efficient system for BEC\ndetection that has been proven in a production environment for a period of over\ntwo years. Rather than being a single model, CAPE is a system that combines\nindependent ML models and algorithms detecting BEC-related behaviors across\nvarious email modalities such as text, images, metadata and the email's\ncommunication context. This decomposition makes CAPE's verdicts naturally\nexplainable. 
In the paper, we describe the design principles and constraints\nbehind its architecture, as well as the challenges of model design, evaluation\nand adapting the system continuously through a Bayesian approach that combines\nlimited data with domain knowledge. Furthermore, we elaborate on several\nspecific behavioral detectors, such as those based on Transformer neural\narchitectures.\n","authors":["Jan Brabec","Filip Šrajer","Radek Starosta","Tomáš Sixta","Marc Dupont","Miloš Lenoch","Jiří Menšík","Florian Becker","Jakub Boros","Tomáš Pop","Pavel Novák"],"pdf_url":"https://arxiv.org/pdf/2308.10776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08107v2","updated":"2023-08-21T15:01:46Z","published":"2023-06-13T19:51:22Z","title":"AutoML in the Age of Large Language Models: Current Challenges, Future\n Opportunities and Risks","summary":" The fields of both Natural Language Processing (NLP) and Automated Machine\nLearning (AutoML) have achieved remarkable results over the past years. In NLP,\nespecially Large Language Models (LLMs) have experienced a rapid series of\nbreakthroughs very recently. We envision that the two fields can radically push\nthe boundaries of each other through tight integration. To showcase this\nvision, we explore the potential of a symbiotic relationship between AutoML and\nLLMs, shedding light on how they can benefit each other. In particular, we\ninvestigate both the opportunities to enhance AutoML approaches with LLMs from\ndifferent perspectives and the challenges of leveraging AutoML to further\nimprove LLMs. To this end, we survey existing work, and we critically assess\nrisks. We strongly believe that the integration of the two fields has the\npotential to disrupt both fields, NLP and AutoML. By highlighting conceivable\nsynergies, but also risks, we aim to foster further exploration at the\nintersection of AutoML and LLMs.\n","authors":["Alexander Tornede","Difan Deng","Theresa Eimer","Joseph Giovanelli","Aditya Mohan","Tim Ruhkopf","Sarah Segel","Daphne Theodorakopoulos","Tanja Tornede","Henning Wachsmuth","Marius Lindauer"],"pdf_url":"https://arxiv.org/pdf/2306.08107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11437v2","updated":"2023-08-21T15:00:03Z","published":"2022-07-23T06:53:57Z","title":"The prediction of the quality of results in Logic Synthesis using\n Transformer and Graph Neural Networks","summary":" In the logic synthesis stage, structure transformations in the synthesis tool\nneed to be combined into optimization sequences and act on the circuit to meet\nthe specified circuit area and delay. However, logic synthesis optimization\nsequences are time-consuming to run, and predicting the quality of the results\n(QoR) against the synthesis optimization sequence for a circuit can help\nengineers find a better optimization sequence faster. In this work, we propose\na deep learning method to predict the QoR of unseen circuit-optimization\nsequences pairs. Specifically, the structure transformations are translated\ninto vectors by embedding methods and advanced natural language processing\n(NLP) technology (Transformer) is used to extract the features of the\noptimization sequences. In addition, to enable the prediction process of the\nmodel to be generalized from circuit to circuit, the graph representation of\nthe circuit is represented as an adjacency matrix and a feature matrix. Graph\nneural networks(GNN) are used to extract the structural features of the\ncircuits. 
For this problem, the Transformer and three typical GNNs are used.\nFurthermore, the Transformer and GNNs are adopted as a joint learning policy\nfor the QoR prediction of the unseen circuit-optimization sequences. The\nmethods resulting from the combination of Transformer and GNNs are benchmarked.\nThe experimental results show that the joint learning of Transformer and\nGraphSage gives the best results. The Mean Absolute Error (MAE) of the\npredicted result is 0.412.\n","authors":["Chenghao Yang","Zhongda Wang","Yinshui Xia","Zhufei Chu"],"pdf_url":"https://arxiv.org/pdf/2207.11437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10767v1","updated":"2023-08-21T14:56:51Z","published":"2023-08-21T14:56:51Z","title":"GBM-based Bregman Proximal Algorithms for Constrained Learning","summary":" As the complexity of learning tasks surges, modern machine learning\nencounters a new constrained learning paradigm characterized by more intricate\nand data-driven function constraints. Prominent applications include\nNeyman-Pearson classification (NPC) and fairness classification, which entail\nspecific risk constraints that render standard projection-based training\nalgorithms unsuitable. Gradient boosting machines (GBMs) are among the most\npopular algorithms for supervised learning; however, they are generally limited\nto unconstrained settings. In this paper, we adapt the GBM for constrained\nlearning tasks within the framework of Bregman proximal algorithms. We\nintroduce a new Bregman primal-dual method with a global optimality guarantee\nwhen the learning objective and constraint functions are convex. In cases of\nnonconvex functions, we demonstrate how our algorithm remains effective under a\nBregman proximal point framework. Distinct from existing constrained learning\nalgorithms, ours possess a unique advantage in their ability to seamlessly\nintegrate with publicly available GBM implementations such as XGBoost (Chen and\nGuestrin, 2016) and LightGBM (Ke et al., 2017), exclusively relying on their\npublic interfaces. We provide substantial experimental evidence to showcase the\neffectiveness of the Bregman algorithm framework. While our primary focus is on\nNPC and fairness ML, our framework holds significant potential for a broader\nrange of constrained learning applications. The source code is currently freely\navailable at\nhttps://github.com/zhenweilin/ConstrainedGBM.\n","authors":["Zhenwei Lin","Qi Deng"],"pdf_url":"https://arxiv.org/pdf/2308.10767v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.12949v3","updated":"2023-08-21T14:44:30Z","published":"2021-04-27T02:39:21Z","title":"Discriminative Bayesian filtering lends momentum to the stochastic\n Newton method for minimizing log-convex functions","summary":" To minimize the average of a set of log-convex functions, the stochastic\nNewton method iteratively updates its estimate using subsampled versions of the\nfull objective's gradient and Hessian. We contextualize this optimization\nproblem as sequential Bayesian inference on a latent state-space model with a\ndiscriminatively-specified observation process. Applying Bayesian filtering\nthen yields a novel optimization algorithm that considers the entire history of\ngradients and Hessians when forming an update. We establish matrix-based\nconditions under which the effect of older observations diminishes over time,\nin a manner analogous to Polyak's heavy ball momentum. 
We illustrate various\naspects of our approach with an example and review other relevant innovations\nfor the stochastic Newton method.\n","authors":["Michael C. Burkhart"],"pdf_url":"https://arxiv.org/pdf/2104.12949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10757v1","updated":"2023-08-21T14:43:42Z","published":"2023-08-21T14:43:42Z","title":"To Whom are You Talking? A Deep Learning Model to Endow Social Robots\n with Addressee Estimation Skills","summary":" Communicating shapes our social world. For a robot to be considered social and\nbe consequently integrated in our social environment, it is fundamental to\nunderstand some of the dynamics that rule human-human communication. In this\nwork, we tackle the problem of Addressee Estimation, the ability to understand\nan utterance's addressee, by interpreting and exploiting non-verbal bodily cues\nfrom the speaker. We do so by implementing a hybrid deep learning model\ncomposed of convolutional layers and LSTM cells taking as input images\nportraying the face of the speaker and 2D vectors of the speaker's body\nposture. Our implementation choices were guided by the aim to develop a model\nthat could be deployed on social robots and be efficient in ecological\nscenarios. We demonstrate that our model is able to solve the Addressee\nEstimation problem in terms of addressee localisation in space, from a robot\nego-centric point of view.\n","authors":["Carlo Mazzola","Marta Romeo","Francesco Rea","Alessandra Sciutti","Angelo Cangelosi"],"pdf_url":"https://arxiv.org/pdf/2308.10757v1.pdf","comment":"Accepted version of a paper published at 2023 International Joint\n Conference on Neural Networks (IJCNN). Please find the published version and\n info to cite the paper at https://doi.org/10.1109/IJCNN54540.2023.10191452 .\n 10 pages, 8 Figures, 3 Tables"},{"id":"http://arxiv.org/abs/2307.02632v2","updated":"2023-08-21T14:14:04Z","published":"2023-07-05T20:04:26Z","title":"Stability of Q-Learning Through Design and Optimism","summary":" Q-learning has become an important part of the reinforcement learning toolkit\nsince its introduction in the dissertation of Chris Watkins in the 1980s. The\npurpose of this paper is in part a tutorial on stochastic approximation and\nQ-learning, providing details regarding the INFORMS APS inaugural Applied\nProbability Trust Plenary Lecture, presented in Nancy France, June 2023.\n The paper also presents new approaches to ensure stability and potentially\naccelerated convergence for these algorithms, and stochastic approximation in\nother settings. Two contributions are entirely new:\n 1. Stability of Q-learning with linear function approximation has been an\nopen topic for research for over three decades. It is shown that with\nappropriate optimistic training in the form of a modified Gibbs policy, there\nexists a solution to the projected Bellman equation, and the algorithm is\nstable (in terms of bounded parameter estimates). Convergence remains one of\nmany open topics for research.\n 2. The new Zap Zero algorithm is designed to approximate the Newton-Raphson\nflow without matrix inversion. It is stable and convergent under mild\nassumptions on the mean flow vector field for the algorithm, and compatible\nstatistical assumption on an underlying Markov chain. 
The algorithm is a\ngeneral approach to stochastic approximation which in particular applies to\nQ-learning with \"oblivious\" training even with non-linear function\napproximation.\n","authors":["Sean Meyn"],"pdf_url":"https://arxiv.org/pdf/2307.02632v2.pdf","comment":"Companion paper to the INFORMS APS inaugural Applied Probability\n Trust Plenary Lecture, presented in Nancy France, June 2023. Slides available\n online, Online, DOI 10.13140/RG.2.2.24897.33127"},{"id":"http://arxiv.org/abs/2308.10741v1","updated":"2023-08-21T14:09:09Z","published":"2023-08-21T14:09:09Z","title":"On the Adversarial Robustness of Multi-Modal Foundation Models","summary":" Multi-modal foundation models combining vision and language models such as\nFlamingo or GPT-4 have recently gained enormous interest. Alignment of\nfoundation models is used to prevent models from providing toxic or harmful\noutput. While malicious users have successfully tried to jailbreak foundation\nmodels, an equally important question is if honest users could be harmed by\nmalicious third-party content. In this paper we show that imperceivable attacks\non images in order to change the caption output of a multi-modal foundation\nmodel can be used by malicious content providers to harm honest users e.g. by\nguiding them to malicious websites or broadcast fake information. This\nindicates that countermeasures to adversarial attacks should be used by any\ndeployed multi-modal foundation model.\n","authors":["Christian Schlarmann","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2308.10741v1.pdf","comment":"ICCV AROW 2023"},{"id":"http://arxiv.org/abs/2308.10740v1","updated":"2023-08-21T14:08:42Z","published":"2023-08-21T14:08:42Z","title":"We Don't Need No Adam, All We Need Is EVE: On The Variance of Dual\n Learning Rate And Beyond","summary":" In the rapidly advancing field of deep learning, optimising deep neural\nnetworks is paramount. This paper introduces a novel method, Enhanced Velocity\nEstimation (EVE), which innovatively applies different learning rates to\ndistinct components of the gradients. By bifurcating the learning rate, EVE\nenables more nuanced control and faster convergence, addressing the challenges\nassociated with traditional single learning rate approaches. Utilising a\nmomentum term that adapts to the learning landscape, the method achieves a more\nefficient navigation of the complex loss surface, resulting in enhanced\nperformance and stability. Extensive experiments demonstrate that EVE\nsignificantly outperforms existing optimisation techniques across various\nbenchmark datasets and architectures.\n","authors":["Afshin Khadangi"],"pdf_url":"https://arxiv.org/pdf/2308.10740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10737v1","updated":"2023-08-21T14:05:21Z","published":"2023-08-21T14:05:21Z","title":"UGSL: A Unified Framework for Benchmarking Graph Structure Learning","summary":" Graph neural networks (GNNs) demonstrate outstanding performance in a broad\nrange of applications. While the majority of GNN applications assume that a\ngraph structure is given, some recent methods substantially expanded the\napplicability of GNNs by showing that they may be effective even when no graph\nstructure is explicitly provided. The GNN parameters and a graph structure are\njointly learned. Previous studies adopt different experimentation setups,\nmaking it difficult to compare their merits. 
In this paper, we propose a\nbenchmarking strategy for graph structure learning using a unified framework.\nOur framework, called Unified Graph Structure Learning (UGSL), reformulates\nexisting models into a single model. We implement a wide range of existing\nmodels in our framework and conduct extensive analyses of the effectiveness of\ndifferent components in the framework. Our results provide a clear and concise\nunderstanding of the different methods in this area as well as their strengths\nand weaknesses. The benchmark code is available at\nhttps://github.com/google-research/google-research/tree/master/ugsl.\n","authors":["Bahare Fatemi","Sami Abu-El-Haija","Anton Tsitsulin","Mehran Kazemi","Dustin Zelle","Neslihan Bulut","Jonathan Halcrow","Bryan Perozzi"],"pdf_url":"https://arxiv.org/pdf/2308.10737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04937v2","updated":"2023-08-21T14:05:05Z","published":"2023-07-10T23:28:03Z","title":"Towards Fair Graph Neural Networks via Graph Counterfactual","summary":" Graph neural networks (GNNs) have shown great ability in representation\nlearning on graphs, facilitating various tasks. Despite their great performance\nin modeling graphs, recent works show that GNNs tend to inherit and amplify the\nbias from training data, causing concerns of the adoption of GNNs in high-stake\nscenarios. Hence, many efforts have been taken for fairness-aware GNNs.\nHowever, most existing fair GNNs learn fair node representations by adopting\nstatistical fairness notions, which may fail to alleviate bias in the presence\nof statistical anomalies. Motivated by causal theory, there are several\nattempts utilizing graph counterfactual fairness to mitigate root causes of\nunfairness. However, these methods suffer from non-realistic counterfactuals\nobtained by perturbation or generation. In this paper, we take a causal view on\nthe fair graph learning problem. Guided by the causal analysis, we propose a novel\nframework CAF, which can select counterfactuals from training data to avoid\nnon-realistic counterfactuals and adopt selected counterfactuals to learn fair\nnode representations for node classification task. Extensive experiments on\nsynthetic and real-world datasets show the effectiveness of CAF. Our code is\navailable at https://github.com/TimeLovercc/CAF-GNN.\n","authors":["Zhimeng Guo","Jialiang Li","Teng Xiao","Yao Ma","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.04937v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10727v1","updated":"2023-08-21T13:50:41Z","published":"2023-08-21T13:50:41Z","title":"Test-time augmentation-based active learning and self-training for\n label-efficient segmentation","summary":" Deep learning techniques depend on large datasets whose annotation is\ntime-consuming. To reduce annotation burden, the self-training (ST) and\nactive-learning (AL) methods have been developed as well as methods that\ncombine them in an iterative fashion. However, it remains unclear when each\nmethod is the most useful, and when it is advantageous to combine them. In this\npaper, we propose a new method that combines ST with AL using Test-Time\nAugmentations (TTA). First, TTA is performed on an initial teacher network.\nThen, cases for annotation are selected based on the lowest estimated Dice\nscore. Cases with high estimated scores are used as soft pseudo-labels for ST.\nThe selected annotated cases are trained with existing annotated cases and ST\ncases with border slices annotations. 
We demonstrate the method on MRI fetal\nbody and placenta segmentation tasks with different data variability\ncharacteristics. Our results indicate that ST is highly effective for both\ntasks, boosting performance for in-distribution (ID) and out-of-distribution\n(OOD) data. However, while self-training improved the performance of\nsingle-sequence fetal body segmentation when combined with AL, it slightly\ndeteriorated performance of multi-sequence placenta segmentation on ID data. AL\nwas helpful for the high variability placenta data, but did not improve upon\nrandom selection for the single-sequence body data. For fetal body segmentation\nsequence transfer, combining AL with ST following ST iteration yielded a Dice\nof 0.961 with only 6 original scans and 2 new sequence scans. Results using\nonly 15 high-variability placenta cases were similar to those using 50 cases.\nCode is available at: https://github.com/Bella31/TTA-quality-estimation-ST-AL\n","authors":["Bella Specktor-Fadida","Anna Levchakov","Dana Schonberger","Liat Ben-Sira","Dafna Ben-Bashat","Leo Joskowicz"],"pdf_url":"https://arxiv.org/pdf/2308.10727v1.pdf","comment":"Accepted to MICCAI MILLanD workshop 2023"},{"id":"http://arxiv.org/abs/2308.10722v1","updated":"2023-08-21T13:47:13Z","published":"2023-08-21T13:47:13Z","title":"Clustered Linear Contextual Bandits with Knapsacks","summary":" In this work, we study clustered contextual bandits where rewards and\nresource consumption are the outcomes of cluster-specific linear models. The\narms are divided in clusters, with the cluster memberships being unknown to an\nalgorithm. Pulling an arm in a time period results in a reward and in\nconsumption for each one of multiple resources, and with the total consumption\nof any resource exceeding a constraint implying the termination of the\nalgorithm. Thus, maximizing the total reward requires learning not only models\nabout the reward and the resource consumption, but also cluster memberships. We\nprovide an algorithm that achieves regret sublinear in the number of time\nperiods, without requiring access to all of the arms. In particular, we show\nthat it suffices to perform clustering only once to a randomly selected subset\nof the arms. To achieve this result, we provide a sophisticated combination of\ntechniques from the literature of econometrics and of bandits with constraints.\n","authors":["Yichuan Deng","Michalis Mamakos","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2308.10722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10721v1","updated":"2023-08-21T13:45:44Z","published":"2023-08-21T13:45:44Z","title":"CoMIX: A Multi-agent Reinforcement Learning Training Architecture for\n Efficient Decentralized Coordination and Independent Decision Making","summary":" Robust coordination skills enable agents to operate cohesively in shared\nenvironments, together towards a common goal and, ideally, individually without\nhindering each other's progress. To this end, this paper presents Coordinated\nQMIX (CoMIX), a novel training framework for decentralized agents that enables\nemergent coordination through flexible policies, allowing at the same time\nindependent decision-making at individual level. CoMIX models selfish and\ncollaborative behavior as incremental steps in each agent's decision process.\nThis allows agents to dynamically adapt their behavior to different situations\nbalancing independence and collaboration. 
Experiments using a variety of\nsimulation environments demonstrate that CoMIX outperforms baselines on\ncollaborative tasks. The results validate our incremental policy approach as\neffective technique for improving coordination in multi-agent systems.\n","authors":["Giovanni Minelli","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2308.10721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10711v1","updated":"2023-08-21T13:24:52Z","published":"2023-08-21T13:24:52Z","title":"Relax and penalize: a new bilevel approach to mixed-binary\n hyperparameter optimization","summary":" In recent years, bilevel approaches have become very popular to efficiently\nestimate high-dimensional hyperparameters of machine learning models. However,\nto date, binary parameters are handled by continuous relaxation and rounding\nstrategies, which could lead to inconsistent solutions. In this context, we\ntackle the challenging optimization of mixed-binary hyperparameters by\nresorting to an equivalent continuous bilevel reformulation based on an\nappropriate penalty term. We propose an algorithmic framework that, under\nsuitable assumptions, is guaranteed to provide mixed-binary solutions.\nMoreover, the generality of the method allows to safely use existing continuous\nbilevel solvers within the proposed framework. We evaluate the performance of\nour approach for a specific machine learning problem, i.e., the estimation of\nthe group-sparsity structure in regression problems. Reported results clearly\nshow that our method outperforms state-of-the-art approaches based on\nrelaxation and rounding\n","authors":["Marianna de Santis","Jordan Frecon","Francesco Rinaldi","Saverio Salzo","Martin Schmidt"],"pdf_url":"https://arxiv.org/pdf/2308.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10708v1","updated":"2023-08-21T13:22:12Z","published":"2023-08-21T13:22:12Z","title":"Measuring the Effect of Causal Disentanglement on the Adversarial\n Robustness of Neural Network Models","summary":" Causal Neural Network models have shown high levels of robustness to\nadversarial attacks as well as an increased capacity for generalisation tasks\nsuch as few-shot learning and rare-context classification compared to\ntraditional Neural Networks. This robustness is argued to stem from the\ndisentanglement of causal and confounder input signals. However, no\nquantitative study has yet measured the level of disentanglement achieved by\nthese types of causal models or assessed how this relates to their adversarial\nrobustness.\n Existing causal disentanglement metrics are not applicable to deterministic\nmodels trained on real-world datasets. We, therefore, utilise metrics of\ncontent/style disentanglement from the field of Computer Vision to measure\ndifferent aspects of the causal disentanglement for four state-of-the-art\ncausal Neural Network models. By re-implementing these models with a common\nResNet18 architecture we are able to fairly measure their adversarial\nrobustness on three standard image classification benchmarking datasets under\nseven common white-box attacks. We find a strong association (r=0.820, p=0.001)\nbetween the degree to which models decorrelate causal and confounder signals\nand their adversarial robustness. Additionally, we find a moderate negative\nassociation between the pixel-level information content of the confounder\nsignal and adversarial robustness (r=-0.597, p=0.040).\n","authors":["Preben M. 
Ness","Dusica Marijan","Sunanda Bose"],"pdf_url":"https://arxiv.org/pdf/2308.10708v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.10704v1","updated":"2023-08-21T13:18:12Z","published":"2023-08-21T13:18:12Z","title":"Sampling From Autoencoders' Latent Space via Quantization And\n Probability Mass Function Concepts","summary":" In this study, we focus on sampling from the latent space of generative\nmodels built upon autoencoders so that the reconstructed samples are lifelike\nimages. To do so, we introduce a novel post-training sampling algorithm rooted\nin the concept of probability mass functions, coupled with a quantization\nprocess. Our proposed algorithm establishes a vicinity around each latent\nvector from the input data and then proceeds to draw samples from these defined\nneighborhoods. This strategic approach ensures that the sampled latent vectors\npredominantly inhabit high-probability regions, which, in turn, can be\neffectively transformed into authentic real-world images. A noteworthy point of\ncomparison for our sampling algorithm is the sampling technique based on\nGaussian mixture models (GMM), owing to its inherent capability to represent\nclusters. Remarkably, we manage to improve the time complexity from the\nprevious $\\mathcal{O}(n\\times d \\times k \\times i)$ associated with GMM\nsampling to a much more streamlined $\\mathcal{O}(n\\times d)$, thereby resulting\nin substantial speedup during runtime. Moreover, our experimental results,\ngauged through the Fr\\'echet inception distance (FID) for image generation,\nunderscore the superior performance of our sampling algorithm across a diverse\nrange of models and datasets. On the MNIST benchmark dataset, our approach\noutperforms GMM sampling by yielding a noteworthy improvement of up to $0.89$\nin FID value. Furthermore, when it comes to generating images of faces and\nocular images, our approach showcases substantial enhancements with FID\nimprovements of $1.69$ and $0.87$ respectively, as compared to GMM sampling, as\nevidenced on the CelebA and MOBIUS datasets. Lastly, we substantiate our\nmethodology's efficacy in estimating latent space distributions in contrast to\nGMM sampling, particularly through the lens of the Wasserstein distance.\n","authors":["Aymene Mohammed Bouayed","Adrian Iaccovelli","David Naccache"],"pdf_url":"https://arxiv.org/pdf/2308.10704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10699v1","updated":"2023-08-21T13:09:31Z","published":"2023-08-21T13:09:31Z","title":"Cost-Efficient Online Decision Making: A Combinatorial Multi-Armed\n Bandit Approach","summary":" Online decision making plays a crucial role in numerous real-world\napplications. In many scenarios, the decision is made based on performing a\nsequence of tests on the incoming data points. However, performing all tests\ncan be expensive and is not always possible. In this paper, we provide a novel\nformulation of the online decision making problem based on combinatorial\nmulti-armed bandits and take the cost of performing tests into account. Based\non this formulation, we provide a new framework for cost-efficient online\ndecision making which can utilize posterior sampling or BayesUCB for\nexploration. 
We provide a rigorous theoretical analysis for our framework and\npresent various experimental results that demonstrate its applicability to\nreal-world problems.\n","authors":["Arman Rahbar","Niklas Åkerblom","Morteza Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2308.10699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10697v1","updated":"2023-08-21T13:05:12Z","published":"2023-08-21T13:05:12Z","title":"Beyond expectations: Residual Dynamic Mode Decomposition and Variance\n for Stochastic Dynamical Systems","summary":" Koopman operators linearize nonlinear dynamical systems, making their\nspectral information of crucial interest. Numerous algorithms have been\ndeveloped to approximate these spectral properties, and Dynamic Mode\nDecomposition (DMD) stands out as the poster child of projection-based methods.\nAlthough the Koopman operator itself is linear, the fact that it acts in an\ninfinite-dimensional space of observables poses various challenges. These\ninclude spurious modes, essential spectra, and the verification of Koopman mode\ndecompositions. While recent work has addressed these challenges for\ndeterministic systems, there remains a notable gap in verified DMD methods\ntailored for stochastic systems, where the Koopman operator measures the\nexpectation of observables. We show that it is necessary to go beyond\nexpectations to address these issues. By incorporating variance into the\nKoopman framework, we address these challenges. Through an additional DMD-type\nmatrix, we approximate the sum of a squared residual and a variance term, each\nof which can be approximated individually using batched snapshot data. This\nallows verified computation of the spectral properties of stochastic Koopman\noperators, controlling the projection error. We also introduce the concept of\nvariance-pseudospectra to gauge statistical coherency. Finally, we present a\nsuite of convergence results for the spectral quantities of stochastic Koopman\noperators. Our study concludes with practical applications using both simulated\nand experimental data. In neural recordings from awake mice, we demonstrate how\nvariance-pseudospectra can reveal physiologically significant information\nunavailable to standard expectation-based dynamical models.\n","authors":["Matthew J. Colbrook","Qin Li","Ryan V. Raut","Alex Townsend"],"pdf_url":"https://arxiv.org/pdf/2308.10697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09100v3","updated":"2023-08-21T12:53:09Z","published":"2022-12-18T14:56:22Z","title":"SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input\n Images","summary":" Recent advances in Neural Radiance Fields (NeRFs) treat the problem of novel\nview synthesis as Sparse Radiance Field (SRF) optimization using sparse voxels\nfor efficient and fast rendering (plenoxels,InstantNGP). In order to leverage\nmachine learning and adoption of SRFs as a 3D representation, we present SPARF,\na large-scale ShapeNet-based synthetic dataset for novel view synthesis\nconsisting of $\\sim$ 17 million images rendered from nearly 40,000 shapes at\nhigh resolution (400 X 400 pixels). The dataset is orders of magnitude larger\nthan existing synthetic datasets for novel view synthesis and includes more\nthan one million 3D-optimized radiance fields with multiple voxel resolutions.\nFurthermore, we propose a novel pipeline (SuRFNet) that learns to generate\nsparse voxel radiance fields from only few views. 
This is done by using the\ndensely collected SPARF dataset and 3D sparse convolutions. SuRFNet employs\npartial SRFs from few/one images and a specialized SRF loss to learn to\ngenerate high-quality sparse voxel radiance fields that can be rendered from\nnovel views. Our approach achieves state-of-the-art results in the task of\nunconstrained novel view synthesis based on few views on ShapeNet as compared\nto recent baselines. The SPARF dataset is made public with the code and models\non the project website https://abdullahamdi.com/sparf/ .\n","authors":["Abdullah Hamdi","Bernard Ghanem","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2212.09100v3.pdf","comment":"published at ICCV 2023 workshop proceedings"},{"id":"http://arxiv.org/abs/2208.10533v3","updated":"2023-08-21T12:49:03Z","published":"2022-08-22T18:26:43Z","title":"Some Supervision Required: Incorporating Oracle Policies in\n Reinforcement Learning via Epistemic Uncertainty Metrics","summary":" An inherent problem of reinforcement learning is performing exploration of an\nenvironment through random actions, of which a large portion can be\nunproductive. Instead, exploration can be improved by initializing the learning\npolicy with an existing (previously learned or hard-coded) oracle policy,\noffline data, or demonstrations. In the case of using an oracle policy, it can\nbe unclear how best to incorporate the oracle policy's experience into the\nlearning policy in a way that maximizes learning sample efficiency. In this\npaper, we propose a method termed Critic Confidence Guided Exploration (CCGE)\nfor incorporating such an oracle policy into standard actor-critic\nreinforcement learning algorithms. More specifically, CCGE takes in the oracle\npolicy's actions as suggestions and incorporates this information into the\nlearning scheme when uncertainty is high, while ignoring it when the\nuncertainty is low. CCGE is agnostic to methods of estimating uncertainty, and\nwe show that it is equally effective with two different techniques.\nEmpirically, we evaluate the effect of CCGE on various benchmark reinforcement\nlearning tasks, and show that this idea can lead to improved sample efficiency\nand final performance. Furthermore, when evaluated on sparse reward\nenvironments, CCGE is able to perform competitively against adjacent algorithms\nthat also leverage an oracle policy. Our experiments show that it is possible\nto utilize uncertainty as a heuristic to guide exploration using an oracle in\nreinforcement learning. We expect that this will inspire more research in this\ndirection, where various heuristics are used to determine the direction of\nguidance provided to learning.\n","authors":["Jun Jet Tai","Jordan K. Terry","Mauro S. Innocente","James Brusey","Nadjim Horri"],"pdf_url":"https://arxiv.org/pdf/2208.10533v3.pdf","comment":"Under review at TMLR"},{"id":"http://arxiv.org/abs/2303.05101v3","updated":"2023-08-21T12:33:30Z","published":"2023-03-09T08:20:28Z","title":"Scalable Stochastic Gradient Riemannian Langevin Dynamics in\n Non-Diagonal Metrics","summary":" Stochastic-gradient sampling methods are often used to perform Bayesian\ninference on neural networks. It has been observed that the methods in which\nnotions of differential geometry are included tend to have better performances,\nwith the Riemannian metric improving posterior exploration by accounting for\nthe local curvature. However, the existing methods often resort to simple\ndiagonal metrics to remain computationally efficient. 
This loses some of the\ngains. We propose two non-diagonal metrics that can be used in\nstochastic-gradient samplers to improve convergence and exploration but have\nonly a minor computational overhead over diagonal metrics. We show that for\nfully connected neural networks (NNs) with sparsity-inducing priors and\nconvolutional NNs with correlated priors, using these metrics can provide\nimprovements. For some other choices the posterior is sufficiently easy also\nfor the simpler metrics.\n","authors":["Hanlin Yu","Marcelo Hartmann","Bernardo Williams","Arto Klami"],"pdf_url":"https://arxiv.org/pdf/2303.05101v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07870v2","updated":"2023-08-21T12:28:34Z","published":"2023-07-15T19:04:33Z","title":"Large Language Models as Superpositions of Cultural Perspectives","summary":" Large Language Models (LLMs) are often misleadingly recognized as having a\npersonality or a set of values. We argue that an LLM can be seen as a\nsuperposition of perspectives with different values and personality traits.\nLLMs exhibit context-dependent values and personality traits that change based\non the induced perspective (as opposed to humans, who tend to have more\ncoherent values and personality traits across contexts). We introduce the\nconcept of perspective controllability, which refers to a model's affordance to\nadopt various perspectives with differing values and personality traits. In our\nexperiments, we use questionnaires from psychology (PVQ, VSM, IPIP) to study\nhow exhibited values and personality traits change based on different\nperspectives. Through qualitative experiments, we show that LLMs express\ndifferent values when those are (implicitly or explicitly) implied in the\nprompt, and that LLMs express different values even when those are not\nobviously implied (demonstrating their context-dependent nature). We then\nconduct quantitative experiments to study the controllability of different\nmodels (GPT-4, GPT-3.5, OpenAssistant, StableVicuna, StableLM), the\neffectiveness of various methods for inducing perspectives, and the smoothness\nof the models' drivability. We conclude by examining the broader implications\nof our work and outline a variety of associated scientific questions. The\nproject website is available at\nhttps://sites.google.com/view/llm-superpositions .\n","authors":["Grgur Kovač","Masataka Sawayama","Rémy Portelas","Cédric Colas","Peter Ford Dominey","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2307.07870v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.10675v1","updated":"2023-08-21T12:17:40Z","published":"2023-08-21T12:17:40Z","title":"An Improved Best-of-both-worlds Algorithm for Bandits with Delayed\n Feedback","summary":" We propose a new best-of-both-worlds algorithm for bandits with variably\ndelayed feedback. The algorithm improves on prior work by Masoudian et al.\n[2022] by eliminating the need in prior knowledge of the maximal delay\n$d_{\\mathrm{max}}$ and providing tighter regret bounds in both regimes. The\nalgorithm and its regret bounds are based on counts of outstanding observations\n(a quantity that is observed at action time) rather than delays or the maximal\ndelay (quantities that are only observed when feedback arrives). 
One major\ncontribution is a novel control of distribution drift, which is based on biased\nloss estimators and skipping of observations with excessively large delays.\nAnother major contribution is demonstrating that the complexity of\nbest-of-both-worlds bandits with delayed feedback is characterized by the\ncumulative count of outstanding observations after skipping of observations\nwith excessively large delays, rather than the delays or the maximal delay.\n","authors":["Saeed Masoudian","Julian Zimmert","Yevgeny Seldin"],"pdf_url":"https://arxiv.org/pdf/2308.10675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10664v1","updated":"2023-08-21T12:02:54Z","published":"2023-08-21T12:02:54Z","title":"A Safe Deep Reinforcement Learning Approach for Energy Efficient\n Federated Learning in Wireless Communication Networks","summary":" Progressing towards a new era of Artificial Intelligence (AI) - enabled\nwireless networks, concerns regarding the environmental impact of AI have been\nraised both in industry and academia. Federated Learning (FL) has emerged as a\nkey privacy preserving decentralized AI technique. Despite efforts currently\nbeing made in FL, its environmental impact is still an open problem. Targeting\nthe minimization of the overall energy consumption of an FL process, we propose\nthe orchestration of computational and communication resources of the involved\ndevices to minimize the total energy required, while guaranteeing a certain\nperformance of the model. To this end, we propose a Soft Actor Critic Deep\nReinforcement Learning (DRL) solution, where a penalty function is introduced\nduring training, penalizing the strategies that violate the constraints of the\nenvironment, and ensuring a safe RL process. A device level synchronization\nmethod, along with a computationally cost effective FL environment are\nproposed, with the goal of further reducing the energy consumption and\ncommunication overhead. Evaluation results show the effectiveness of the\nproposed scheme compared to four state-of-the-art baseline solutions in both\nstatic and dynamic environments, achieving a decrease of up to 94% in the total\nenergy consumption.\n","authors":["Nikolaos Koursioumpas","Lina Magoula","Nikolaos Petropouleas","Alexandros-Ioannis Thanopoulos","Theodora Panagea","Nancy Alonistioti","M. A. Gutierrez-Estevez","Ramin Khalili"],"pdf_url":"https://arxiv.org/pdf/2308.10664v1.pdf","comment":"27 Pages Single Column, 6 Figures, Submitted for possible publication\n in the IEEE Transactions on Green Communications and Networking (TGCN). arXiv\n admin note: text overlap with arXiv:2306.14237"},{"id":"http://arxiv.org/abs/2308.10656v1","updated":"2023-08-21T11:48:34Z","published":"2023-08-21T11:48:34Z","title":"Practical Parallel Algorithms for Non-Monotone Submodular Maximization","summary":" Submodular maximization has found extensive applications in various domains\nwithin the field of artificial intelligence, including but not limited to\nmachine learning, computer vision, and natural language processing. With the\nincreasing size of datasets in these domains, there is a pressing need to\ndevelop efficient and parallelizable algorithms for submodular maximization.\nOne measure of the parallelizability of a submodular maximization algorithm is\nits adaptive complexity, which indicates the number of sequential rounds where\na polynomial number of queries to the objective function can be executed in\nparallel. 
In this paper, we study the problem of non-monotone submodular\nmaximization subject to a knapsack constraint, and propose the first\ncombinatorial algorithm achieving an $(8+\\epsilon)$-approximation under\n$\\mathcal{O}(\\log n)$ adaptive complexity, which is \\textit{optimal} up to a\nfactor of $\\mathcal{O}(\\log\\log n)$. Moreover, we also propose the first\nalgorithm with both provable approximation ratio and sublinear adaptive\ncomplexity for the problem of non-monotone submodular maximization subject to a\n$k$-system constraint. As a by-product, we show that our two algorithms can\nalso be applied to the special case of submodular maximization subject to a\ncardinality constraint, and achieve performance bounds comparable with those of\nstate-of-the-art algorithms. Finally, the effectiveness of our approach is\ndemonstrated by extensive experiments on real-world applications.\n","authors":["Shuang Cui","Kai Han","Jing Tang","He Huang","Xueying Li","Aakas Zhiyuli","Hanxiao Li"],"pdf_url":"https://arxiv.org/pdf/2308.10656v1.pdf","comment":"Part of the contribution appears in AAAI-2023"},{"id":"http://arxiv.org/abs/2305.03515v3","updated":"2023-08-21T11:47:08Z","published":"2023-05-05T13:24:35Z","title":"GradTree: Learning Axis-Aligned Decision Trees with Gradient Descent","summary":" Decision Trees (DTs) are commonly used for many machine learning tasks due to\ntheir high degree of interpretability. However, learning a DT from data is a\ndifficult optimization problem, as it is non-convex and non-differentiable.\nTherefore, common approaches learn DTs using a greedy growth algorithm that\nminimizes the impurity locally at each internal node. Unfortunately, this\ngreedy procedure can lead to inaccurate trees. In this paper, we present a\nnovel approach for learning hard, axis-aligned DTs with gradient descent. The\nproposed method uses backpropagation with a straight-through operator on a\ndense DT representation, to jointly optimize all tree parameters. Our approach\noutperforms existing methods on binary classification benchmarks and achieves\ncompetitive results for multi-class tasks.\n","authors":["Sascha Marton","Stefan Lüdtke","Christian Bartelt","Heiner Stuckenschmidt"],"pdf_url":"https://arxiv.org/pdf/2305.03515v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10650v1","updated":"2023-08-21T11:42:16Z","published":"2023-08-21T11:42:16Z","title":"Deep Evidential Learning for Bayesian Quantile Regression","summary":" It is desirable to have accurate uncertainty estimation from a single\ndeterministic forward-pass model, as traditional methods for uncertainty\nquantification are computationally expensive. However, this is difficult\nbecause single forward-pass models do not sample weights during inference and\noften make assumptions about the target distribution, such as assuming it is\nGaussian. This can be restrictive in regression tasks, where the mean and\nstandard deviation are inadequate to model the target distribution accurately.\nThis paper proposes a deep Bayesian quantile regression model that can estimate\nthe quantiles of a continuous target distribution without the Gaussian\nassumption. The proposed method is based on evidential learning, which allows\nthe model to capture aleatoric and epistemic uncertainty with a single\ndeterministic forward-pass model. This makes the method efficient and scalable\nto large models and datasets. 
We demonstrate that the proposed method achieves\ncalibrated uncertainties on non-Gaussian distributions, disentanglement of\naleatoric and epistemic uncertainty, and robustness to out-of-distribution\nsamples.\n","authors":["Frederik Boe Hüttel","Filipe Rodrigues","Francisco Câmara Pereira"],"pdf_url":"https://arxiv.org/pdf/2308.10650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10649v1","updated":"2023-08-21T11:36:54Z","published":"2023-08-21T11:36:54Z","title":"Reinforcement Learning Based Sensor Optimization for Bio-markers","summary":" Radio frequency (RF) biosensors, in particular those based on inter-digitated\ncapacitors (IDCs), are pivotal in areas like biomedical diagnosis, remote\nsensing, and wireless communication. Despite their advantages of low cost and\neasy fabrication, their sensitivity can be hindered by design imperfections,\nenvironmental factors, and circuit noise. This paper investigates enhancing the\nsensitivity of IDC-based RF sensors using novel reinforcement learning based\nBinary Particle Swarm Optimization (RLBPSO), and it is compared to Ant Colony\nOptimization (ACO), and other state-of-the-art methods. By focusing on\noptimizing design parameters like electrode design and finger width, the\nproposed study found notable improvements in sensor sensitivity. The proposed\nRLBPSO method shows best optimized design for various frequency ranges when\ncompared to current state-of-the-art methods.\n","authors":["Sajal Khandelwal","Pawan Kumar","Syed Azeemuddin"],"pdf_url":"https://arxiv.org/pdf/2308.10649v1.pdf","comment":"7 pages, 4 tables"},{"id":"http://arxiv.org/abs/2308.10644v1","updated":"2023-08-21T11:31:15Z","published":"2023-08-21T11:31:15Z","title":"Faster Training of Neural ODEs Using Gauß-Legendre Quadrature","summary":" Neural ODEs demonstrate strong performance in generative and time-series\nmodelling. However, training them via the adjoint method is slow compared to\ndiscrete models due to the requirement of numerically solving ODEs. To speed\nneural ODEs up, a common approach is to regularise the solutions. However, this\napproach may affect the expressivity of the model; when the trajectory itself\nmatters, this is particularly important. In this paper, we propose an\nalternative way to speed up the training of neural ODEs. The key idea is to\nspeed up the adjoint method by using Gau{\\ss}-Legendre quadrature to solve\nintegrals faster than ODE-based methods while remaining memory efficient. We\nalso extend the idea to training SDEs using the Wong-Zakai theorem, by training\na corresponding ODE and transferring the parameters. Our approach leads to\nfaster training of neural ODEs, especially for large models. It also presents a\nnew way to train SDE-based models.\n","authors":["Alexander Norcliffe","Marc Peter Deisenroth"],"pdf_url":"https://arxiv.org/pdf/2308.10644v1.pdf","comment":"32 pages, 16 figures, 7 tables, published in TMLR 2023"},{"id":"http://arxiv.org/abs/2308.10638v1","updated":"2023-08-21T11:23:25Z","published":"2023-08-21T11:23:25Z","title":"SCULPT: Shape-Conditioned Unpaired Learning of Pose-dependent Clothed\n and Textured Human Meshes","summary":" We present SCULPT, a novel 3D generative model for clothed and textured 3D\nmeshes of humans. Specifically, we devise a deep neural network that learns to\nrepresent the geometry and appearance distribution of clothed human bodies.\nTraining such a model is challenging, as datasets of textured 3D meshes for\nhumans are limited in size and accessibility. 
Our key observation is that there\nexist medium-sized 3D scan datasets like CAPE, as well as large-scale 2D image\ndatasets of clothed humans and multiple appearances can be mapped to a single\ngeometry. To effectively learn from the two data modalities, we propose an\nunpaired learning procedure for pose-dependent clothed and textured human\nmeshes. Specifically, we learn a pose-dependent geometry space from 3D scan\ndata. We represent this as per vertex displacements w.r.t. the SMPL model.\nNext, we train a geometry conditioned texture generator in an unsupervised way\nusing the 2D image data. We use intermediate activations of the learned\ngeometry model to condition our texture generator. To alleviate entanglement\nbetween pose and clothing type, and pose and clothing appearance, we condition\nboth the texture and geometry generators with attribute labels such as clothing\ntypes for the geometry, and clothing colors for the texture generator. We\nautomatically generated these conditioning labels for the 2D images based on\nthe visual question answering model BLIP and CLIP. We validate our method on\nthe SCULPT dataset, and compare to state-of-the-art 3D generative models for\nclothed human bodies. We will release the codebase for research purposes.\n","authors":["Soubhik Sanyal","Partha Ghosh","Jinlong Yang","Michael J. Black","Justus Thies","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2308.10638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14963v2","updated":"2023-08-21T11:21:13Z","published":"2022-11-27T23:17:08Z","title":"Neural Architecture for Online Ensemble Continual Learning","summary":" Continual learning with an increasing number of classes is a challenging\ntask. The difficulty rises when each example is presented exactly once, which\nrequires the model to learn online. Recent methods with classic parameter\noptimization procedures have been shown to struggle in such setups or have\nlimitations like non-differentiable components or memory buffers. For this\nreason, we present the fully differentiable ensemble method that allows us to\nefficiently train an ensemble of neural networks in the end-to-end regime. The\nproposed technique achieves SOTA results without a memory buffer and clearly\noutperforms the reference methods. The conducted experiments have also shown a\nsignificant increase in the performance for small ensembles, which demonstrates\nthe capability of obtaining relatively high classification accuracy with a\nreduced number of classifiers.\n","authors":["Mateusz Wójcik","Witold Kościukiewicz","Tomasz Kajdanowicz","Adam Gonczarek"],"pdf_url":"https://arxiv.org/pdf/2211.14963v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10632v1","updated":"2023-08-21T11:07:27Z","published":"2023-08-21T11:07:27Z","title":"Foundation Model-oriented Robustness: Robust Image Model Evaluation with\n Pretrained Models","summary":" Machine learning has demonstrated remarkable performance over finite\ndatasets, yet whether the scores over the fixed benchmarks can sufficiently\nindicate the model's performance in the real world is still in discussion. In\nreality, an ideal robust model will probably behave similarly to the oracle\n(e.g., the human users), thus a good evaluation protocol is probably to\nevaluate the models' behaviors in comparison to the oracle. In this paper, we\nintroduce a new robustness measurement that directly measures the image\nclassification model's performance compared with a surrogate oracle (i.e., a\nfoundation model). 
Besides, we design a simple method that can accomplish the\nevaluation beyond the scope of the benchmarks. Our method extends the image\ndatasets with new samples that are sufficiently perturbed to be distinct from\nthe ones in the original sets, but are still bounded within the same\nimage-label structure the original test image represents, constrained by a\nfoundation model pretrained with a large amount of samples. As a result, our\nnew method will offer us a new way to evaluate the models' robustness\nperformance, free of limitations of fixed benchmarks or constrained\nperturbations, although scoped by the power of the oracle. In addition to the\nevaluation results, we also leverage our generated data to understand the\nbehaviors of the model and our new evaluation strategies.\n","authors":["Peiyan Zhang","Haoyang Liu","Chaozhuo Li","Xing Xie","Sunghun Kim","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10630v1","updated":"2023-08-21T11:03:04Z","published":"2023-08-21T11:03:04Z","title":"A Homogenization Approach for Gradient-Dominated Stochastic Optimization","summary":" Gradient dominance property is a condition weaker than strong convexity, yet\nit sufficiently ensures global convergence for first-order methods even in\nnon-convex optimization. This property finds application in various machine\nlearning domains, including matrix decomposition, linear neural networks, and\npolicy-based reinforcement learning (RL). In this paper, we study the\nstochastic homogeneous second-order descent method (SHSODM) for\ngradient-dominated optimization with $\\alpha \\in [1, 2]$ based on a recently\nproposed homogenization approach. Theoretically, we show that SHSODM achieves a\nsample complexity of $O(\\epsilon^{-7/(2 \\alpha) +1})$ for $\\alpha \\in [1, 3/2)$\nand $\\tilde{O}(\\epsilon^{-2/\\alpha})$ for $\\alpha \\in [3/2, 2]$. We further\nprovide a SHSODM with a variance reduction technique enjoying an improved\nsample complexity of $O( \\epsilon ^{-( 7-3\\alpha ) /( 2\\alpha )})$ for $\\alpha\n\\in [1,3/2)$. Our results match the state-of-the-art sample complexity bounds\nfor stochastic gradient-dominated optimization without \\emph{cubic\nregularization}. Since the homogenization approach only relies on solving\nextremal eigenvector problems instead of Newton-type systems, our methods gain\nthe advantage of cheaper iterations and robustness in ill-conditioned problems.\nNumerical experiments on several RL tasks demonstrate the efficiency of SHSODM\ncompared to other off-the-shelf methods.\n","authors":["Jiyuan Tan","Chenyu Xue","Chuwen Zhang","Qi Deng","Dongdong Ge","Yinyu Ye"],"pdf_url":"https://arxiv.org/pdf/2308.10630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09687v2","updated":"2023-08-21T10:51:42Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. 
This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Michal Podstawski","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10623v1","updated":"2023-08-21T10:47:52Z","published":"2023-08-21T10:47:52Z","title":"GaitPT: Skeletons Are All You Need For Gait Recognition","summary":" The analysis of patterns of walking is an important area of research that has\nnumerous applications in security, healthcare, sports and human-computer\ninteraction. Lately, walking patterns have been regarded as a unique\nfingerprinting method for automatic person identification at a distance. In\nthis work, we propose a novel gait recognition architecture called Gait Pyramid\nTransformer (GaitPT) that leverages pose estimation skeletons to capture unique\nwalking patterns, without relying on appearance information. GaitPT adopts a\nhierarchical transformer architecture that effectively extracts both spatial\nand temporal features of movement in an anatomically consistent manner, guided\nby the structure of the human skeleton. Our results show that GaitPT achieves\nstate-of-the-art performance compared to other skeleton-based gait recognition\nworks, in both controlled and in-the-wild scenarios. GaitPT obtains 82.6%\naverage accuracy on CASIA-B, surpassing other works by a margin of 6%.\nMoreover, it obtains 52.16% Rank-1 accuracy on GREW, outperforming both\nskeleton-based and appearance-based approaches.\n","authors":["Andy Catruna","Adrian Cosma","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2308.10623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10622v1","updated":"2023-08-21T10:40:21Z","published":"2023-08-21T10:40:21Z","title":"Weighting by Tying: A New Approach to Weighted Rank Correlation","summary":" Measures of rank correlation are commonly used in statistics to capture the\ndegree of concordance between two orderings of the same set of items. Standard\nmeasures like Kendall's tau and Spearman's rho coefficient put equal emphasis\non each position of a ranking. Yet, motivated by applications in which some of\nthe positions (typically those on the top) are more important than others, a\nfew weighted variants of these measures have been proposed. Most of these\ngeneralizations fail to meet desirable formal properties, however. Besides,\nthey are often quite inflexible in the sense of committing to a fixed weighing\nscheme. In this paper, we propose a weighted rank correlation measure on the\nbasis of fuzzy order relations. Our measure, called scaled gamma, is related to\nGoodman and Kruskal's gamma rank correlation. It is parametrized by a fuzzy\nequivalence relation on the rank positions, which in turn is specified\nconveniently by a so-called scaling function. 
This approach combines soundness\nwith flexibility: it has a sound formal foundation and allows for weighing rank\npositions in a flexible way.\n","authors":["Sascha Henzgen","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2308.10622v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2308.10619v1","updated":"2023-08-21T10:35:32Z","published":"2023-08-21T10:35:32Z","title":"centroIDA: Cross-Domain Class Discrepancy Minimization Based on\n Accumulative Class-Centroids for Imbalanced Domain Adaptation","summary":" Unsupervised Domain Adaptation (UDA) approaches address the covariate shift\nproblem by minimizing the distribution discrepancy between the source and\ntarget domains, assuming that the label distribution is invariant across\ndomains. However, in the imbalanced domain adaptation (IDA) scenario, covariate\nand long-tailed label shifts both exist across domains. To tackle the IDA\nproblem, some current research focus on minimizing the distribution\ndiscrepancies of each corresponding class between source and target domains.\nSuch methods rely much on the reliable pseudo labels' selection and the feature\ndistributions estimation for target domain, and the minority classes with\nlimited numbers makes the estimations more uncertainty, which influences the\nmodel's performance. In this paper, we propose a cross-domain class discrepancy\nminimization method based on accumulative class-centroids for IDA (centroIDA).\nFirstly, class-based re-sampling strategy is used to obtain an unbiased\nclassifier on source domain. Secondly, the accumulative class-centroids\nalignment loss is proposed for iterative class-centroids alignment across\ndomains. Finally, class-wise feature alignment loss is used to optimize the\nfeature representation for a robust classification boundary. A series of\nexperiments have proved that our method outperforms other SOTA methods on IDA\nproblem, especially with the increasing degree of label shift.\n","authors":["Xiaona Sun","Zhenyu Wu","Yichen Liu","Saier Hu","Zhiqiang Zhan","Yang Ji"],"pdf_url":"https://arxiv.org/pdf/2308.10619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14751v2","updated":"2023-08-21T10:25:43Z","published":"2023-07-27T10:19:10Z","title":"FLARE: Fingerprinting Deep Reinforcement Learning Agents using Universal\n Adversarial Masks","summary":" We propose FLARE, the first fingerprinting mechanism to verify whether a\nsuspected Deep Reinforcement Learning (DRL) policy is an illegitimate copy of\nanother (victim) policy. We first show that it is possible to find\nnon-transferable, universal adversarial masks, i.e., perturbations, to generate\nadversarial examples that can successfully transfer from a victim policy to its\nmodified versions but not to independently trained policies. FLARE employs\nthese masks as fingerprints to verify the true ownership of stolen DRL policies\nby measuring an action agreement value over states perturbed via such masks.\nOur empirical evaluations show that FLARE is effective (100% action agreement\non stolen copies) and does not falsely accuse independent policies (no false\npositives). FLARE is also robust to model modification attacks and cannot be\neasily evaded by more informed adversaries without negatively impacting agent\nperformance. We also show that not all universal adversarial masks are suitable\ncandidates for fingerprints due to the inherent characteristics of DRL\npolicies. 
The spatio-temporal dynamics of DRL problems and sequential\ndecision-making process make characterizing the decision boundary of DRL\npolicies more difficult, as well as searching for universal masks that capture\nthe geometry of it.\n","authors":["Buse G. A. Tekgul","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2307.14751v2.pdf","comment":"Will appear in the proceedings of ACSAC 2023; 13 pages, 5 figures, 7\n tables"},{"id":"http://arxiv.org/abs/2308.08998v2","updated":"2023-08-21T10:23:42Z","published":"2023-08-17T14:12:48Z","title":"Reinforced Self-Training (ReST) for Language Modeling","summary":" Reinforcement learning from human feedback (RLHF) can improve the quality of\nlarge language model's (LLM) outputs by aligning them with human preferences.\nWe propose a simple algorithm for aligning LLMs with human preferences inspired\nby growing batch reinforcement learning (RL), which we call Reinforced\nSelf-Training (ReST). Given an initial LLM policy, ReST produces a dataset by\ngenerating samples from the policy, which are then used to improve the LLM\npolicy using offline RL algorithms. ReST is more efficient than typical online\nRLHF methods because the training dataset is produced offline, which allows\ndata reuse. While ReST is a general approach applicable to all generative\nlearning settings, we focus on its application to machine translation. Our\nresults show that ReST can substantially improve translation quality, as\nmeasured by automated metrics and human evaluation on machine translation\nbenchmarks in a compute and sample-efficient manner.\n","authors":["Caglar Gulcehre","Tom Le Paine","Srivatsan Srinivasan","Ksenia Konyushkova","Lotte Weerts","Abhishek Sharma","Aditya Siddhant","Alex Ahern","Miaosen Wang","Chenjie Gu","Wolfgang Macherey","Arnaud Doucet","Orhan Firat","Nando de Freitas"],"pdf_url":"https://arxiv.org/pdf/2308.08998v2.pdf","comment":"23 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.10609v1","updated":"2023-08-21T10:18:26Z","published":"2023-08-21T10:18:26Z","title":"ST-RAP: A Spatio-Temporal Framework for Real Estate Appraisal","summary":" In this paper, we introduce ST-RAP, a novel Spatio-Temporal framework for\nReal estate APpraisal. ST-RAP employs a hierarchical architecture with a\nheterogeneous graph neural network to encapsulate temporal dynamics and spatial\nrelationships simultaneously. Through comprehensive experiments on a\nlarge-scale real estate dataset, ST-RAP outperforms previous methods,\ndemonstrating the significant benefits of integrating spatial and temporal\naspects in real estate appraisal. Our code and dataset are available at\nhttps://github.com/dojeon-ai/STRAP.\n","authors":["Hojoon Lee","Hawon Jeong","Byungkun Lee","Kyungyup Lee","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.10609v1.pdf","comment":"Accepted to CIKM'23"},{"id":"http://arxiv.org/abs/2308.10608v1","updated":"2023-08-21T10:16:52Z","published":"2023-08-21T10:16:52Z","title":"FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly","summary":" While text-3D editing has made significant strides in leveraging score\ndistillation sampling, emerging approaches still fall short in delivering\nseparable, precise and consistent outcomes that are vital to content creation.\nIn response, we introduce FocalDreamer, a framework that merges base shape with\neditable parts according to text prompts for fine-grained editing within\ndesired regions. 
Specifically, equipped with geometry union and dual-path\nrendering, FocalDreamer assembles independent 3D parts into a complete object,\ntailored for convenient instance reuse and part-wise control. We propose\ngeometric focal loss and style consistency regularization, which encourage\nfocal fusion and congruent overall appearance. Furthermore, FocalDreamer\ngenerates high-fidelity geometry and PBR textures which are compatible with\nwidely-used graphics engines. Extensive experiments have highlighted the\nsuperior editing capabilities of FocalDreamer in both quantitative and\nqualitative evaluations.\n","authors":["Yuhan Li","Yishun Dou","Yue Shi","Yu Lei","Xuanhong Chen","Yi Zhang","Peng Zhou","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2308.10608v1.pdf","comment":"Project website: https://fantasia3d.github.io"},{"id":"http://arxiv.org/abs/2308.06300v2","updated":"2023-08-21T10:08:11Z","published":"2023-08-11T07:57:12Z","title":"Automatic Classification of Blood Cell Images Using Convolutional Neural\n Network","summary":" Human blood primarily comprises plasma, red blood cells, white blood cells,\nand platelets. It plays a vital role in transporting nutrients to different\norgans, where it stores essential health-related data about the human body.\nBlood cells are utilized to defend the body against diverse infections,\nincluding fungi, viruses, and bacteria. Hence, blood analysis can help\nphysicians assess an individual's physiological condition. Blood cells have\nbeen sub-classified into eight groups: Neutrophils, eosinophils, basophils,\nlymphocytes, monocytes, immature granulocytes (promyelocytes, myelocytes, and\nmetamyelocytes), erythroblasts, and platelets or thrombocytes on the basis of\ntheir nucleus, shape, and cytoplasm. Traditionally, pathologists and\nhematologists in laboratories have examined these blood cells using a\nmicroscope before manually classifying them. The manual approach is slower and\nmore prone to human error. Therefore, it is essential to automate this process.\nIn our paper, transfer learning with CNN pre-trained models. VGG16, VGG19,\nResNet-50, ResNet-101, ResNet-152, InceptionV3, MobileNetV2, and DenseNet-20\napplied to the PBC dataset's normal DIB. The overall accuracy achieved with\nthese models lies between 91.375 and 94.72%. Hence, inspired by these\npre-trained architectures, a model has been proposed to automatically classify\nthe ten types of blood cells with increased accuracy. A novel CNN-based\nframework has been presented to improve accuracy. The proposed CNN model has\nbeen tested on the PBC dataset normal DIB. The outcomes of the experiments\ndemonstrate that our CNN-based framework designed for blood cell classification\nattains an accuracy of 99.91% on the PBC dataset. Our proposed convolutional\nneural network model performs competitively when compared to earlier results\nreported in the literature.\n","authors":["Rabia Asghar","Sanjay Kumar","Paul Hynds","Abeera Mahfooz"],"pdf_url":"https://arxiv.org/pdf/2308.06300v2.pdf","comment":"15"},{"id":"http://arxiv.org/abs/2308.10606v1","updated":"2023-08-21T10:06:15Z","published":"2023-08-21T10:06:15Z","title":"Analyzing Complex Systems with Cascades Using Continuous-Time Bayesian\n Networks","summary":" Interacting systems of events may exhibit cascading behavior where events\ntend to be temporally clustered. While the cascades themselves may be obvious\nfrom the data, it is important to understand which states of the system trigger\nthem. 
For this purpose, we propose a modeling framework based on\ncontinuous-time Bayesian networks (CTBNs) to analyze cascading behavior in\ncomplex systems. This framework allows us to describe how events propagate\nthrough the system and to identify likely sentry states, that is, system states\nthat may lead to imminent cascading behavior. Moreover, CTBNs have a simple\ngraphical representation and provide interpretable outputs, both of which are\nimportant when communicating with domain experts. We also develop new methods\nfor knowledge extraction from CTBNs and we apply the proposed methodology to a\ndata set of alarms in a large industrial system.\n","authors":["Alessandro Bregoli","Karin Rathsman","Marco Scutari","Fabio Stella","Søren Wengel Mogensen"],"pdf_url":"https://arxiv.org/pdf/2308.10606v1.pdf","comment":"21 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.10604v1","updated":"2023-08-21T10:00:59Z","published":"2023-08-21T10:00:59Z","title":"BackTrack: Robust template update via Backward Tracking of candidate\n template","summary":" Variations of target appearance such as deformations, illumination variance,\nocclusion, etc., are the major challenges of visual object tracking that\nnegatively impact the performance of a tracker. An effective method to tackle\nthese challenges is template update, which updates the template to reflect the\nchange of appearance in the target object during tracking. However, with\ntemplate updates, inadequate quality of new templates or inappropriate timing\nof updates may induce a model drift problem, which severely degrades the\ntracking performance. Here, we propose BackTrack, a robust and reliable method\nto quantify the confidence of the candidate template by backward tracking it on\nthe past frames. Based on the confidence score of candidates from BackTrack, we\ncan update the template with a reliable candidate at the right time while\nrejecting unreliable candidates. BackTrack is a generic template update scheme\nand is applicable to any template-based trackers. Extensive experiments on\nvarious tracking benchmarks verify the effectiveness of BackTrack over existing\ntemplate update algorithms, as it achieves SOTA performance on various tracking\nbenchmarks.\n","authors":["Dongwook Lee","Wonjun Choi","Seohyung Lee","ByungIn Yoo","Eunho Yang","Seongju Hwang"],"pdf_url":"https://arxiv.org/pdf/2308.10604v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.10601v1","updated":"2023-08-21T09:58:13Z","published":"2023-08-21T09:58:13Z","title":"Improving the Transferability of Adversarial Examples with Arbitrary\n Style Transfer","summary":" Deep neural networks are vulnerable to adversarial examples crafted by\napplying human-imperceptible perturbations on clean inputs. Although many\nattack methods can achieve high success rates in the white-box setting, they\nalso exhibit weak transferability in the black-box setting. Recently, various\nmethods have been proposed to improve adversarial transferability, in which the\ninput transformation is one of the most effective methods. In this work, we\nnotice that existing input transformation-based works mainly adopt the\ntransformed data in the same domain for augmentation. Inspired by domain\ngeneralization, we aim to further improve the transferability using the data\naugmented from different domains. Specifically, a style transfer network can\nalter the distribution of low-level visual features in an image while\npreserving semantic content for humans. 
Hence, we propose a novel attack method\nnamed Style Transfer Method (STM) that utilizes a proposed arbitrary style\ntransfer network to transform the images into different domains. To avoid\ninconsistent semantic information of stylized images for the classification\nnetwork, we fine-tune the style transfer network and mix up the generated\nimages added by random noise with the original images to maintain semantic\nconsistency and boost input diversity. Extensive experimental results on the\nImageNet-compatible dataset show that our proposed method can significantly\nimprove the adversarial transferability on either normally trained models or\nadversarially trained models than state-of-the-art input transformation-based\nattacks. Code is available at: https://github.com/Zhijin-Ge/STM.\n","authors":["Zhijin Ge","Fanhua Shang","Hongying Liu","Yuanyuan Liu","Liang Wan","Wei Feng","Xiaosen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10601v1.pdf","comment":"10 pages, 2 figures, accepted by the 31st ACM International\n Conference on Multimedia (MM '23)"},{"id":"http://arxiv.org/abs/2308.10599v1","updated":"2023-08-21T09:56:48Z","published":"2023-08-21T09:56:48Z","title":"Image-free Classifier Injection for Zero-Shot Classification","summary":" Zero-shot learning models achieve remarkable results on image classification\nfor samples from classes that were not seen during training. However, such\nmodels must be trained from scratch with specialised methods: therefore, access\nto a training dataset is required when the need for zero-shot classification\narises. In this paper, we aim to equip pre-trained models with zero-shot\nclassification capabilities without the use of image data. We achieve this with\nour proposed Image-free Classifier Injection with Semantics (ICIS) that injects\nclassifiers for new, unseen classes into pre-trained classification models in a\npost-hoc fashion without relying on image data. Instead, the existing\nclassifier weights and simple class-wise descriptors, such as class names or\nattributes, are used. ICIS has two encoder-decoder networks that learn to\nreconstruct classifier weights from descriptors (and vice versa), exploiting\n(cross-)reconstruction and cosine losses to regularise the decoding process.\nNotably, ICIS can be cheaply trained and applied directly on top of pre-trained\nclassification models. Experiments on benchmark ZSL datasets show that ICIS\nproduces unseen classifier weights that achieve strong (generalised) zero-shot\nclassification performance. Code is available at\nhttps://github.com/ExplainableML/ImageFreeZSL .\n","authors":["Anders Christensen","Massimiliano Mancini","A. Sophia Koepke","Ole Winther","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2308.10599v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2106.04923v2","updated":"2023-08-21T09:56:25Z","published":"2021-06-09T09:08:51Z","title":"Learning Domain Invariant Representations by Joint Wasserstein Distance\n Minimization","summary":" Domain shifts in the training data are common in practical applications of\nmachine learning; they occur for instance when the data is coming from\ndifferent sources. Ideally, a ML model should work well independently of these\nshifts, for example, by learning a domain-invariant representation. However,\ncommon ML losses do not give strong guarantees on how consistently the ML model\nperforms for different domains, in particular, whether the model performs well\non a domain at the expense of its performance on another domain. 
In this paper,\nwe build new theoretical foundations for this problem, by contributing a set of\nmathematical relations between classical losses for supervised ML and the\nWasserstein distance in joint space (i.e. representation and output space). We\nshow that classification or regression losses, when combined with a GAN-type\ndiscriminator between domains, form an upper-bound to the true Wasserstein\ndistance between domains. This implies a more invariant representation and also\nmore stable prediction performance across domains. Theoretical results are\ncorroborated empirically on several image datasets. Our proposed approach\nsystematically produces the highest minimum classification accuracy across\ndomains, and the most invariant representation.\n","authors":["Léo Andeol","Yusei Kawakami","Yuichiro Wada","Takafumi Kanamori","Klaus-Robert Müller","Grégoire Montavon"],"pdf_url":"https://arxiv.org/pdf/2106.04923v2.pdf","comment":"23 pages + supplement"},{"id":"http://arxiv.org/abs/2308.06296v2","updated":"2023-08-21T09:35:24Z","published":"2023-08-11T06:32:25Z","title":"Classification of White Blood Cells Using Machine and Deep Learning\n Models: A Systematic Review","summary":" Machine learning (ML) and deep learning (DL) models have been employed to\nsignificantly improve analyses of medical imagery, with these approaches used\nto enhance the accuracy of prediction and classification. Model predictions and\nclassifications assist diagnoses of various cancers and tumors. This review\npresents an in-depth analysis of modern techniques applied within the domain of\nmedical image analysis for white blood cell classification. The methodologies\nthat use blood smear images, magnetic resonance imaging (MRI), X-rays, and\nsimilar medical imaging domains are identified and discussed, with a detailed\nanalysis of ML/DL techniques applied to the classification of white blood cells\n(WBCs) representing the primary focus of the review. The data utilized in this\nresearch has been extracted from a collection of 136 primary papers that were\npublished between the years 2006 and 2023. The most widely used techniques and\nbest-performing white blood cell classification methods are identified. While\nthe use of ML and DL for white blood cell classification has concurrently\nincreased and improved in recent year, significant challenges remain - 1)\nAvailability of appropriate datasets remain the primary challenge, and may be\nresolved using data augmentation techniques. 2) Medical training of researchers\nis recommended to improve current understanding of white blood cell structure\nand subsequent selection of appropriate classification models. 3) Advanced DL\nnetworks including Generative Adversarial Networks, R-CNN, Fast R-CNN, and\nfaster R-CNN will likely be increasingly employed to supplement or replace\ncurrent techniques.\n","authors":["Rabia Asghar","Sanjay Kumar","Paul Hynds","Arslan Shaukat"],"pdf_url":"https://arxiv.org/pdf/2308.06296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11284v2","updated":"2023-08-21T09:35:20Z","published":"2023-05-18T20:04:55Z","title":"Federated learning for secure development of AI models for Parkinson's\n disease detection using speech from different languages","summary":" Parkinson's disease (PD) is a neurological disorder impacting a person's\nspeech. Among automatic PD assessment methods, deep learning models have gained\nparticular interest. 
Recently, the community has explored cross-pathology and\ncross-language models which can improve diagnostic accuracy even further.\nHowever, strict patient data privacy regulations largely prevent institutions\nfrom sharing patient speech data with each other. In this paper, we employ\nfederated learning (FL) for PD detection using speech signals from 3 real-world\nlanguage corpora of German, Spanish, and Czech, each from a separate\ninstitution. Our results indicate that the FL model outperforms all the local\nmodels in terms of diagnostic accuracy, while not performing very differently\nfrom the model based on centrally combined training sets, with the advantage of\nnot requiring any data sharing among collaborators. This will simplify\ninter-institutional collaborations, resulting in enhancement of patient\noutcomes.\n","authors":["Soroosh Tayebi Arasteh","Cristian David Rios-Urrego","Elmar Noeth","Andreas Maier","Seung Hee Yang","Jan Rusz","Juan Rafael Orozco-Arroyave"],"pdf_url":"https://arxiv.org/pdf/2305.11284v2.pdf","comment":"INTERSPEECH 2023, pp. 5003--5007, Dublin, Ireland"},{"id":"http://arxiv.org/abs/2308.10584v1","updated":"2023-08-21T09:33:20Z","published":"2023-08-21T09:33:20Z","title":"RADIANCE: Radio-Frequency Adversarial Deep-learning Inference for\n Automated Network Coverage Estimation","summary":" Radio-frequency coverage maps (RF maps) are extensively utilized in wireless\nnetworks for capacity planning, placement of access points and base stations,\nlocalization, and coverage estimation. Conducting site surveys to obtain RF\nmaps is labor-intensive and sometimes not feasible. In this paper, we propose\nradio-frequency adversarial deep-learning inference for automated network\ncoverage estimation (RADIANCE), a generative adversarial network (GAN) based\napproach for synthesizing RF maps in indoor scenarios. RADIANCE utilizes a\nsemantic map, a high-level representation of the indoor environment to encode\nspatial relationships and attributes of objects within the environment and\nguide the RF map generation process. We introduce a new gradient-based loss\nfunction that computes the magnitude and direction of change in received signal\nstrength (RSS) values from a point within the environment. RADIANCE\nincorporates this loss function along with the antenna pattern to capture\nsignal propagation within a given indoor configuration and generate new\npatterns under new configuration, antenna (beam) pattern, and center frequency.\nExtensive simulations are conducted to compare RADIANCE with ray-tracing\nsimulations of RF maps. Our results show that RADIANCE achieves a mean average\nerror (MAE) of 0.09, root-mean-squared error (RMSE) of 0.29, peak\nsignal-to-noise ratio (PSNR) of 10.78, and multi-scale structural similarity\nindex (MS-SSIM) of 0.80.\n","authors":["Sopan Sarkar","Mohammad Hossein Manshaei","Marwan Krunz"],"pdf_url":"https://arxiv.org/pdf/2308.10584v1.pdf","comment":"6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2203.16475v3","updated":"2023-08-21T09:30:58Z","published":"2022-03-30T17:12:18Z","title":"Concept Evolution in Deep Learning Training: A Unified Interpretation\n Framework and Discoveries","summary":" We present ConceptEvo, a unified interpretation framework for deep neural\nnetworks (DNNs) that reveals the inception and evolution of learned concepts\nduring training. 
Our work addresses a critical gap in DNN interpretation\nresearch, as existing methods primarily focus on post-training interpretation.\nConceptEvo introduces two novel technical contributions: (1) an algorithm that\ngenerates a unified semantic space, enabling side-by-side comparison of\ndifferent models during training, and (2) an algorithm that discovers and\nquantifies important concept evolutions for class predictions. Through a\nlarge-scale human evaluation and quantitative experiments, we demonstrate that\nConceptEvo successfully identifies concept evolutions across different models,\nwhich are not only comprehensible to humans but also crucial for class\npredictions. ConceptEvo is applicable to both modern DNN architectures, such as\nConvNeXt, and classic DNNs, such as VGGs and InceptionV3.\n","authors":["Haekyu Park","Seongmin Lee","Benjamin Hoover","Austin P. Wright","Omar Shaikh","Rahul Duggal","Nilaksh Das","Judy Hoffman","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2203.16475v3.pdf","comment":"Accepted at CIKM'23"},{"id":"http://arxiv.org/abs/2301.09627v3","updated":"2023-08-21T09:11:40Z","published":"2023-01-23T18:57:16Z","title":"The Impossibility of Parallelizing Boosting","summary":" The aim of boosting is to convert a sequence of weak learners into a strong\nlearner. At their heart, these methods are fully sequential. In this paper, we\ninvestigate the possibility of parallelizing boosting. Our main contribution is\na strong negative result, implying that significant parallelization of boosting\nrequires an exponential blow-up in the total computing resources needed for\ntraining.\n","authors":["Amin Karbasi","Kasper Green Larsen"],"pdf_url":"https://arxiv.org/pdf/2301.09627v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10571v1","updated":"2023-08-21T09:04:54Z","published":"2023-08-21T09:04:54Z","title":"Overcoming Overconfidence for Active Learning","summary":" It is not an exaggeration to say that the recent progress in artificial\nintelligence technology depends on large-scale and high-quality data.\nSimultaneously, a prevalent issue exists everywhere: the budget for data\nlabeling is constrained. Active learning is a prominent approach for addressing\nthis issue, where valuable data for labeling is selected through a model and\nutilized to iteratively adjust the model. However, due to the limited amount of\ndata in each iteration, the model is vulnerable to bias; thus, it is more\nlikely to yield overconfident predictions. In this paper, we present two novel\nmethods to address the problem of overconfidence that arises in the active\nlearning scenario. The first is an augmentation strategy named\nCross-Mix-and-Mix (CMaM), which aims to calibrate the model by expanding the\nlimited training distribution. The second is a selection strategy named Ranked\nMargin Sampling (RankedMS), which prevents choosing data that leads to overly\nconfident predictions. 
Through various experiments and analyses, we are able to\ndemonstrate that our proposals facilitate efficient data selection by\nalleviating overconfidence, even though they are readily applicable.\n","authors":["Yujin Hwang","Won Jo","Juyoung Hong","Yukyung Choi"],"pdf_url":"https://arxiv.org/pdf/2308.10571v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14640v2","updated":"2023-08-21T08:37:37Z","published":"2023-02-28T15:18:42Z","title":"Meta-Learning with Adaptive Weighted Loss for Imbalanced Cold-Start\n Recommendation","summary":" Sequential recommenders have made great strides in capturing a user's\npreferences. Nevertheless, the cold-start recommendation remains a fundamental\nchallenge as they typically involve limited user-item interactions for\npersonalization. Recently, gradient-based meta-learning approaches have emerged\nin the sequential recommendation field due to their fast adaptation and\neasy-to-integrate abilities. The meta-learning algorithms formulate the\ncold-start recommendation as a few-shot learning problem, where each user is\nrepresented as a task to be adapted. While meta-learning algorithms generally\nassume that task-wise samples are evenly distributed over classes or values,\nuser-item interactions in real-world applications do not conform to such a\ndistribution (e.g., watching favorite videos multiple times, leaving only\npositive ratings without any negative ones). Consequently, imbalanced user\nfeedback, which accounts for the majority of task training data, may dominate\nthe user adaptation process and prevent meta-learning algorithms from learning\nmeaningful meta-knowledge for personalized recommendations. To alleviate this\nlimitation, we propose a novel sequential recommendation framework based on\ngradient-based meta-learning that captures the imbalanced rating distribution\nof each user and computes adaptive loss for user-specific learning. Our work is\nthe first to tackle the impact of imbalanced ratings in cold-start sequential\nrecommendation scenarios. Through extensive experiments conducted on real-world\ndatasets, we demonstrate the effectiveness of our framework.\n","authors":["Minchang Kim","Yongjin Yang","Jung Hyun Ryu","Taesup Kim"],"pdf_url":"https://arxiv.org/pdf/2302.14640v2.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2212.01953v3","updated":"2023-08-21T08:18:53Z","published":"2022-12-04T23:40:14Z","title":"Context-aware multi-head self-attentional neural network model for next\n location prediction","summary":" Accurate activity location prediction is a crucial component of many mobility\napplications and is particularly required to develop personalized, sustainable\ntransportation systems. Despite the widespread adoption of deep learning\nmodels, next location prediction models lack a comprehensive discussion and\nintegration of mobility-related spatio-temporal contexts. Here, we utilize a\nmulti-head self-attentional (MHSA) neural network that learns location\ntransition patterns from historical location visits, their visit time and\nactivity duration, as well as their surrounding land use functions, to infer an\nindividual's next location. Specifically, we adopt point-of-interest data and\nlatent Dirichlet allocation for representing locations' land use contexts at\nmultiple spatial scales, generate embedding vectors of the spatio-temporal\nfeatures, and learn to predict the next location with an MHSA network. 
Through\nexperiments on two large-scale GNSS tracking datasets, we demonstrate that the\nproposed model outperforms other state-of-the-art prediction models, and reveal\nthe contribution of various spatio-temporal contexts to the model's\nperformance. Moreover, we find that the model trained on population data\nachieves higher prediction performance with fewer parameters than\nindividual-level models due to learning from collective movement patterns. We\nalso reveal mobility conducted in the recent past and one week before has the\nlargest influence on the current prediction, showing that learning from a\nsubset of the historical mobility is sufficient to obtain an accurate location\nprediction result. We believe that the proposed model is vital for\ncontext-aware mobility prediction. The gained insights will help to understand\nlocation prediction models and promote their implementation for mobility\napplications.\n","authors":["Ye Hong","Yatao Zhang","Konrad Schindler","Martin Raubal"],"pdf_url":"https://arxiv.org/pdf/2212.01953v3.pdf","comment":"updated Discussion section; accepted by Transportation Research Part\n C"},{"id":"http://arxiv.org/abs/2212.05853v3","updated":"2023-08-21T08:11:41Z","published":"2022-12-12T12:31:46Z","title":"DeepCut: Unsupervised Segmentation using Graph Neural Networks\n Clustering","summary":" Image segmentation is a fundamental task in computer vision. Data annotation\nfor training supervised methods can be labor-intensive, motivating unsupervised\nmethods. Current approaches often rely on extracting deep features from\npre-trained networks to construct a graph, and classical clustering methods\nlike k-means and normalized-cuts are then applied as a post-processing step.\nHowever, this approach reduces the high-dimensional information encoded in the\nfeatures to pair-wise scalar affinities. To address this limitation, this study\nintroduces a lightweight Graph Neural Network (GNN) to replace classical\nclustering methods while optimizing for the same clustering objective function.\nUnlike existing methods, our GNN takes both the pair-wise affinities between\nlocal image features and the raw features as input. This direct connection\nbetween the raw features and the clustering objective enables us to implicitly\nperform classification of the clusters between different graphs, resulting in\npart semantic segmentation without the need for additional post-processing\nsteps. We demonstrate how classical clustering objectives can be formulated as\nself-supervised loss functions for training an image segmentation GNN.\nFurthermore, we employ the Correlation-Clustering (CC) objective to perform\nclustering without defining the number of clusters, allowing for k-less\nclustering. We apply the proposed method for object localization, segmentation,\nand semantic part segmentation tasks, surpassing state-of-the-art performance\non multiple benchmarks.\n","authors":["Amit Aflalo","Shai Bagon","Tamar Kashti","Yonina Eldar"],"pdf_url":"https://arxiv.org/pdf/2212.05853v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05712v2","updated":"2023-08-21T08:07:45Z","published":"2023-01-13T14:41:05Z","title":"A Survey of Self-supervised Learning from Multiple Perspectives:\n Algorithms, Applications and Future Trends","summary":" Deep supervised learning algorithms generally require large numbers of\nlabeled examples to achieve satisfactory performance. However, collecting and\nlabeling too many examples can be costly and time-consuming. 
As a subset of\nunsupervised learning, self-supervised learning (SSL) aims to learn useful\nfeatures from unlabeled examples without any human-annotated labels. SSL has\nrecently attracted much attention and many related algorithms have been\ndeveloped. However, there are few comprehensive studies that explain the\nconnections and evolution of different SSL variants. In this paper, we provide\na review of various SSL methods from the perspectives of algorithms,\napplications, three main trends, and open questions. First, the motivations of\nmost SSL algorithms are introduced in detail, and their commonalities and\ndifferences are compared. Second, typical applications of SSL in domains such\nas image processing and computer vision (CV), as well as natural language\nprocessing (NLP), are discussed. Finally, the three main trends of SSL and the\nopen research questions are discussed. A collection of useful materials is\navailable at https://github.com/guijiejie/SSL.\n","authors":["Jie Gui","Tuo Chen","Jing Zhang","Qiong Cao","Zhenan Sun","Hao Luo","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2301.05712v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10547v1","updated":"2023-08-21T08:02:16Z","published":"2023-08-21T08:02:16Z","title":"Decentralized Riemannian Conjugate Gradient Method on the Stiefel\n Manifold","summary":" The conjugate gradient method is a crucial first-order optimization method\nthat generally converges faster than the steepest descent method, and its\ncomputational cost is much lower than the second-order methods. However, while\nvarious types of conjugate gradient methods have been studied in Euclidean\nspaces and on Riemannian manifolds, there has little study for those in\ndistributed scenarios. This paper proposes a decentralized Riemannian conjugate\ngradient descent (DRCGD) method that aims at minimizing a global function over\nthe Stiefel manifold. The optimization problem is distributed among a network\nof agents, where each agent is associated with a local function, and\ncommunication between agents occurs over an undirected connected graph. Since\nthe Stiefel manifold is a non-convex set, a global function is represented as a\nfinite sum of possibly non-convex (but smooth) local functions. The proposed\nmethod is free from expensive Riemannian geometric operations such as\nretractions, exponential maps, and vector transports, thereby reducing the\ncomputational complexity required by each agent. To the best of our knowledge,\nDRCGD is the first decentralized Riemannian conjugate gradient algorithm to\nachieve global convergence over the Stiefel manifold.\n","authors":["Jun Chen","Haishan Ye","Mengmeng Wang","Tianxin Huang","Guang Dai","Ivor W. Tsang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2308.10547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10544v1","updated":"2023-08-21T07:58:15Z","published":"2023-08-21T07:58:15Z","title":"Towards Accelerated Model Training via Bayesian Data Selection","summary":" Mislabeled, duplicated, or biased data in real-world scenarios can lead to\nprolonged training and even hinder model convergence. Traditional solutions\nprioritizing easy or hard samples lack the flexibility to handle such a variety\nsimultaneously. Recent work has proposed a more reasonable data selection\nprinciple by examining the data's impact on the model's generalization loss.\nHowever, its practical adoption relies on less principled approximations and\nadditional clean holdout data. 
This work solves these problems by leveraging a\nlightweight Bayesian treatment and incorporating off-the-shelf zero-shot\npredictors built on large-scale pre-trained models. The resulting algorithm is\nefficient and easy-to-implement. We perform extensive empirical studies on\nchallenging benchmarks with considerable data noise and imbalance in the online\nbatch selection scenario, and observe superior training efficiency over\ncompetitive baselines. Notably, on the challenging WebVision benchmark, our\nmethod can achieve similar predictive performance with significantly fewer\ntraining iterations than leading data selection methods.\n","authors":["Zhijie Deng","Peng Cui","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.10544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10542v1","updated":"2023-08-21T07:52:39Z","published":"2023-08-21T07:52:39Z","title":"Learning Weakly Convex Regularizers for Convergent Image-Reconstruction\n Algorithms","summary":" We propose to learn non-convex regularizers with a prescribed upper bound on\ntheir weak-convexity modulus. Such regularizers give rise to variational\ndenoisers that minimize a convex energy. They rely on few parameters (less than\n15,000) and offer a signal-processing interpretation as they mimic handcrafted\nsparsity-promoting regularizers. Through numerical experiments, we show that\nsuch denoisers outperform convex-regularization methods as well as the popular\nBM3D denoiser. Additionally, the learned regularizer can be deployed to solve\ninverse problems with iterative schemes that provably converge. For both CT and\nMRI reconstruction, the regularizer generalizes well and offers an excellent\ntradeoff between performance, number of parameters, guarantees, and\ninterpretability when compared to other data-driven approaches.\n","authors":["Alexis Goujon","Sebastian Neumayer","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2308.10542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10537v1","updated":"2023-08-21T07:43:10Z","published":"2023-08-21T07:43:10Z","title":"KGrEaT: A Framework to Evaluate Knowledge Graphs via Downstream Tasks","summary":" In recent years, countless research papers have addressed the topics of\nknowledge graph creation, extension, or completion in order to create knowledge\ngraphs that are larger, more correct, or more diverse. This research is\ntypically motivated by the argumentation that using such enhanced knowledge\ngraphs to solve downstream tasks will improve performance. Nonetheless, this is\nhardly ever evaluated. Instead, the predominant evaluation metrics - aiming at\ncorrectness and completeness - are undoubtedly valuable but fail to capture the\ncomplete picture, i.e., how useful the created or enhanced knowledge graph\nactually is. Further, the accessibility of such a knowledge graph is rarely\nconsidered (e.g., whether it contains expressive labels, descriptions, and\nsufficient context information to link textual mentions to the entities of the\nknowledge graph). To better judge how well knowledge graphs perform on actual\ntasks, we present KGrEaT - a framework to estimate the quality of knowledge\ngraphs via actual downstream tasks like classification, clustering, or\nrecommendation. Instead of comparing different methods of processing knowledge\ngraphs with respect to a single task, the purpose of KGrEaT is to compare\nvarious knowledge graphs as such by evaluating them on a fixed task setup. 
The\nframework takes a knowledge graph as input, automatically maps it to the\ndatasets to be evaluated on, and computes performance metrics for the defined\ntasks. It is built in a modular way to be easily extendable with additional\ntasks and datasets.\n","authors":["Nicolas Heist","Sven Hertling","Heiko Paulheim"],"pdf_url":"https://arxiv.org/pdf/2308.10537v1.pdf","comment":"Accepted for the Short Paper track of CIKM'23, October 21-25, 2023,\n Birmingham, United Kingdom"},{"id":"http://arxiv.org/abs/2308.09228v2","updated":"2023-08-21T07:39:35Z","published":"2023-08-18T01:20:25Z","title":"Generalized Sum Pooling for Metric Learning","summary":" A common architectural choice for deep metric learning is a convolutional\nneural network followed by global average pooling (GAP). Albeit simple, GAP is\na highly effective way to aggregate information. One possible explanation for\nthe effectiveness of GAP is considering each feature vector as representing a\ndifferent semantic entity and GAP as a convex combination of them. Following\nthis perspective, we generalize GAP and propose a learnable generalized sum\npooling method (GSP). GSP improves GAP with two distinct abilities: i) the\nability to choose a subset of semantic entities, effectively learning to ignore\nnuisance information, and ii) learning the weights corresponding to the\nimportance of each entity. Formally, we propose an entropy-smoothed optimal\ntransport problem and show that it is a strict generalization of GAP, i.e., a\nspecific realization of the problem gives back GAP. We show that this\noptimization problem enjoys analytical gradients enabling us to use it as a\ndirect learnable replacement for GAP. We further propose a zero-shot loss to\nease the learning of GSP. We show the effectiveness of our method with\nextensive evaluations on 4 popular metric learning benchmarks. Code is\navailable at: GSP-DML Framework\n","authors":["Yeti Z. Gurbuz","Ozan Sener","A. Aydın Alatan"],"pdf_url":"https://arxiv.org/pdf/2308.09228v2.pdf","comment":"Accepted as a conference paper at International Conference on\n Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.10527v1","updated":"2023-08-21T07:26:09Z","published":"2023-08-21T07:26:09Z","title":"DPAN: Dynamic Preference-based and Attribute-aware Network for Relevant\n Recommendations","summary":" In e-commerce platforms, the relevant recommendation is a unique scenario\nproviding related items for a trigger item that users are interested in.\nHowever, users' preferences for the similarity and diversity of recommendation\nresults are dynamic and vary under different conditions. Moreover, individual\nitem-level diversity is too coarse-grained since all recommended items are\nrelated to the trigger item. Thus, the two main challenges are to learn\nfine-grained representations of similarity and diversity and capture users'\ndynamic preferences for them under different conditions. To address these\nchallenges, we propose a novel method called the Dynamic Preference-based and\nAttribute-aware Network (DPAN) for predicting Click-Through Rate (CTR) in\nrelevant recommendations. Specifically, based on Attribute-aware Activation\nValues Generation (AAVG), Bi-dimensional Compression-based Re-expression (BCR)\nis designed to obtain similarity and diversity representations of user\ninterests and item information. 
Then Shallow and Deep Union-based Fusion (SDUF)\nis proposed to capture users' dynamic preferences for the diverse degree of\nrecommendation results according to various conditions. DPAN has demonstrated\nits effectiveness through extensive offline experiments and online A/B testing,\nresulting in a significant 7.62% improvement in CTR. Currently, DPAN has been\nsuccessfully deployed on our e-commerce platform serving the primary traffic\nfor relevant recommendations. The code of DPAN has been made publicly\navailable.\n","authors":["Wei Dai","Yingmin Su","Xiaofeng Pan"],"pdf_url":"https://arxiv.org/pdf/2308.10527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06741v2","updated":"2023-08-21T07:21:28Z","published":"2023-05-11T11:53:31Z","title":"IVP-VAE: Modeling EHR Time Series with Initial Value Problem Solvers","summary":" Continuous-time models such as Neural ODEs and Neural Flows have shown\npromising results in analyzing irregularly sampled time series frequently\nencountered in electronic health records. Based on these models, time series\nare typically processed with a hybrid of an initial value problem (IVP) solver\nand a recurrent neural network within the variational autoencoder architecture.\nSequentially solving IVPs makes such models computationally less efficient. In\nthis paper, we propose to model time series purely with continuous processes\nwhose state evolution can be approximated directly by IVPs. This eliminates the\nneed for recurrent computation and enables multiple states to evolve in\nparallel. We further fuse the encoder and decoder with one IVP solver utilizing\nits invertibility, which leads to fewer parameters and faster convergence.\nExperiments on three real-world datasets show that the proposed method can\nsystematically outperform its predecessors, achieve state-of-the-art results,\nand have significant advantages in terms of data efficiency.\n","authors":["Jingge Xiao","Leonie Basso","Wolfgang Nejdl","Niloy Ganguly","Sandipan Sikdar"],"pdf_url":"https://arxiv.org/pdf/2305.06741v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.10522v1","updated":"2023-08-21T07:19:47Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. 
In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v1.pdf","comment":"This paper is accepted y the jourcal of Elsevier Neural Networks by\n 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2303.08757v4","updated":"2023-08-21T07:02:23Z","published":"2023-03-15T16:53:19Z","title":"CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in\n Patients With Suspected Ischemic Stroke","summary":" Precise and fast prediction methods for ischemic areas comprised of dead\ntissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS)\npatients are of significant clinical interest. They play an essential role in\nimproving diagnosis and treatment planning. Computed Tomography (CT) scan is\none of the primary modalities for early assessment in patients with suspected\nAIS. CT Perfusion (CTP) is often used as a primary assessment to determine\nstroke location, severity, and volume of ischemic lesions. Current automatic\nsegmentation methods for CTP mostly use already processed 3D parametric maps\nconventionally used for clinical interpretation by radiologists as input.\nAlternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time\ninput, where the spatial information over the volume is ignored. In addition,\nthese methods are only interested in segmenting core regions, while predicting\npenumbra can be essential for treatment planning. This paper investigates\ndifferent methods to utilize the entire 4D CTP as input to fully exploit the\nspatio-temporal information, leading us to propose a novel 4D convolution\nlayer. Our comprehensive experiments on a local dataset of 152 patients divided\ninto three groups show that our proposed models generate more precise results\nthan other methods explored. Adopting the proposed 4D mJ-Net, a Dice\nCoefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core\nareas, respectively. The code is available on\nhttps://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git.\n","authors":["Luca Tomasetti","Kjersti Engan","Liv Jorunn Høllesli","Kathinka Dæhli Kurz","Mahdieh Khanmohammadi"],"pdf_url":"https://arxiv.org/pdf/2303.08757v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10511v1","updated":"2023-08-21T06:51:58Z","published":"2023-08-21T06:51:58Z","title":"Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout\n Analysis","summary":" Understanding digital documents is like solving a puzzle, especially\nhistorical ones. Document Layout Analysis (DLA) helps with this puzzle by\ndividing documents into sections like paragraphs, images, and tables. This is\ncrucial for machines to read and understand these documents.In the DL Sprint\n2.0 competition, we worked on understanding Bangla documents. We used a dataset\ncalled BaDLAD with lots of examples. We trained a special model called Mask\nR-CNN to help with this understanding. We made this model better by\nstep-by-step hyperparameter tuning, and we achieved a good dice score of\n0.889.However, not everything went perfectly. 
We tried using a model trained\nfor English documents, but it didn't fit well with Bangla. This showed us that\neach language has its own challenges. Our solution for the DL Sprint 2.0 is\npublicly available at\nhttps://www.kaggle.com/competitions/dlsprint2/discussion/432201 along with\nnotebooks, weights, and inference notebook.\n","authors":["Shrestha Datta","Md Adith Mollah","Raisa Fairooz","Tariful Islam Fahim"],"pdf_url":"https://arxiv.org/pdf/2308.10511v1.pdf","comment":"Contest paper, Conest: DL sprint 2.0 (Link:\n https://www.kaggle.com/competitions/dlsprint2), Solution link:\n https://www.kaggle.com/competitions/dlsprint2/discussion/432201"},{"id":"http://arxiv.org/abs/2209.01566v2","updated":"2023-08-21T06:48:05Z","published":"2022-09-04T08:35:16Z","title":"Towards Top-Down Automated Development in Limited Scopes: A\n Neuro-Symbolic Framework from Expressibles to Executables","summary":" Deep code generation is a topic of deep learning for software engineering\n(DL4SE), which adopts neural models to generate code for the intended\nfunctions. Since end-to-end neural methods lack domain knowledge and software\nhierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To\nsystematically explore the potential improvements of code generation, we let it\nparticipate in the whole top-down development from \\emph{expressibles} to\n\\emph{executables}, which is possible in limited scopes. In the process, it\nbenefits from massive samples, features, and knowledge. As the foundation, we\nsuggest building a taxonomy on code data, namely code taxonomy, leveraging the\ncategorization of code information. Moreover, we introduce a three-layer\nsemantic pyramid (SP) to associate text data and code data. It identifies the\ninformation of different abstraction levels, and thus introduces the domain\nknowledge on development and reveals the hierarchy of software. Furthermore, we\npropose a semantic pyramid framework (SPF) as the approach, focusing on\nsoftware of high modularity and low complexity. SPF divides the code generation\nprocess into stages and reserves spots for potential interactions. In addition,\nwe conceived preliminary applications in software development to confirm the\nneuro-symbolic framework.\n","authors":["Jian Gu","Harald C. Gall"],"pdf_url":"https://arxiv.org/pdf/2209.01566v2.pdf","comment":"5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the\n camera-ready version"},{"id":"http://arxiv.org/abs/2308.10505v1","updated":"2023-08-21T06:45:58Z","published":"2023-08-21T06:45:58Z","title":"A Clustering Algorithm to Organize Satellite Hotspot Data for the\n Purpose of Tracking Bushfires Remotely","summary":" This paper proposes a spatiotemporal clustering algorithm and its\nimplementation in the R package spotoroo. This work is motivated by the\ncatastrophic bushfires in Australia throughout the summer of 2019-2020 and made\npossible by the availability of satellite hotspot data. The algorithm is\ninspired by two existing spatiotemporal clustering algorithms but makes\nenhancements to cluster points spatially in conjunction with their movement\nacross consecutive time periods. 
It also allows for the adjustment of key\nparameters, if required, for different locations and satellite data sources.\nBushfire data from Victoria, Australia, is used to illustrate the algorithm and\nits use within the package.\n","authors":["Weihao Li","Emily Dodwell","Dianne Cook"],"pdf_url":"https://arxiv.org/pdf/2308.10505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10504v1","updated":"2023-08-21T06:45:28Z","published":"2023-08-21T06:45:28Z","title":"Adaptive Thresholding Heuristic for KPI Anomaly Detection","summary":" A plethora of outlier detectors have been explored in the time series domain,\nhowever, in a business sense, not all outliers are anomalies of interest.\nExisting anomaly detection solutions are confined to certain outlier detectors\nlimiting their applicability to broader anomaly detection use cases. Network\nKPIs (Key Performance Indicators) tend to exhibit stochastic behaviour\nproducing statistical outliers, most of which do not adversely affect business\noperations. Thus, a heuristic is required to capture the business definition of\nan anomaly for time series KPI. This article proposes an Adaptive Thresholding\nHeuristic (ATH) to dynamically adjust the detection threshold based on the\nlocal properties of the data distribution and adapt to changes in time series\npatterns. The heuristic derives the threshold based on the expected periodicity\nand the observed proportion of anomalies minimizing false positives and\naddressing concept drift. ATH can be used in conjunction with any underlying\nseasonality decomposition method and an outlier detector that yields an outlier\nscore. This method has been tested on EON1-Cell-U, a labeled KPI anomaly\ndataset produced by Ericsson, to validate our hypothesis. Experimental results\nshow that ATH is computationally efficient making it scalable for near real\ntime anomaly detection and flexible with multiple forecasters and outlier\ndetectors.\n","authors":["Ebenezer R. H. P. Isaac","Akshat Sharma"],"pdf_url":"https://arxiv.org/pdf/2308.10504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10502v1","updated":"2023-08-21T06:42:42Z","published":"2023-08-21T06:42:42Z","title":"GradientCoin: A Peer-to-Peer Decentralized Large Language Models","summary":" Since 2008, after the proposal of a Bitcoin electronic cash system, Bitcoin\nhas fundamentally changed the economic system over the last decade. Since 2022,\nlarge language models (LLMs) such as GPT have outperformed humans in many\nreal-life tasks. However, these large language models have several practical\nissues. For example, the model is centralized and controlled by a specific\nunit. One weakness is that if that unit decides to shut down the model, it\ncannot be used anymore. The second weakness is the lack of guaranteed\ndiscrepancy behind this model, as certain dishonest units may design their own\nmodels and feed them unhealthy training data.\n In this work, we propose a purely theoretical design of a decentralized LLM\nthat operates similarly to a Bitcoin cash system. However, implementing such a\nsystem might encounter various practical difficulties. Furthermore, this new\nsystem is unlikely to perform better than the standard Bitcoin system in\neconomics. Therefore, the motivation for designing such a system is limited. 
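The Adaptive Thresholding Heuristic entry above derives a detection threshold from the expected periodicity and the observed proportion of anomalies. Below is a minimal sketch of one way such a data-driven threshold could be computed over an outlier score series; the rolling window, target anomaly rate, and quantile rule are illustrative assumptions, not the paper's heuristic.

```python
import numpy as np

def adaptive_threshold(scores, window=24 * 7, target_anomaly_rate=0.01):
    """Sketch of an adaptive threshold over outlier scores.

    At each step the threshold is the (1 - target_anomaly_rate) quantile of the
    scores in a trailing window, so the flagged proportion tracks the expected
    rate as the local score distribution drifts."""
    scores = np.asarray(scores, dtype=float)
    thresholds = np.empty_like(scores)
    for t in range(len(scores)):
        lo = max(0, t - window + 1)
        thresholds[t] = np.quantile(scores[lo:t + 1], 1.0 - target_anomaly_rate)
    return thresholds

# toy usage: a noisy periodic KPI with one injected spike
rng = np.random.default_rng(0)
kpi = np.sin(np.linspace(0, 20 * np.pi, 2000)) + 0.1 * rng.standard_normal(2000)
kpi[1500] += 5.0                          # injected anomaly
score = np.abs(kpi - np.median(kpi))      # stand-in outlier score
flags = score > adaptive_threshold(score)
print(int(flags.sum()), "points flagged")
```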
It\nis likely that only two types of people would be interested in setting up a\npractical system for it:\n $\\bullet$ Those who prefer to use a decentralized ChatGPT-like software.\n $\\bullet$ Those who believe that the purpose of carbon-based life is to\ncreate silicon-based life, such as Optimus Prime in Transformers.\n The reason the second type of people may be interested is that it is possible\nthat one day an AI system like this will awaken and become the next level of\nintelligence on this planet.\n","authors":["Yeqi Gao","Zhao Song","Junze Yin"],"pdf_url":"https://arxiv.org/pdf/2308.10502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10501v1","updated":"2023-08-21T06:42:33Z","published":"2023-08-21T06:42:33Z","title":"Deep Learning of Delay-Compensated Backstepping for Reaction-Diffusion\n PDEs","summary":" Deep neural networks that approximate nonlinear function-to-function\nmappings, i.e., operators, which are called DeepONet, have been demonstrated in\nrecent articles to be capable of encoding entire PDE control methodologies,\nsuch as backstepping, so that, for each new functional coefficient of a PDE\nplant, the backstepping gains are obtained through a simple function\nevaluation. These initial results have been limited to single PDEs from a given\nclass, approximating the solutions of only single-PDE operators for the gain\nkernels. In this paper we expand this framework to the approximation of\nmultiple (cascaded) nonlinear operators. Multiple operators arise in the\ncontrol of PDE systems from distinct PDE classes, such as the system in this\npaper: a reaction-diffusion plant, which is a parabolic PDE, with input delay,\nwhich is a hyperbolic PDE. The DeepONet-approximated nonlinear operator is a\ncascade/composition of the operators defined by one hyperbolic PDE of the\nGoursat form and one parabolic PDE on a rectangle, both of which are bilinear\nin their input functions and not explicitly solvable. For the delay-compensated\nPDE backstepping controller, which employs the learned control operator,\nnamely, the approximated gain kernel, we guarantee exponential stability in the\n$L^2$ norm of the plant state and the $H^1$ norm of the input delay state.\nSimulations illustrate the contributed theory.\n","authors":["Shanshan Wang","Mamadou Diagne","Miroslav Krstić"],"pdf_url":"https://arxiv.org/pdf/2308.10501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.01160v2","updated":"2023-08-21T06:40:41Z","published":"2022-10-31T01:45:42Z","title":"A Profit-Maximizing Strategy for Advertising on the e-Commerce Platforms","summary":" The online advertising management platform has become increasingly popular\namong e-commerce vendors/advertisers, offering a streamlined approach to reach\ntarget customers. Despite its advantages, configuring advertising strategies\ncorrectly remains a challenge for online vendors, particularly those with\nlimited resources. Ineffective strategies often result in a surge of\nunproductive ``just looking'' clicks, leading to disproportionately high\nadvertising expenses comparing to the growth of sales. In this paper, we\npresent a novel profit-maximing strategy for targeting options of online\nadvertising. The proposed model aims to find the optimal set of features to\nmaximize the probability of converting targeted audiences into actual buyers.\nWe address the optimization challenge by reformulating it as a multiple-choice\nknapsack problem (MCKP). 
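The advertising-targeting entry above reformulates the selection of targeting options as a multiple-choice knapsack problem (MCKP). Below is a small dynamic-programming sketch of the generic MCKP (pick exactly one option per feature group under a budget); the groups, costs, and conversion values are made up for illustration.

```python
def mckp(groups, budget):
    """Multiple-choice knapsack: pick exactly one (cost, value) item per group,
    keeping total cost <= budget, and maximize total value. Integer costs."""
    NEG = float("-inf")
    best = [NEG] * (budget + 1)
    best[0] = 0.0
    for items in groups:
        nxt = [NEG] * (budget + 1)
        for spent, val in enumerate(best):
            if val == NEG:
                continue
            for cost, gain in items:
                if spent + cost <= budget:
                    nxt[spent + cost] = max(nxt[spent + cost], val + gain)
        best = nxt
    return max(best)

# toy example: one group per targeting feature, one option per feature value
groups = [
    [(2, 0.30), (4, 0.55)],             # e.g. two audience-age options
    [(1, 0.10), (3, 0.40), (5, 0.45)],  # three region options
]
print(mckp(groups, budget=6))  # -> about 0.7 (spend 2 + 3)
```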
We conduct an empirical study featuring real-world\ndata from Tmall to show that our proposed method can effectively optimize the\nadvertising strategy with budgetary constraints.\n","authors":["Lianghai Xiao","Yixing Zhao","Jiwei Chen"],"pdf_url":"https://arxiv.org/pdf/2211.01160v2.pdf","comment":"Online advertising campaigns"},{"id":"http://arxiv.org/abs/2308.10496v1","updated":"2023-08-21T06:35:08Z","published":"2023-08-21T06:35:08Z","title":"Using Autoencoders and AutoDiff to Reconstruct Missing Variables in a\n Set of Time Series","summary":" Existing black box modeling approaches in machine learning suffer from a\nfixed input and output feature combination. In this paper, a new approach to\nreconstruct missing variables in a set of time series is presented. An\nautoencoder is trained as usual with every feature on both sides and the neural\nnetwork parameters are fixed after this training. Then, the searched variables\nare defined as missing variables at the autoencoder input and optimized via\nautomatic differentiation. This optimization is performed with respect to the\navailable features loss calculation. With this method, different input and\noutput feature combinations of the trained model can be realized by defining\nthe searched variables as missing variables and reconstructing them. The\ncombination can be changed without training the autoencoder again. The approach\nis evaluated on the base of a strongly nonlinear electrical component. It is\nworking well for one of four variables missing and generally even for multiple\nmissing variables.\n","authors":["Jan-Philipp Roche","Oliver Niggemann","Jens Friebe"],"pdf_url":"https://arxiv.org/pdf/2308.10496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08708v2","updated":"2023-08-21T06:18:34Z","published":"2023-08-17T00:10:16Z","title":"Consciousness in Artificial Intelligence: Insights from the Science of\n Consciousness","summary":" Whether current or near-term AI systems could be conscious is a topic of\nscientific interest and increasing public concern. This report argues for, and\nexemplifies, a rigorous and empirically grounded approach to AI consciousness:\nassessing existing AI systems in detail, in light of our best-supported\nneuroscientific theories of consciousness. We survey several prominent\nscientific theories of consciousness, including recurrent processing theory,\nglobal workspace theory, higher-order theories, predictive processing, and\nattention schema theory. From these theories we derive \"indicator properties\"\nof consciousness, elucidated in computational terms that allow us to assess AI\nsystems for these properties. We use these indicator properties to assess\nseveral recent AI systems, and we discuss how future systems might implement\nthem. Our analysis suggests that no current AI systems are conscious, but also\nshows that there are no obvious barriers to building conscious AI systems.\n","authors":["Patrick Butlin","Robert Long","Eric Elmoznino","Yoshua Bengio","Jonathan Birch","Axel Constant","George Deane","Stephen M. Fleming","Chris Frith","Xu Ji","Ryota Kanai","Colin Klein","Grace Lindsay","Matthias Michel","Liad Mudrik","Megan A. K. 
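The autoencoder entry above (arXiv:2308.10496) trains an autoencoder on all features, freezes its weights, and then reconstructs missing variables by optimizing them through automatic differentiation against the loss on the available features only. Below is a hedged PyTorch sketch of that procedure at toy scale; the architecture, optimizer settings, and which feature is treated as missing are assumptions.

```python
import torch, torch.nn as nn

torch.manual_seed(0)
n_feat = 4
ae = nn.Sequential(nn.Linear(n_feat, 8), nn.Tanh(), nn.Linear(8, 2),
                   nn.Tanh(), nn.Linear(2, 8), nn.Tanh(), nn.Linear(8, n_feat))

# 1) train the autoencoder as usual with every feature on both sides
x_train = torch.randn(256, n_feat)
opt = torch.optim.Adam(ae.parameters(), lr=1e-3)
for _ in range(200):
    opt.zero_grad()
    nn.functional.mse_loss(ae(x_train), x_train).backward()
    opt.step()

# 2) freeze the weights, declare feature 2 missing, and optimize it directly
for p in ae.parameters():
    p.requires_grad_(False)
x_obs = torch.randn(1, n_feat)
missing = torch.zeros(1, 1, requires_grad=True)   # the searched variable
observed_idx = [0, 1, 3]
opt_x = torch.optim.Adam([missing], lr=1e-2)
for _ in range(500):
    opt_x.zero_grad()
    x_full = torch.cat([x_obs[:, :2], missing, x_obs[:, 3:]], dim=1)
    recon = ae(x_full)
    # the loss is computed on the available features only
    loss = nn.functional.mse_loss(recon[:, observed_idx], x_full[:, observed_idx])
    loss.backward()
    opt_x.step()
print("reconstructed value for the missing feature:", missing.item())
```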
Peters","Eric Schwitzgebel","Jonathan Simon","Rufin VanRullen"],"pdf_url":"https://arxiv.org/pdf/2308.08708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10487v1","updated":"2023-08-21T06:04:53Z","published":"2023-08-21T06:04:53Z","title":"Deciphering Raw Data in Neuro-Symbolic Learning with Provable Guarantees","summary":" Neuro-symbolic hybrid systems are promising for integrating machine learning\nand symbolic reasoning, where perception models are facilitated with\ninformation inferred from a symbolic knowledge base through logical reasoning.\nDespite empirical evidence showing the ability of hybrid systems to learn\naccurate perception models, the theoretical understanding of learnability is\nstill lacking. Hence, it remains unclear why a hybrid system succeeds for a\nspecific task and when it may fail given a different knowledge base. In this\npaper, we introduce a novel way of characterising supervision signals from a\nknowledge base, and establish a criterion for determining the knowledge's\nefficacy in facilitating successful learning. This, for the first time, allows\nus to address the two questions above by inspecting the knowledge base under\ninvestigation. Our analysis suggests that many knowledge bases satisfy the\ncriterion, thus enabling effective learning, while some fail to satisfy it,\nindicating potential failures. Comprehensive experiments confirm the utility of\nour criterion on benchmark tasks.\n","authors":["Lue Tao","Yu-Xuan Huang","Wang-Zhou Dai","Yuan Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.10487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10486v1","updated":"2023-08-21T06:04:30Z","published":"2023-08-21T06:04:30Z","title":"Deep Metric Loss for Multimodal Learning","summary":" Multimodal learning often outperforms its unimodal counterparts by exploiting\nunimodal contributions and cross-modal interactions. However, focusing only on\nintegrating multimodal features into a unified comprehensive representation\noverlooks the unimodal characteristics. In real data, the contributions of\nmodalities can vary from instance to instance, and they often reinforce or\nconflict with each other. In this study, we introduce a novel \\text{MultiModal}\nloss paradigm for multimodal learning, which subgroups instances according to\ntheir unimodal contributions. \\text{MultiModal} loss can prevent inefficient\nlearning caused by overfitting and efficiently optimize multimodal models. On\nsynthetic data, \\text{MultiModal} loss demonstrates improved classification\nperformance by subgrouping difficult instances within certain modalities. On\nfour real multimodal datasets, our loss is empirically shown to improve the\nperformance of recent models. Ablation studies verify the effectiveness of our\nloss. Additionally, we show that our loss generates a reliable prediction score\nfor each modality, which is essential for subgrouping. Our \\text{MultiModal}\nloss is a novel loss function to subgroup instances according to the\ncontribution of modalities in multimodal learning and is applicable to a\nvariety of multimodal models with unimodal decisions. 
Our code is available at\nhttps://github.com/SehwanMoon/MultiModalLoss.\n","authors":["Sehwan Moon","Hyunju Lee"],"pdf_url":"https://arxiv.org/pdf/2308.10486v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.10482v1","updated":"2023-08-21T05:46:40Z","published":"2023-08-21T05:46:40Z","title":"An Effective Method using Phrase Mechanism in Neural Machine Translation","summary":" Machine Translation is one of the essential tasks in Natural Language\nProcessing (NLP), which has massive applications in real life as well as\ncontributing to other tasks in the NLP research community. Recently,\nTransformer -based methods have attracted numerous researchers in this domain\nand achieved state-of-the-art results in most of the pair languages. In this\npaper, we report an effective method using a phrase mechanism,\nPhraseTransformer, to improve the strong baseline model Transformer in\nconstructing a Neural Machine Translation (NMT) system for parallel corpora\nVietnamese-Chinese. Our experiments on the MT dataset of the VLSP 2022\ncompetition achieved the BLEU score of 35.3 on Vietnamese to Chinese and 33.2\nBLEU scores on Chinese to Vietnamese data. Our code is available at\nhttps://github.com/phuongnm94/PhraseTransformer.\n","authors":["Phuong Minh Nguyen","Le Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.10482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08306v2","updated":"2023-08-21T05:26:45Z","published":"2023-06-14T07:23:36Z","title":"Towards Balanced Active Learning for Multimodal Classification","summary":" Training multimodal networks requires a vast amount of data due to their\nlarger parameter space compared to unimodal networks. Active learning is a\nwidely used technique for reducing data annotation costs by selecting only\nthose samples that could contribute to improving model performance. However,\ncurrent active learning strategies are mostly designed for unimodal tasks, and\nwhen applied to multimodal data, they often result in biased sample selection\nfrom the dominant modality. This unfairness hinders balanced multimodal\nlearning, which is crucial for achieving optimal performance. To address this\nissue, we propose three guidelines for designing a more balanced multimodal\nactive learning strategy. Following these guidelines, a novel approach is\nproposed to achieve more fair data selection by modulating the gradient\nembedding with the dominance degree among modalities. Our studies demonstrate\nthat the proposed method achieves more balanced multimodal learning by avoiding\ngreedy sample selection from the dominant modality. Our approach outperforms\nexisting active learning strategies on a variety of multimodal classification\ntasks. 
Overall, our work highlights the importance of balancing sample\nselection in multimodal active learning and provides a practical solution for\nachieving more balanced active learning for multimodal classification.\n","authors":["Meng Shen","Yizheng Huang","Jianxiong Yin","Heqing Zou","Deepu Rajan","Simon See"],"pdf_url":"https://arxiv.org/pdf/2306.08306v2.pdf","comment":"12 pages, accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2306.06236v3","updated":"2023-08-21T05:06:36Z","published":"2023-06-09T20:12:02Z","title":"iPLAN: Intent-Aware Planning in Heterogeneous Traffic via Distributed\n Multi-Agent Reinforcement Learning","summary":" Navigating safely and efficiently in dense and heterogeneous traffic\nscenarios is challenging for autonomous vehicles (AVs) due to their inability\nto infer the behaviors or intentions of nearby drivers. In this work, we\nintroduce a distributed multi-agent reinforcement learning (MARL) algorithm\nthat can predict trajectories and intents in dense and heterogeneous traffic\nscenarios. Our approach for intent-aware planning, iPLAN, allows agents to\ninfer nearby drivers' intents solely from their local observations. We model\ntwo distinct incentives for agents' strategies: Behavioral Incentive for\nhigh-level decision-making based on their driving behavior or personality and\nInstant Incentive for motion planning for collision avoidance based on the\ncurrent traffic state. Our approach enables agents to infer their opponents'\nbehavior incentives and integrate this inferred information into their\ndecision-making and motion-planning processes. We perform experiments on two\nsimulation environments, Non-Cooperative Navigation and Heterogeneous Highway.\nIn Heterogeneous Highway, results show that, compared with centralized training\ndecentralized execution (CTDE) MARL baselines such as QMIX and MAPPO, our\nmethod yields a 4.3% and 38.4% higher episodic reward in mild and chaotic\ntraffic, with 48.1% higher success rate and 80.6% longer survival time in\nchaotic traffic. We also compare with a decentralized training decentralized\nexecution (DTDE) baseline IPPO and demonstrate a higher episodic reward of\n12.7% and 6.3% in mild traffic and chaotic traffic, 25.3% higher success rate,\nand 13.7% longer survival time.\n","authors":["Xiyang Wu","Rohan Chandra","Tianrui Guan","Amrit Singh Bedi","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2306.06236v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10462v1","updated":"2023-08-21T04:31:06Z","published":"2023-08-21T04:31:06Z","title":"Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation\n with Large Language Models","summary":" Large Language Models (LLMs) possess impressive capabilities to generate\nmeaningful code snippets given natural language intents in zero-shot, i.e.,\nwithout the need for specific fine-tuning. In the perspective of unleashing\ntheir full potential, prior work has demonstrated the benefits of fine-tuning\nthe models to task-specific data. However, fine-tuning process demands heavy\ncomputational costs and is intractable when resources are scarce, especially\nfor models with billions of parameters. In light of these challenges, previous\nstudies explored In-Context Learning (ICL) as an effective strategy to generate\ncontextually appropriate code without fine-tuning. However, it operates at\ninference time and does not involve learning task-specific parameters,\npotentially limiting the model's performance on downstream tasks. 
In this\ncontext, we foresee that Parameter-Efficient Fine-Tuning (PEFT) techniques\ncarry a high potential for efficiently specializing LLMs to task-specific data.\nIn this paper, we deliver a comprehensive study of LLMs with the impact of PEFT\ntechniques under the automated code generation scenario. Our experimental\nresults reveal the superiority and potential of such techniques over ICL on a\nwide range of LLMs in reducing the computational burden and improving\nperformance. Therefore, the study opens opportunities for broader applications\nof PEFT in software engineering scenarios.\n","authors":["Martin Weyssow","Xin Zhou","Kisub Kim","David Lo","Houari Sahraoui"],"pdf_url":"https://arxiv.org/pdf/2308.10462v1.pdf","comment":"10+2 pages"},{"id":"http://arxiv.org/abs/2308.10457v1","updated":"2023-08-21T04:09:59Z","published":"2023-08-21T04:09:59Z","title":"Adaptive Local Steps Federated Learning with Differential Privacy Driven\n by Convergence Analysis","summary":" Federated Learning (FL) is a distributed machine learning technique that\nallows model training among multiple devices or organizations without sharing\ndata. However, while FL ensures that the raw data is not directly accessible to\nexternal adversaries, adversaries can still obtain some statistical information\nabout the data through differential attacks. Differential Privacy (DP) has been\nproposed, which adds noise to the model or gradients to prevent adversaries\nfrom inferring private information from the transmitted parameters. We\nreconsider the framework of differential privacy federated learning in\nresource-constrained scenarios (privacy budget and communication resources). We\nanalyze the convergence of federated learning with differential privacy (DPFL)\non resource-constrained scenarios and propose an Adaptive Local Steps\nDifferential Privacy Federated Learning (ALS-DPFL) algorithm. We experiment our\nalgorithm on the FashionMNIST and Cifar-10 datasets and achieve quite good\nperformance relative to previous work.\n","authors":["Xinpeng Ling","Jie Fu","Zhili Chen"],"pdf_url":"https://arxiv.org/pdf/2308.10457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10453v1","updated":"2023-08-21T03:58:04Z","published":"2023-08-21T03:58:04Z","title":"DOMINO++: Domain-aware Loss Regularization for Deep Learning\n Generalizability","summary":" Out-of-distribution (OOD) generalization poses a serious challenge for modern\ndeep learning (DL). OOD data consists of test data that is significantly\ndifferent from the model's training data. DL models that perform well on\nin-domain test data could struggle on OOD data. Overcoming this discrepancy is\nessential to the reliable deployment of DL. Proper model calibration decreases\nthe number of spurious connections that are made between model features and\nclass outputs. Hence, calibrated DL can improve OOD generalization by only\nlearning features that are truly indicative of the respective classes. Previous\nwork proposed domain-aware model calibration (DOMINO) to improve DL\ncalibration, but it lacks designs for model generalizability to OOD data. In\nthis work, we propose DOMINO++, a dual-guidance and dynamic domain-aware loss\nregularization focused on OOD generalizability. DOMINO++ integrates\nexpert-guided and data-guided knowledge in its regularization. Unlike DOMINO\nwhich imposed a fixed scaling and regularization rate, DOMINO++ designs a\ndynamic scaling factor and an adaptive regularization rate. 
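The study above compares parameter-efficient fine-tuning with in-context learning for code generation. As an illustration of one widely used PEFT technique (not necessarily the configurations studied in the paper), here is a LoRA sketch with the Hugging Face peft library; the base model, target modules, and hyperparameters are placeholders.

```python
# Minimal LoRA sketch (PEFT) for adapting a causal LM to code data.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType

model_name = "gpt2"  # stand-in; the paper studies dedicated code LLMs
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16,
                    lora_dropout=0.05, target_modules=["c_attn"])
model = get_peft_model(model, config)
model.print_trainable_parameters()  # only the low-rank adapters are trainable

batch = tok("def add(a, b):\n    return a + b\n", return_tensors="pt")
out = model(**batch, labels=batch["input_ids"])
out.loss.backward()  # one illustrative gradient step on a tiny code snippet
```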
Comprehensive\nevaluations compare DOMINO++ with DOMINO and the baseline model for head tissue\nsegmentation from magnetic resonance images (MRIs) on OOD data. The OOD data\nconsists of synthetic noisy and rotated datasets, as well as real data using a\ndifferent MRI scanner from a separate site. DOMINO++'s superior performance\ndemonstrates its potential to improve the trustworthy deployment of DL on real\nclinical data.\n","authors":["Skylar E. Stolte","Kyle Volle","Aprinda Indahlastari","Alejandro Albizu","Adam J. Woods","Kevin Brink","Matthew Hale","Ruogu Fang"],"pdf_url":"https://arxiv.org/pdf/2308.10453v1.pdf","comment":"12 pages, 5 figures, 5 tables, Accepted by the International\n Conference on Medical Image Computing and Computer Assisted Intervention\n (MICCAI) 2023"},{"id":"http://arxiv.org/abs/2308.10449v1","updated":"2023-08-21T03:50:09Z","published":"2023-08-21T03:50:09Z","title":"CVFC: Attention-Based Cross-View Feature Consistency for Weakly\n Supervised Semantic Segmentation of Pathology Images","summary":" Histopathology image segmentation is the gold standard for diagnosing cancer,\nand can indicate cancer prognosis. However, histopathology image segmentation\nrequires high-quality masks, so many studies now use image-level labels to\nachieve pixel-level segmentation to reduce the need for fine-grained\nannotation. To solve this problem, we propose an attention-based cross-view\nfeature consistency end-to-end pseudo-mask generation framework named CVFC\nbased on the attention mechanism. Specifically, CVFC is a three-branch joint\nframework composed of two Resnet38 and one Resnet50; each independent branch\nintegrates multi-scale feature maps to generate a class activation map (CAM); in\neach branch, the size of the CAM is adjusted through down-sampling and\nexpansion; the middle branch projects the feature matrix to the query and key\nfeature spaces, and generates a feature space perception matrix through the\nconnection layer and inner product to adjust and refine the CAM of each branch;\nfinally, the feature consistency loss and feature cross loss are used to\noptimize the parameters of CVFC in co-training mode. After a large number of\nexperiments, an IoU of 0.7122 and a fwIoU of 0.7018 are obtained on the\nWSSS4LUAD dataset, which outperforms HistoSegNet, SEAM, C-CAM, WSSS-Tissue, and\nOEEM.\n","authors":["Liangrui Pan","Lian Wang","Zhichao Feng","Liwen Xu","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2308.10449v1.pdf","comment":"Submitted to BIBM2023"},{"id":"http://arxiv.org/abs/2307.03206v2","updated":"2023-08-21T03:41:25Z","published":"2023-07-06T04:10:12Z","title":"Optimal Bandwidth Selection for DENCLUE Algorithm","summary":" In modern day industry, clustering algorithms are daily routines of algorithm\nengineers. Although clustering algorithms experienced rapid growth before 2010,\ninnovation related to the research topic has stagnated after deep learning\nbecame the de facto industrial standard for machine learning applications. In\n2007, a density-based clustering algorithm named DENCLUE was invented to solve\nthe clustering problem for nonlinear data structures. However, its parameter\nselection problem was largely neglected until 2011. 
In this paper, we propose a\nnew approach to compute the optimal parameters for the DENCLUE algorithm, and\ndiscuss its performance in the experiment section.\n","authors":["Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2307.03206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10442v1","updated":"2023-08-21T03:28:34Z","published":"2023-08-21T03:28:34Z","title":"DySuse: Susceptibility Estimation in Dynamic Social Networks","summary":" Influence estimation aims to predict the total influence spread in social\nnetworks and has received surged attention in recent years. Most current\nstudies focus on estimating the total number of influenced users in a social\nnetwork, and neglect susceptibility estimation that aims to predict the\nprobability of each user being influenced from the individual perspective. As a\nmore fine-grained estimation task, susceptibility estimation is full of\nattractiveness and practical value. Based on the significance of susceptibility\nestimation and dynamic properties of social networks, we propose a task, called\nsusceptibility estimation in dynamic social networks, which is even more\nrealistic and valuable in real-world applications. Susceptibility estimation in\ndynamic networks has yet to be explored so far and is computationally\nintractable to naively adopt Monte Carlo simulation to obtain the results. To\nthis end, we propose a novel end-to-end framework DySuse based on dynamic graph\nembedding technology. Specifically, we leverage a structural feature module to\nindependently capture the structural information of influence diffusion on each\nsingle graph snapshot. Besides, {we propose the progressive mechanism according\nto the property of influence diffusion,} to couple the structural and temporal\ninformation during diffusion tightly. Moreover, a self-attention block {is\ndesigned to} further capture temporal dependency by flexibly weighting\nhistorical timestamps. Experimental results show that our framework is superior\nto the existing dynamic graph embedding models and has satisfactory prediction\nperformance in multiple influence diffusion models.\n","authors":["Yingdan Shi","Jingya Zhou","Congcong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10442v1.pdf","comment":"This paper has been published in Expert Systems With Applications"},{"id":"http://arxiv.org/abs/2209.14609v6","updated":"2023-08-21T03:15:35Z","published":"2022-09-29T07:58:32Z","title":"Dataset Distillation Using Parameter Pruning","summary":" In this study, we propose a novel dataset distillation method based on\nparameter pruning. The proposed method can synthesize more robust distilled\ndatasets and improve distillation performance by pruning difficult-to-match\nparameters during the distillation process. Experimental results on two\nbenchmark datasets show the superiority of the proposed method.\n","authors":["Guang Li","Ren Togo","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2209.14609v6.pdf","comment":"Published as a journal paper at IEICE Trans. Fund"},{"id":"http://arxiv.org/abs/2308.10436v1","updated":"2023-08-21T03:13:38Z","published":"2023-08-21T03:13:38Z","title":"Approximately Equivariant Graph Networks","summary":" Graph neural networks (GNNs) are commonly described as being permutation\nequivariant with respect to node relabeling in the graph. This symmetry of GNNs\nis often compared to the translation equivariance symmetry of Euclidean\nconvolution neural networks (CNNs). 
However, these two symmetries are\nfundamentally different: The translation equivariance of CNNs corresponds to\nsymmetries of the fixed domain acting on the image signal (sometimes known as\nactive symmetries), whereas in GNNs any permutation acts on both the graph\nsignals and the graph domain (sometimes described as passive symmetries). In\nthis work, we focus on the active symmetries of GNNs, by considering a learning\nsetting where signals are supported on a fixed graph. In this case, the natural\nsymmetries of GNNs are the automorphisms of the graph. Since real-world graphs\ntend to be asymmetric, we relax the notion of symmetries by formalizing\napproximate symmetries via graph coarsening. We present a bias-variance formula\nthat quantifies the tradeoff between the loss in expressivity and the gain in\nthe regularity of the learned estimator, depending on the chosen symmetry\ngroup. To illustrate our approach, we conduct extensive experiments on image\ninpainting, traffic flow prediction, and human pose estimation with different\nchoices of symmetries. We show theoretically and empirically that the best\ngeneralization performance can be achieved by choosing a suitably larger group\nthan the graph automorphism group, but smaller than the full permutation group.\n","authors":["Ningyuan Huang","Ron Levie","Soledad Villar"],"pdf_url":"https://arxiv.org/pdf/2308.10436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02442v2","updated":"2023-08-21T03:11:28Z","published":"2023-08-04T16:14:43Z","title":"Adaptive Preferential Attached kNN Graph with Distribution-Awareness","summary":" Graph-based kNN algorithms have garnered widespread popularity for machine\nlearning tasks due to their simplicity and effectiveness. However, as factual\ndata often inherit complex distributions, the conventional kNN graph's reliance\non a unified k-value can hinder its performance. A crucial factor behind this\nchallenge is the presence of ambiguous samples along decision boundaries that\nare inevitably more prone to incorrect classifications. To address the\nsituation, we propose the Preferential Attached k-Nearest Neighbors Graph\n(paNNG), which adopts distribution-aware adaptive-k into graph construction. By\nincorporating distribution information as a cohesive entity, paNNG can\nsignificantly improve performance on ambiguous samples by \"pulling\" them\ntowards their original classes and hence enhance overall generalization\ncapability. Through rigorous evaluations on diverse datasets, paNNG outperforms\nstate-of-the-art algorithms, showcasing its adaptability and efficacy across\nvarious real-world scenarios.\n","authors":["Shaojie Min","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2308.02442v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07221v4","updated":"2023-08-21T02:56:43Z","published":"2023-08-14T15:47:25Z","title":"AudioFormer: Audio Transformer learns audio feature representations from\n discrete acoustic codes","summary":" We propose a method named AudioFormer,which learns audio feature\nrepresentations through the acquisition of discrete acoustic codes and\nsubsequently fine-tunes them for audio classification tasks. Initially,we\nintroduce a novel perspective by considering the audio classification task as a\nform of natural language understanding (NLU). 
Leveraging an existing neural\naudio codec model, we generate discrete acoustic codes and utilize them to train\na masked language model (MLM), thereby obtaining audio feature representations.\nFurthermore, we pioneer the integration of a Multi-Positive sample Contrastive\n(MPC) learning approach. This method enables the learning of joint\nrepresentations among multiple discrete acoustic codes within the same audio\ninput. In our experiments, we treat discrete acoustic codes as textual data and\ntrain a masked language model using a cloze-like methodology, ultimately\nderiving high-quality audio representations. Notably, the MPC learning technique\neffectively captures collaborative representations among distinct positive\nsamples. Our research outcomes demonstrate that AudioFormer attains\nsignificantly improved performance compared to prevailing monomodal audio\nclassification models across multiple datasets, and even outperforms\naudio-visual multimodal classification models on select datasets.\nSpecifically, our approach achieves remarkable results on datasets including\nAudioSet (2M, 20K), and FSD50K, with performance scores of 53.9, 45.1, and\n65.6, respectively. We have openly shared both the code and models:\nhttps://github.com/LZH-0225/AudioFormer.git.\n","authors":["Zhaohui Li","Haitao Wang","Xinghua Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07221v4.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2201.09636v4","updated":"2023-08-21T02:55:53Z","published":"2022-01-24T12:31:24Z","title":"Neural Implicit Surface Evolution","summary":" This work investigates the use of smooth neural networks for modeling dynamic\nvariations of implicit surfaces under the level set equation (LSE). For this,\nit extends the representation of neural implicit surfaces to the space-time\n$\\mathbb{R}^3\\times \\mathbb{R}$, which opens up mechanisms for continuous\ngeometric transformations. Examples include evolving an initial surface towards\ngeneral vector fields, smoothing and sharpening using the mean curvature\nequation, and interpolations of initial conditions.\n The network training considers two constraints. A data term is responsible\nfor fitting the initial condition to the corresponding time instant, usually\n$\\mathbb{R}^3 \\times \\{0\\}$. Then, a LSE term forces the network to approximate\nthe underlying geometric evolution given by the LSE, without any supervision.\nThe network can also be initialized based on previously trained initial\nconditions, resulting in faster convergence compared to the standard approach.\n","authors":["Tiago Novello","Vinicius da Silva","Guilherme Schardong","Luiz Schirmer","Helio Lopes","Luiz Velho"],"pdf_url":"https://arxiv.org/pdf/2201.09636v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10427v1","updated":"2023-08-21T02:43:38Z","published":"2023-08-21T02:43:38Z","title":"Federated Learning Robust to Byzantine Attacks: Achieving Zero\n Optimality Gap","summary":" In this paper, we propose a robust aggregation method for federated learning\n(FL) that can effectively tackle malicious Byzantine attacks. At each user,\nmodel parameter is firstly updated by multiple steps, which is adjustable over\niterations, and then pushed to the aggregation center directly. This decreases\nthe number of interactions between the aggregation center and users, allows\neach user to set training parameter in a flexible way, and reduces computation\nburden compared with existing works that need to combine multiple historical\nmodel parameters. 
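The neural implicit surface entry above trains a space-time network with a data term at the initial time and an unsupervised term enforcing the level set equation. Below is a toy PyTorch sketch of that two-term loss for transport under a prescribed velocity field; the network size, sampling ranges, and velocity field are assumptions for illustration.

```python
import torch, torch.nn as nn

net = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 64),
                    nn.Tanh(), nn.Linear(64, 1))  # f(x, y, z, t)

def lse_residual(xt, velocity):
    """Residual of the level set equation f_t + v . grad_x f = 0 at points xt."""
    xt = xt.requires_grad_(True)
    f = net(xt)
    grads = torch.autograd.grad(f.sum(), xt, create_graph=True)[0]
    grad_x, f_t = grads[:, :3], grads[:, 3:]
    return f_t + (velocity(xt[:, :3]) * grad_x).sum(dim=1, keepdim=True)

# toy setup: initial condition is a unit-sphere SDF, transported along +x
def sphere_sdf(x):
    return x.norm(dim=1, keepdim=True) - 1.0

velocity = lambda x: torch.tensor([[1.0, 0.0, 0.0]]).expand(x.shape[0], 3)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)
for _ in range(100):
    opt.zero_grad()
    x0 = torch.rand(256, 3) * 2 - 1                        # points at t = 0
    data = nn.functional.mse_loss(net(torch.cat([x0, torch.zeros(256, 1)], 1)),
                                  sphere_sdf(x0))
    xt = torch.rand(256, 4) * 2 - 1                        # space-time samples
    lse = lse_residual(xt, velocity).pow(2).mean()
    (data + lse).backward()
    opt.step()
```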
At the aggregation center, geometric median is leveraged to\ncombine the received model parameters from each user. Rigorous proof shows that\nzero optimality gap is achieved by our proposed method with linear convergence,\nas long as the fraction of Byzantine attackers is below half. Numerical results\nverify the effectiveness of our proposed method.\n","authors":["Shiyuan Zuo","Rongfei Fan","Han Hu","Ning Zhang","Shimin Gong"],"pdf_url":"https://arxiv.org/pdf/2308.10427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10425v1","updated":"2023-08-21T02:27:13Z","published":"2023-08-21T02:27:13Z","title":"Spatio-Temporal Adaptive Embedding Makes Vanilla Transformer SOTA for\n Traffic Forecasting","summary":" With the rapid development of the Intelligent Transportation System (ITS),\naccurate traffic forecasting has emerged as a critical challenge. The key\nbottleneck lies in capturing the intricate spatio-temporal traffic patterns. In\nrecent years, numerous neural networks with complicated architectures have been\nproposed to address this issue. However, the advancements in network\narchitectures have encountered diminishing performance gains. In this study, we\npresent a novel component called spatio-temporal adaptive embedding that can\nyield outstanding results with vanilla transformers. Our proposed\nSpatio-Temporal Adaptive Embedding transformer (STAEformer) achieves\nstate-of-the-art performance on five real-world traffic forecasting datasets.\nFurther experiments demonstrate that spatio-temporal adaptive embedding plays a\ncrucial role in traffic forecasting by effectively capturing intrinsic\nspatio-temporal relations and chronological information in traffic time series.\n","authors":["Hangchen Liu","Zheng Dong","Renhe Jiang","Jiewen Deng","Jinliang Deng","Quanjun Chen","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2308.10425v1.pdf","comment":"Accepted as CIKM2023 Short Paper"},{"id":"http://arxiv.org/abs/2305.00795v3","updated":"2023-08-21T02:14:41Z","published":"2023-05-01T12:47:55Z","title":"SelfDocSeg: A Self-Supervised vision-based Approach towards Document\n Segmentation","summary":" Document layout analysis is a known problem to the documents research\ncommunity and has been vastly explored yielding a multitude of solutions\nranging from text mining, and recognition to graph-based representation, visual\nfeature extraction, etc. However, most of the existing works have ignored the\ncrucial fact regarding the scarcity of labeled data. With growing internet\nconnectivity to personal life, an enormous amount of documents had been\navailable in the public domain and thus making data annotation a tedious task.\nWe address this challenge using self-supervision and unlike, the few existing\nself-supervised document segmentation approaches which use text mining and\ntextual labels, we use a complete vision-based approach in pre-training without\nany ground-truth label or its derivative. Instead, we generate pseudo-layouts\nfrom the document images to pre-train an image encoder to learn the document\nobject representation and localization in a self-supervised framework before\nfine-tuning it with an object detection model. We show that our pipeline sets a\nnew benchmark in this context and performs at par with the existing methods and\nthe supervised counterparts, if not outperforms. 
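The Byzantine-robust federated learning entry above aggregates user updates with the geometric median. Below is a sketch of the standard Weiszfeld iteration applied to flattened model updates, with a toy comparison against the plain mean under injected outliers; the iteration count and tolerance are assumptions.

```python
import numpy as np

def geometric_median(points, iters=100, eps=1e-8):
    """Weiszfeld iteration: the geometric median minimizes the sum of
    Euclidean distances to the input points (here, flattened model updates)."""
    z = points.mean(axis=0)
    for _ in range(iters):
        d = np.linalg.norm(points - z, axis=1)
        d = np.maximum(d, eps)            # avoid division by zero
        w = 1.0 / d
        z_new = (w[:, None] * points).sum(axis=0) / w.sum()
        if np.linalg.norm(z_new - z) < eps:
            break
        z = z_new
    return z

# toy aggregation: 8 honest users near the true parameter, 3 Byzantine outliers
rng = np.random.default_rng(1)
honest = rng.normal(loc=1.0, scale=0.1, size=(8, 5))
byzantine = rng.normal(loc=50.0, scale=1.0, size=(3, 5))
updates = np.vstack([honest, byzantine])
print("mean:            ", updates.mean(axis=0).round(2))
print("geometric median:", geometric_median(updates).round(2))
```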
The code is made publicly\navailable at: https://github.com/MaitySubhajit/SelfDocSeg\n","authors":["Subhajit Maity","Sanket Biswas","Siladittya Manna","Ayan Banerjee","Josep Lladós","Saumik Bhattacharya","Umapada Pal"],"pdf_url":"https://arxiv.org/pdf/2305.00795v3.pdf","comment":"Accepted at The 17th International Conference on Document Analysis\n and Recognition (ICDAR 2023)"},{"id":"http://arxiv.org/abs/2308.10415v1","updated":"2023-08-21T01:52:01Z","published":"2023-08-21T01:52:01Z","title":"TokenSplit: Using Discrete Speech Representations for Direct, Refined,\n and Transcript-Conditioned Speech Separation and Recognition","summary":" We present TokenSplit, a speech separation model that acts on discrete token\nsequences. The model is trained on multiple tasks simultaneously: separate and\ntranscribe each speech source, and generate speech from text. The model\noperates on transcripts and audio token sequences and achieves multiple tasks\nthrough masking of inputs. The model is a sequence-to-sequence encoder-decoder\nmodel that uses the Transformer architecture. We also present a \"refinement\"\nversion of the model that predicts enhanced audio tokens from the audio tokens\nof speech separated by a conventional separation model. Using both objective\nmetrics and subjective MUSHRA listening tests, we show that our model achieves\nexcellent performance in terms of separation, both with or without transcript\nconditioning. We also measure the automatic speech recognition (ASR)\nperformance and provide audio samples of speech synthesis to demonstrate the\nadditional utility of our model.\n","authors":["Hakan Erdogan","Scott Wisdom","Xuankai Chang","Zalán Borsos","Marco Tagliasacchi","Neil Zeghidour","John R. Hershey"],"pdf_url":"https://arxiv.org/pdf/2308.10415v1.pdf","comment":"INTERSPEECH 2023, project webpage with audio demos at\n https://google-research.github.io/sound-separation/papers/tokensplit"},{"id":"http://arxiv.org/abs/2211.15046v5","updated":"2023-08-21T01:50:49Z","published":"2022-11-28T04:08:55Z","title":"PCT-CycleGAN: Paired Complementary Temporal Cycle-Consistent Adversarial\n Networks for Radar-Based Precipitation Nowcasting","summary":" The precipitation nowcasting methods have been elaborated over the centuries\nbecause rain has a crucial impact on human life. Not only quantitative\nprecipitation forecast (QPF) models and convolutional long short-term memory\n(ConvLSTM), but also various sophisticated methods such as the latest MetNet-2\nare emerging. In this paper, we propose a paired complementary temporal\ncycle-consistent adversarial networks (PCT-CycleGAN) for radar-based\nprecipitation nowcasting, inspired by cycle-consistent adversarial networks\n(CycleGAN), which shows strong performance in image-to-image translation.\nPCT-CycleGAN generates temporal causality using two generator networks with\nforward and backward temporal dynamics in paired complementary cycles. Each\ngenerator network learns a huge number of one-to-one mappings about\ntime-dependent radar-based precipitation data to approximate a mapping function\nrepresenting the temporal dynamics in each direction. To create robust temporal\ncausality between paired complementary cycles, novel connection loss is\nproposed. And torrential loss to cover exceptional heavy rain events is also\nproposed. The generator network learning forward temporal dynamics in\nPCT-CycleGAN generates radar-based precipitation data 10 minutes from the\ncurrent time. 
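The STAEformer entry above (arXiv:2308.10425) only names the spatio-temporal adaptive embedding, so the following is a guess at what such a component could look like: a learnable tensor indexed by time step and node, concatenated with the projected input before a vanilla Transformer encoder. Shapes, dimensions, and the concatenation choice are assumptions, not the paper's specification.

```python
import torch, torch.nn as nn

class AdaptiveEmbeddingEncoder(nn.Module):
    """Guess at a spatio-temporal adaptive embedding: a trainable (step, node)
    indexed tensor concatenated with the projected input, then a vanilla
    Transformer encoder applied over the time axis for each node."""
    def __init__(self, steps, nodes, in_dim=1, d_in=24, d_adp=40):
        super().__init__()
        self.proj = nn.Linear(in_dim, d_in)
        self.adaptive = nn.Parameter(torch.empty(steps, nodes, d_adp))
        nn.init.xavier_uniform_(self.adaptive)
        d_model = d_in + d_adp
        layer = nn.TransformerEncoderLayer(d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.head = nn.Linear(d_model, 1)

    def forward(self, x):                      # x: (batch, steps, nodes, in_dim)
        b, t, n, _ = x.shape
        h = torch.cat([self.proj(x), self.adaptive.expand(b, -1, -1, -1)], dim=-1)
        h = h.permute(0, 2, 1, 3).reshape(b * n, t, -1)  # attend over time per node
        h = self.encoder(h)
        return self.head(h).reshape(b, n, t, 1).permute(0, 2, 1, 3)

model = AdaptiveEmbeddingEncoder(steps=12, nodes=20)
print(model(torch.randn(2, 12, 20, 1)).shape)  # -> torch.Size([2, 12, 20, 1])
```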
Also, it provides a reliable prediction of up to 2 hours with\niterative forecasting. The superiority of PCT-CycleGAN is demonstrated through\nqualitative and quantitative comparisons with several previous methods.\n","authors":["Jaeho Choi","Yura Kim","Kwang-Ho Kim","Sung-Hwa Jung","Ikhyun Cho"],"pdf_url":"https://arxiv.org/pdf/2211.15046v5.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2308.10407v1","updated":"2023-08-21T01:21:21Z","published":"2023-08-21T01:21:21Z","title":"Federated Learning for Connected and Automated Vehicles: A Survey of\n Existing Approaches and Challenges","summary":" Machine learning (ML) is widely used for key tasks in Connected and Automated\nVehicles (CAV), including perception, planning, and control. However, its\nreliance on vehicular data for model training presents significant challenges\nrelated to in-vehicle user privacy and communication overhead generated by\nmassive data volumes. Federated learning (FL) is a decentralized ML approach\nthat enables multiple vehicles to collaboratively develop models, broadening\nlearning from various driving environments, enhancing overall performance, and\nsimultaneously securing local vehicle data privacy and security. This survey\npaper presents a review of the advancements made in the application of FL for\nCAV (FL4CAV). First, centralized and decentralized frameworks of FL are\nanalyzed, highlighting their key characteristics and methodologies. Second,\ndiverse data sources, models, and data security techniques relevant to FL in\nCAVs are reviewed, emphasizing their significance in ensuring privacy and\nconfidentiality. Third, specific and important applications of FL are explored,\nproviding insight into the base models and datasets employed for each\napplication. Finally, existing challenges for FL4CAV are listed and potential\ndirections for future work are discussed to further enhance the effectiveness\nand efficiency of FL in the context of CAV.\n","authors":["Vishnu Pandi Chellapandi","Liangqi Yuan","Christopher G. Brinton","Stanislaw H Zak","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.08786v3","updated":"2023-08-21T01:14:25Z","published":"2021-02-17T14:28:41Z","title":"Walking Out of the Weisfeiler Leman Hierarchy: Graph Learning Beyond\n Message Passing","summary":" We propose CRaWl, a novel neural network architecture for graph learning.\nLike graph neural networks, CRaWl layers update node features on a graph and\nthus can freely be combined or interleaved with GNN layers. Yet CRaWl operates\nfundamentally different from message passing graph neural networks. CRaWl\nlayers extract and aggregate information on subgraphs appearing along random\nwalks through a graph using 1D Convolutions. Thereby it detects long range\ninteractions and computes non-local features. As the theoretical basis for our\napproach, we prove a theorem stating that the expressiveness of CRaWl is\nincomparable with that of the Weisfeiler Leman algorithm and hence with graph\nneural networks. That is, there are functions expressible by CRaWl, but not by\nGNNs and vice versa. This result extends to higher levels of the Weisfeiler\nLeman hierarchy and thus to higher-order GNNs. 
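The CRaWl entry above extracts features with 1D convolutions along random walks. Below is a toy sketch of that idea on a small cycle graph: sample walks, stack node features along each walk, convolve, and average the results back onto the visited nodes; walk length, feature dimensions, and the pooling step are assumptions rather than the paper's architecture.

```python
import torch, torch.nn as nn

def random_walks(adj, length, walks_per_node, rng):
    """Sample simple random walks (node index sequences) from a dense adjacency."""
    n = adj.shape[0]
    neighbors = [torch.nonzero(adj[i]).flatten() for i in range(n)]
    walks = [torch.arange(n).repeat(walks_per_node)]
    for _ in range(length - 1):
        nxt = torch.stack([
            neighbors[int(v)][torch.randint(len(neighbors[int(v)]), (1,), generator=rng)]
            for v in walks[-1]]).flatten()
        walks.append(nxt)
    return torch.stack(walks, dim=1)           # (num_walks, length)

# toy graph: a cycle of 6 nodes with 3-dimensional node features
n, d = 6, 3
adj = torch.zeros(n, n)
for i in range(n):
    adj[i, (i + 1) % n] = adj[i, (i - 1) % n] = 1
feats = torch.randn(n, d)
rng = torch.Generator().manual_seed(0)
walks = random_walks(adj, length=8, walks_per_node=4, rng=rng)

conv = nn.Conv1d(d, 16, kernel_size=3, padding=1)    # 1D conv along each walk
walk_feats = feats[walks].transpose(1, 2)             # (num_walks, d, length)
h = conv(walk_feats)                                   # (num_walks, 16, length)
# scatter-mean the walk features back onto the nodes they visited
node_h = torch.zeros(n, 16).index_add_(0, walks.flatten(),
                                       h.transpose(1, 2).reshape(-1, 16))
node_h /= torch.bincount(walks.flatten(), minlength=n).clamp(min=1).unsqueeze(1)
print(node_h.shape)  # -> torch.Size([6, 16])
```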
Empirically, we show that CRaWl\nmatches state-of-the-art GNN architectures across a multitude of benchmark\ndatasets for classification and regression on graphs.\n","authors":["Jan Tönshoff","Martin Ritzert","Hinrikus Wolf","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2102.08786v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01792v2","updated":"2023-08-21T01:07:53Z","published":"2023-06-01T08:10:03Z","title":"Task Relation-aware Continual User Representation Learning","summary":" User modeling, which learns to represent users into a low-dimensional\nrepresentation space based on their past behaviors, got a surge of interest\nfrom the industry for providing personalized services to users. Previous\nefforts in user modeling mainly focus on learning a task-specific user\nrepresentation that is designed for a single task. However, since learning\ntask-specific user representations for every task is infeasible, recent studies\nintroduce the concept of universal user representation, which is a more\ngeneralized representation of a user that is relevant to a variety of tasks.\nDespite their effectiveness, existing approaches for learning universal user\nrepresentations are impractical in real-world applications due to the data\nrequirement, catastrophic forgetting and the limited learning capability for\ncontinually added tasks. In this paper, we propose a novel continual user\nrepresentation learning method, called TERACON, whose learning capability is\nnot limited as the number of learned tasks increases while capturing the\nrelationship between the tasks. The main idea is to introduce an embedding for\neach task, i.e., task embedding, which is utilized to generate task-specific\nsoft masks that not only allow the entire model parameters to be updated until\nthe end of training sequence, but also facilitate the relationship between the\ntasks to be captured. Moreover, we introduce a novel knowledge retention module\nwith pseudo-labeling strategy that successfully alleviates the long-standing\nproblem of continual learning, i.e., catastrophic forgetting. Extensive\nexperiments on public and proprietary real-world datasets demonstrate the\nsuperiority and practicality of TERACON. Our code is available at\nhttps://github.com/Sein-Kim/TERACON.\n","authors":["Sein Kim","Namkyeong Lee","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2306.01792v2.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2302.11068v2","updated":"2023-08-21T01:07:47Z","published":"2023-02-21T23:49:36Z","title":"Low Rank Matrix Completion via Robust Alternating Minimization in Nearly\n Linear Time","summary":" Given a matrix $M\\in \\mathbb{R}^{m\\times n}$, the low rank matrix completion\nproblem asks us to find a rank-$k$ approximation of $M$ as $UV^\\top$ for $U\\in\n\\mathbb{R}^{m\\times k}$ and $V\\in \\mathbb{R}^{n\\times k}$ by only observing a\nfew entries specified by a set of entries $\\Omega\\subseteq [m]\\times [n]$. In\nparticular, we examine an approach that is widely used in practice -- the\nalternating minimization framework. Jain, Netrapalli and Sanghavi~\\cite{jns13}\nshowed that if $M$ has incoherent rows and columns, then alternating\nminimization provably recovers the matrix $M$ by observing a nearly linear in\n$n$ number of entries. While the sample complexity has been subsequently\nimproved~\\cite{glz17}, alternating minimization steps are required to be\ncomputed exactly. 
This hinders the development of more efficient algorithms and\nfails to depict the practical implementation of alternating minimization, where\nthe updates are usually performed approximately in favor of efficiency.\n In this paper, we take a major step towards a more efficient and error-robust\nalternating minimization framework. To this end, we develop an analytical\nframework for alternating minimization that can tolerate a moderate amount of\nerrors caused by approximate updates. Moreover, our algorithm runs in time\n$\\widetilde O(|\\Omega| k)$, which is nearly linear in the time to verify the\nsolution while preserving the sample complexity. This improves upon all prior\nknown alternating minimization approaches which require $\\widetilde O(|\\Omega|\nk^2)$ time.\n","authors":["Yuzhou Gu","Zhao Song","Junze Yin","Lichen Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.11068v2.pdf","comment":"Improve the runtime from $O(mnk)$ to $O(|\\Omega| k)$"},{"id":"http://arxiv.org/abs/2011.13772v5","updated":"2023-08-21T00:53:49Z","published":"2020-11-27T15:08:34Z","title":"Gradient Descent for Deep Matrix Factorization: Dynamics and Implicit\n Bias towards Low Rank","summary":" In deep learning, it is common to use more network parameters than training\npoints. In such scenario of over-parameterization, there are usually multiple\nnetworks that achieve zero training error so that the training algorithm induces\nan implicit bias on the computed solution. In practice, (stochastic)\ngradient descent tends to prefer solutions which generalize well, which provides\na possible explanation of the success of deep learning. In this paper we analyze\nthe dynamics of gradient descent in the simplified setting of linear networks\nand of an estimation problem. Although we are not in an\noverparameterized scenario, our analysis nevertheless provides insights into the\nphenomenon of implicit bias. In fact, we derive a rigorous analysis of the\ndynamics of vanilla gradient descent, and characterize the dynamical convergence\nof the spectrum. We are able to accurately locate time intervals where the\neffective rank of the iterates is close to the effective rank of a low-rank\nprojection of the ground-truth matrix. In practice, those intervals can be used\nas criteria for early stopping if a certain regularity is desired. We also\nprovide empirical evidence for implicit bias in more general scenarios, such as\nmatrix sensing and random initialization. This suggests that deep learning\nprefers trajectories whose complexity (measured in terms of effective rank) is\nmonotonically increasing, which we believe is a fundamental concept for\nthe theoretical understanding of deep learning.\n","authors":["Hung-Hsu Chou","Carsten Gieshoff","Johannes Maly","Holger Rauhut"],"pdf_url":"https://arxiv.org/pdf/2011.13772v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09450v2","updated":"2023-08-21T00:36:44Z","published":"2022-11-28T06:43:32Z","title":"Accelerating Antimicrobial Peptide Discovery with Latent Structure","summary":" Antimicrobial peptides (AMPs) are promising therapeutic approaches against\ndrug-resistant pathogens. Recently, deep generative models are used to discover\nnew AMPs. However, previous studies mainly focus on peptide sequence attributes\nand do not consider crucial structure information. In this paper, we propose a\nlatent sequence-structure model for designing AMPs (LSSAMP). LSSAMP exploits\nmulti-scale vector quantization in the latent space to represent secondary\nstructures (e.g. alpha helix and beta sheet). 
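The matrix completion entry above analyzes the alternating minimization framework. Below is a dense, toy-scale sketch of the classical exact-update variant that the paper relaxes: U and V are updated in turn by solving small least-squares problems over the observed entries; the rank, iteration count, and ridge term are assumptions.

```python
import numpy as np

def altmin_complete(M, mask, k=2, iters=30, lam=1e-6):
    """Alternating minimization for low-rank completion: fix V and solve a ridge
    least-squares problem for each row of U over that row's observed entries,
    then do the same for V. Exact updates, dense toy scale only."""
    m, n = M.shape
    rng = np.random.default_rng(0)
    U, V = rng.standard_normal((m, k)), rng.standard_normal((n, k))
    for _ in range(iters):
        for i in range(m):
            cols = np.where(mask[i])[0]
            A, b = V[cols], M[i, cols]
            U[i] = np.linalg.solve(A.T @ A + lam * np.eye(k), A.T @ b)
        for j in range(n):
            rows = np.where(mask[:, j])[0]
            A, b = U[rows], M[rows, j]
            V[j] = np.linalg.solve(A.T @ A + lam * np.eye(k), A.T @ b)
    return U @ V.T

# toy test: recover a random rank-2 matrix from roughly 60% of its entries
rng = np.random.default_rng(1)
truth = rng.standard_normal((30, 2)) @ rng.standard_normal((2, 20))
mask = rng.random(truth.shape) < 0.6
M_hat = altmin_complete(truth * mask, mask)
err = np.linalg.norm(M_hat - truth) / np.linalg.norm(truth)
print(f"relative error: {err:.3f}")  # small when enough entries are observed
```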
By sampling in the latent space,\nLSSAMP can simultaneously generate peptides with ideal sequence attributes and\nsecondary structures. Experimental results show that the peptides generated by\nLSSAMP have a high probability of antimicrobial activity. Our wet laboratory\nexperiments verified that two of the 21 candidates exhibit strong antimicrobial\nactivity. The code is released at https://github.com/dqwang122/LSSAMP.\n","authors":["Danqing Wang","Zeyu Wen","Fei Ye","Lei Li","Hao Zhou"],"pdf_url":"https://arxiv.org/pdf/2212.09450v2.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2308.10396v1","updated":"2023-08-21T00:22:32Z","published":"2023-08-21T00:22:32Z","title":"Label Selection Approach to Learning from Crowds","summary":" Supervised learning, especially supervised deep learning, requires large\namounts of labeled data. One approach to collect large amounts of labeled data\nis by using a crowdsourcing platform where numerous workers perform the\nannotation tasks. However, the annotation results often contain label noise, as\nthe annotation skills vary depending on the crowd workers and their ability to\ncomplete the task correctly. Learning from Crowds is a framework which directly\ntrains the models using noisy labeled data from crowd workers. In this study,\nwe propose a novel Learning from Crowds model, inspired by SelectiveNet\nproposed for the selective prediction problem. The proposed method called Label\nSelection Layer trains a prediction model by automatically determining whether\nto use a worker's label for training using a selector network. A major\nadvantage of the proposed method is that it can be applied to almost all\nvariants of supervised learning problems by simply adding a selector network\nand changing the objective function for existing models, without explicitly\nassuming a model of the noise in crowd annotations. The experimental results\nshow that the performance of the proposed method is almost equivalent to or\nbetter than the Crowd Layer, which is one of the state-of-the-art methods for\nDeep Learning from Crowds, except for the regression problem case.\n","authors":["Kosuke Yoshimura","Hisashi Kashima"],"pdf_url":"https://arxiv.org/pdf/2308.10396v1.pdf","comment":"15 pages, 1 figure"},{"id":"http://arxiv.org/abs/2210.09107v2","updated":"2023-08-21T23:49:48Z","published":"2022-10-17T13:57:07Z","title":"ISEE.U: Distributed online active target localization with unpredictable\n targets","summary":" This paper addresses target localization with an online active learning\nalgorithm defined by distributed, simple and fast computations at each node,\nwith no parameters to tune and where the estimate of the target position at\neach agent is asymptotically equal in expectation to the centralized\nmaximum-likelihood estimator. ISEE.U takes noisy distances at each agent and\nfinds a control that maximizes localization accuracy. We do not assume specific\ntarget dynamics and, thus, our method is robust when facing unpredictable\ntargets. Each agent computes the control that maximizes overall target position\naccuracy via a local estimate of the Fisher Information Matrix. 
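The learning-from-crowds entry above adds a selector network that decides whether each worker's label enters the training loss. Below is a hedged sketch of one way such a layer could be wired up, with a coverage-style penalty in the spirit of SelectiveNet; the architecture, the missing-label convention, and the penalty weight are assumptions, not the paper's exact objective.

```python
import torch, torch.nn as nn

class LabelSelectionModel(nn.Module):
    """Sketch: a shared predictor plus a selector head that scores, per example
    and per worker, whether that worker's label should contribute to the loss."""
    def __init__(self, in_dim, n_classes, n_workers):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(in_dim, 64), nn.ReLU())
        self.classifier = nn.Linear(64, n_classes)
        self.selector = nn.Linear(64, n_workers)   # one selection score per worker

    def forward(self, x):
        h = self.backbone(x)
        return self.classifier(h), torch.sigmoid(self.selector(h))

def crowd_loss(logits, select, worker_labels, coverage=0.7):
    """Per-worker cross-entropy weighted by the selector, plus a penalty keeping
    the average selection rate near a target coverage (an assumption here)."""
    b, w = worker_labels.shape                    # -1 marks a missing annotation
    ce = nn.functional.cross_entropy(
        logits.unsqueeze(1).expand(-1, w, -1).reshape(b * w, -1),
        worker_labels.clamp(min=0).reshape(b * w), reduction="none").reshape(b, w)
    present = (worker_labels >= 0).float()
    weighted = (select * present * ce).sum() / (select * present).sum().clamp(min=1e-6)
    return weighted + 10.0 * (select[present.bool()].mean() - coverage) ** 2

model = LabelSelectionModel(in_dim=10, n_classes=3, n_workers=5)
x = torch.randn(8, 10)
worker_labels = torch.randint(-1, 3, (8, 5))
logits, select = model(x)
loss = crowd_loss(logits, select, worker_labels)
loss.backward()
print(float(loss))
```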
We compared the\nproposed method with a state of the art algorithm outperforming it when the\ntarget movements do not follow a prescribed trajectory, with x100 less\ncomputation time, even when our method is running in one central CPU.\n","authors":["Miguel Vasques","Claudia Soares","João Gomes"],"pdf_url":"https://arxiv.org/pdf/2210.09107v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11080v1","updated":"2023-08-21T23:28:26Z","published":"2023-08-21T23:28:26Z","title":"Stress representations for tensor basis neural networks: alternative\n formulations to Finger-Rivlin-Ericksen","summary":" Data-driven constitutive modeling frameworks based on neural networks and\nclassical representation theorems have recently gained considerable attention\ndue to their ability to easily incorporate constitutive constraints and their\nexcellent generalization performance. In these models, the stress prediction\nfollows from a linear combination of invariant-dependent coefficient functions\nand known tensor basis generators. However, thus far the formulations have been\nlimited to stress representations based on the classical Rivlin and Ericksen\nform, while the performance of alternative representations has yet to be\ninvestigated. In this work, we survey a variety of tensor basis neural network\nmodels for modeling hyperelastic materials in a finite deformation context,\nincluding a number of so far unexplored formulations which use theoretically\nequivalent invariants and generators to Finger-Rivlin-Ericksen. Furthermore, we\ncompare potential-based and coefficient-based approaches, as well as different\ncalibration techniques. Nine variants are tested against both noisy and\nnoiseless datasets for three different materials. Theoretical and practical\ninsights into the performance of each formulation are given.\n","authors":["Jan N. Fuhg","Nikolaos Bouklas","Reese E. Jones"],"pdf_url":"https://arxiv.org/pdf/2308.11080v1.pdf","comment":"32 pages, 20 figures, 4 appendices"},{"id":"http://arxiv.org/abs/2308.11079v1","updated":"2023-08-21T23:16:58Z","published":"2023-08-21T23:16:58Z","title":"Long-Term Prediction of Natural Video Sequences with Robust Video\n Predictors","summary":" Predicting high dimensional video sequences is a curiously difficult problem.\nThe number of possible futures for a given video sequence grows exponentially\nover time due to uncertainty. This is especially evident when trying to predict\ncomplicated natural video scenes from a limited snapshot of the world. The\ninherent uncertainty accumulates the further into the future you predict making\nlong-term prediction very difficult. In this work we introduce a number of\nimprovements to existing work that aid in creating Robust Video Predictors\n(RoViPs). We show that with a combination of deep Perceptual and\nuncertainty-based reconstruction losses we are able to create high quality\nshort-term predictions. Attention-based skip connections are utilised to allow\nfor long range spatial movement of input features to further improve\nperformance. 
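The ISEE.U entry above has each agent choose its control via a local estimate of the Fisher Information Matrix for noisy range measurements. Below is a sketch of the standard range-only FIM (scaled outer products of unit bearing vectors) used to compare two candidate agent placements by D-optimality; the geometry and noise level are made up for illustration.

```python
import numpy as np

def range_fim(agent_positions, target, sigma=0.5):
    """Fisher Information Matrix for independent Gaussian range measurements:
    FIM = (1/sigma^2) * sum_i u_i u_i^T, with u_i the unit vector target->agent."""
    fim = np.zeros((2, 2))
    for p in agent_positions:
        diff = np.asarray(p, float) - target
        u = diff / np.linalg.norm(diff)
        fim += np.outer(u, u) / sigma**2
    return fim

target = np.array([0.0, 0.0])
current = [np.array([3.0, 0.1]), np.array([2.9, -0.1])]   # nearly collinear agents
candidate_moves = {"stay close together": np.array([3.0, 0.3]),
                   "spread out":          np.array([0.2, 3.0])}
for name, new_pos in candidate_moves.items():
    info = range_fim(current[:1] + [new_pos], target)
    # D-optimality: a larger determinant means a better-conditioned estimate
    print(f"{name:>22}: log det FIM = {np.log(np.linalg.det(info)):.2f}")
```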
Finally, we show that by simply making the predictor robust to its\nown prediction errors, it is possible to produce very long, realistic natural\nvideo sequences using an iterated single-step prediction task.\n","authors":["Luke Ditria","Tom Drummond"],"pdf_url":"https://arxiv.org/pdf/2308.11079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11075v1","updated":"2023-08-21T22:50:54Z","published":"2023-08-21T22:50:54Z","title":"A Deep Dive into the Connections Between the Renormalization Group and\n Deep Learning in the Ising Model","summary":" The renormalization group (RG) is an essential technique in statistical\nphysics and quantum field theory, which considers scale-invariant properties of\nphysical theories and how these theories' parameters change with scaling. Deep\nlearning is a powerful computational technique that uses multi-layered neural\nnetworks to solve a myriad of complicated problems. Previous research suggests\nthe possibility that unsupervised deep learning may be a form of RG flow, by\nbeing a layer-by-layer coarse graining of the original data. We examined this\nconnection on a more rigorous basis for the simple example of Kadanoff block\nrenormalization of the 2D nearest-neighbor Ising model, with our deep learning\naccomplished via Restricted Boltzmann Machines (RBMs). We developed extensive\nrenormalization techniques for the 1D and 2D Ising model to provide a baseline\nfor comparison. For the 1D Ising model, we successfully used Adam optimization\non a correlation length loss function to learn the group flow, yielding results\nconsistent with the analytical model for infinite N. For the 2D Ising model, we\nsuccessfully generated Ising model samples using the Wolff algorithm, and\nperformed the group flow using a quasi-deterministic method, validating these\nresults by calculating the critical exponent \\nu. We then examined RBM learning\nof the Ising model layer by layer, finding a blocking structure in the learning\nthat is qualitatively similar to RG. Lastly, we directly compared the weights\nof each layer from the learning to Ising spin renormalization, but found\nquantitative inconsistencies for the simple case of nearest-neighbor Ising\nmodels.\n","authors":["Kelsie Taylor"],"pdf_url":"https://arxiv.org/pdf/2308.11075v1.pdf","comment":"103 pages, 87 figures, Senior Thesis, Advisors: Maria Spiropulu and\n Joseph Lykken"},{"id":"http://arxiv.org/abs/2210.16371v3","updated":"2023-08-21T22:45:47Z","published":"2022-10-28T19:14:03Z","title":"Distributed Black-box Attack against Image Classification Cloud Services","summary":" Black-box adversarial attacks can fool image classifiers into misclassifying\nimages without requiring access to model structure and weights. Recent studies\nhave reported attack success rates of over 95% with less than 1,000 queries.\nThe question then arises of whether black-box attacks have become a real threat\nagainst IoT devices that rely on cloud APIs to achieve image classification. To\nshed some light on this, note that prior research has primarily focused on\nincreasing the success rate and reducing the number of queries. However,\nanother crucial factor for black-box attacks against cloud APIs is the time\nrequired to perform the attack. This paper applies black-box attacks directly\nto cloud APIs rather than to local models, thereby avoiding mistakes made in\nprior research that applied the perturbation before image encoding and\npre-processing. 
Further, we exploit load balancing to enable distributed\nblack-box attacks that can reduce the attack time by a factor of about five for\nboth local search and gradient estimation methods.\n","authors":["Han Wu","Sareh Rowlands","Johan Wahlstrom"],"pdf_url":"https://arxiv.org/pdf/2210.16371v3.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.11071v1","updated":"2023-08-21T22:40:36Z","published":"2023-08-21T22:40:36Z","title":"Neural Amortized Inference for Nested Multi-agent Reasoning","summary":" Multi-agent interactions, such as communication, teaching, and bluffing,\noften rely on higher-order social inference, i.e., understanding how others\ninfer oneself. Such intricate reasoning can be effectively modeled through\nnested multi-agent reasoning. Nonetheless, the computational complexity\nescalates exponentially with each level of reasoning, posing a significant\nchallenge. However, humans effortlessly perform complex social inferences as\npart of their daily lives. To bridge the gap between human-like inference\ncapabilities and computational limitations, we propose a novel approach:\nleveraging neural networks to amortize high-order social inference, thereby\nexpediting nested multi-agent reasoning. We evaluate our method in two\nchallenging multi-agent interaction domains. The experimental results\ndemonstrate that our method is computationally efficient while exhibiting\nminimal degradation in accuracy.\n","authors":["Kunal Jha","Tuan Anh Le","Chuanyang Jin","Yen-Ling Kuo","Joshua B. Tenenbaum","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2308.11071v1.pdf","comment":"8 pages, 10 figures"},{"id":"http://arxiv.org/abs/2203.03604v2","updated":"2023-08-21T22:31:18Z","published":"2022-03-07T18:55:20Z","title":"Differential Privacy Amplification in Quantum and Quantum-inspired\n Algorithms","summary":" Differential privacy provides a theoretical framework for processing a\ndataset about $n$ users, in a way that the output reveals a minimal information\nabout any single user. Such notion of privacy is usually ensured by\nnoise-adding mechanisms and amplified by several processes, including\nsubsampling, shuffling, iteration, mixing and diffusion. In this work, we\nprovide privacy amplification bounds for quantum and quantum-inspired\nalgorithms. In particular, we show for the first time, that algorithms running\non quantum encoding of a classical dataset or the outcomes of quantum-inspired\nclassical sampling, amplify differential privacy. Moreover, we prove that a\nquantum version of differential privacy is amplified by the composition of\nquantum channels, provided that they satisfy some mixing conditions.\n","authors":["Armando Angrisani","Mina Doosti","Elham Kashefi"],"pdf_url":"https://arxiv.org/pdf/2203.03604v2.pdf","comment":"This article is superseded by arXiv:2307.04733"},{"id":"http://arxiv.org/abs/2308.11068v1","updated":"2023-08-21T22:26:21Z","published":"2023-08-21T22:26:21Z","title":"Topological Graph Signal Compression","summary":" Recently emerged Topological Deep Learning (TDL) methods aim to extend\ncurrent Graph Neural Networks (GNN) by naturally processing higher-order\ninteractions, going beyond the pairwise relations and local neighborhoods\ndefined by graph representations. 
In this paper we propose a novel TDL-based\nmethod for compressing signals over graphs, consisting in two main steps:\nfirst, disjoint sets of higher-order structures are inferred based on the\noriginal signal --by clustering $N$ datapoints into $K\\ll N$ collections; then,\na topological-inspired message passing gets a compressed representation of the\nsignal within those multi-element sets. Our results show that our framework\nimproves both standard GNN and feed-forward architectures in compressing\ntemporal link-based signals from two real-word Internet Service Provider\nNetworks' datasets --from $30\\%$ up to $90\\%$ better reconstruction errors\nacross all evaluation scenarios--, suggesting that it better captures and\nexploits spatial and temporal correlations over the whole graph-based network\nstructure.\n","authors":["Guillermo Bernárdez","Lev Telyatnikov","Eduard Alarcón","Albert Cabellos-Aparicio","Pere Barlet-Ros","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2308.11068v1.pdf","comment":"9 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2203.03591v2","updated":"2023-08-21T22:25:13Z","published":"2022-03-07T18:38:02Z","title":"Quantum Local Differential Privacy and Quantum Statistical Query Model","summary":" Quantum statistical queries provide a theoretical framework for investigating\nthe computational power of a learner with limited quantum resources. This model\nis particularly relevant in the current context, where available quantum\ndevices are subject to severe noise and have limited quantum memory. On the\nother hand, the framework of quantum differential privacy demonstrates that\nnoise can, in some cases, benefit the computation, enhancing robustness and\nstatistical security. In this work, we establish an equivalence between quantum\nstatistical queries and quantum differential privacy in the local model,\nextending a celebrated classical result to the quantum setting. Furthermore, we\nderive strong data processing inequalities for the quantum relative entropy\nunder local differential privacy and apply this result to the task of\nasymmetric hypothesis testing with restricted measurements. Finally, we\nconsider the task of quantum multi-party computation under local differential\nprivacy. As a proof of principle, we demonstrate that the parity function is\nefficiently learnable in this model, whereas the corresponding classical task\nrequires exponentially many samples.\n","authors":["Armando Angrisani","Elham Kashefi"],"pdf_url":"https://arxiv.org/pdf/2203.03591v2.pdf","comment":"This version significantly extends the previous one with new entropic\n inequalities under local privacy, a private version of the quantum Stein's\n Lemma and an application to private multi-party quantum computation"},{"id":"http://arxiv.org/abs/2308.11062v1","updated":"2023-08-21T22:15:20Z","published":"2023-08-21T22:15:20Z","title":"UnLoc: A Unified Framework for Video Localization Tasks","summary":" While large-scale image-text pretrained models such as CLIP have been used\nfor multiple video-level tasks on trimmed videos, their use for temporal\nlocalization in untrimmed videos is still a relatively unexplored task. We\ndesign a new approach for this called UnLoc, which uses pretrained image and\ntext towers, and feeds tokens to a video-text fusion model. The output of the\nfusion module are then used to construct a feature pyramid in which each level\nconnects to a head to predict a per-frame relevancy score and start/end time\ndisplacements. 
Unlike previous works, our architecture enables Moment\nRetrieval, Temporal Localization, and Action Segmentation with a single stage\nmodel, without the need for action proposals, motion based pretrained features\nor representation masking. Unlike specialized models, we achieve state of the\nart results on all three different localization tasks with a unified approach.\nCode will be available at: \\url{https://github.com/google-research/scenic}.\n","authors":["Shen Yan","Xuehan Xiong","Arsha Nagrani","Anurag Arnab","Zhonghao Wang","Weina Ge","David Ross","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2308.11062v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11053v1","updated":"2023-08-21T21:36:56Z","published":"2023-08-21T21:36:56Z","title":"Ultra Dual-Path Compression For Joint Echo Cancellation And Noise\n Suppression","summary":" Echo cancellation and noise reduction are essential for full-duplex\ncommunication, yet most existing neural networks have high computational costs\nand are inflexible in tuning model complexity. In this paper, we introduce\ntime-frequency dual-path compression to achieve a wide range of compression\nratios on computational cost. Specifically, for frequency compression,\ntrainable filters are used to replace manually designed filters for dimension\nreduction. For time compression, only using frame skipped prediction causes\nlarge performance degradation, which can be alleviated by a post-processing\nnetwork with full sequence modeling. We have found that under fixed compression\nratios, dual-path compression combining both the time and frequency methods\nwill give further performance improvement, covering compression ratios from 4x\nto 32x with little model size change. Moreover, the proposed models show\ncompetitive performance compared with fast FullSubNet and DeepFilterNet. A demo\npage can be found at\nhangtingchen.github.io/ultra_dual_path_compression.github.io/.\n","authors":["Hangting Chen","Jianwei Yu","Yi Luo","Rongzhi Gu","Weihua Li","Zhuocheng Lu","Chao Weng"],"pdf_url":"https://arxiv.org/pdf/2308.11053v1.pdf","comment":"Accepted by Interspeech 2023"},{"id":"http://arxiv.org/abs/2304.14633v2","updated":"2023-08-21T21:15:49Z","published":"2023-04-28T05:30:19Z","title":"CVRecon: Rethinking 3D Geometric Feature Learning For Neural\n Reconstruction","summary":" Recent advances in neural reconstruction using posed image sequences have\nmade remarkable progress. However, due to the lack of depth information,\nexisting volumetric-based techniques simply duplicate 2D image features of the\nobject surface along the entire camera ray. We contend this duplication\nintroduces noise in empty and occluded spaces, posing challenges for producing\nhigh-quality 3D geometry. Drawing inspiration from traditional multi-view\nstereo methods, we propose an end-to-end 3D neural reconstruction framework\nCVRecon, designed to exploit the rich geometric embedding in the cost volumes\nto facilitate 3D geometric feature learning. Furthermore, we present\nRay-contextual Compensated Cost Volume (RCCV), a novel 3D geometric feature\nrepresentation that encodes view-dependent information with improved integrity\nand robustness. Through comprehensive experiments, we demonstrate that our\napproach significantly improves the reconstruction quality in various metrics\nand recovers clear fine details of the 3D geometries. Our extensive ablation\nstudies provide insights into the development of effective 3D geometric feature\nlearning schemes. 
Project page: https://cvrecon.ziyue.cool/\n","authors":["Ziyue Feng","Leon Yang","Pengsheng Guo","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2304.14633v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11047v1","updated":"2023-08-21T21:13:30Z","published":"2023-08-21T21:13:30Z","title":"Harmonization Across Imaging Locations(HAIL): One-Shot Learning for\n Brain MRI","summary":" For machine learning-based prognosis and diagnosis of rare diseases, such as\npediatric brain tumors, it is necessary to gather medical imaging data from\nmultiple clinical sites that may use different devices and protocols. Deep\nlearning-driven harmonization of radiologic images relies on generative\nadversarial networks (GANs). However, GANs notoriously generate pseudo\nstructures that do not exist in the original training data, a phenomenon known\nas \"hallucination\". To prevent hallucination in medical imaging, such as\nmagnetic resonance images (MRI) of the brain, we propose a one-shot learning\nmethod where we utilize neural style transfer for harmonization. At test time,\nthe method uses one image from a clinical site to generate an image that\nmatches the intensity scale of the collaborating sites. Our approach combines\nlearning a feature extractor, neural style transfer, and adaptive instance\nnormalization. We further propose a novel strategy to evaluate the\neffectiveness of image harmonization approaches with evaluation metrics that\nboth measure image style harmonization and assess the preservation of\nanatomical structures. Experimental results demonstrate the effectiveness of\nour method in preserving patient anatomy while adjusting the image intensities\nto a new clinical site. Our general harmonization model can be used on unseen\ndata from new sites, making it a valuable tool for real-world medical\napplications and clinical trials.\n","authors":["Abhijeet Parida","Zhifan Jiang","Syed Muhammad Anwar","Nicholas Foreman","Nicholas Stence","Michael J. Fisher","Roger J. Packer","Robert A. Avery","Marius George Linguraru"],"pdf_url":"https://arxiv.org/pdf/2308.11047v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2210.09975v2","updated":"2023-08-21T21:10:21Z","published":"2022-10-18T16:38:32Z","title":"Risk of re-identification for shared clinical speech recordings","summary":" Large, curated datasets are required to leverage speech-based tools in\nhealthcare. These are costly to produce, resulting in increased interest in\ndata sharing. As speech can potentially identify speakers (i.e., voiceprints),\nsharing recordings raises privacy concerns. We examine the re-identification\nrisk for speech recordings, without reference to demographic or metadata, using\na state-of-the-art speaker recognition system. We demonstrate that the risk is\ninversely related to the number of comparisons an adversary must consider,\ni.e., the search space. Risk is high for a small search space but drops as the\nsearch space grows ($precision >0.85$ for $<1*10^{6}$ comparisons, $precision\n<0.5$ for $>3*10^{6}$ comparisons). Next, we show that the nature of a speech\nrecording influences re-identification risk, with non-connected speech (e.g.,\nvowel prolongation) being harder to identify. Our findings suggest that speaker\nrecognition systems can be used to re-identify participants in specific\ncircumstances, but in practice, the re-identification risk appears low.\n","authors":["Daniela A. Wiepert","Bradley A. Malin","Joseph R. Duffy","Rene L. Utianski","John L. Stricker","David T. 
Jones","Hugo Botha"],"pdf_url":"https://arxiv.org/pdf/2210.09975v2.pdf","comment":"24 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.11043v1","updated":"2023-08-21T21:06:36Z","published":"2023-08-21T21:06:36Z","title":"Spurious Correlations and Where to Find Them","summary":" Spurious correlations occur when a model learns unreliable features from the\ndata and are a well-known drawback of data-driven learning. Although there are\nseveral algorithms proposed to mitigate it, we are yet to jointly derive the\nindicators of spurious correlations. As a result, the solutions built upon\nstandalone hypotheses fail to beat simple ERM baselines. We collect some of the\ncommonly studied hypotheses behind the occurrence of spurious correlations and\ninvestigate their influence on standard ERM baselines using synthetic datasets\ngenerated from causal graphs. Subsequently, we observe patterns connecting\nthese hypotheses and model design choices.\n","authors":["Gautam Sreekumar","Vishnu Naresh Boddeti"],"pdf_url":"https://arxiv.org/pdf/2308.11043v1.pdf","comment":"2nd Workshop on SCIS, ICML 2023"},{"id":"http://arxiv.org/abs/2305.08559v2","updated":"2023-08-21T20:50:58Z","published":"2023-05-15T11:41:30Z","title":"Designing Discontinuities","summary":" Discontinuities can be fairly arbitrary but also cause a significant impact\non outcomes in social systems. Indeed, their arbitrariness is why they have\nbeen used to infer causal relationships among variables in numerous settings.\nRegression discontinuity from econometrics assumes the existence of a\ndiscontinuous variable that splits the population into distinct partitions to\nestimate the causal effects of a given phenomenon. Here we consider the design\nof partitions for a given discontinuous variable to optimize a certain effect\npreviously studied using regression discontinuity. To do so, we propose a\nquantization-theoretic approach to optimize the effect of interest, first\nlearning the causal effect size of a given discontinuous variable and then\napplying dynamic programming for optimal quantization design of discontinuities\nthat balance the gain and loss in the effect size. We also develop a\ncomputationally-efficient reinforcement learning algorithm for the dynamic\nprogramming formulation of optimal quantization. We demonstrate our approach by\ndesigning optimal time zone borders for counterfactuals of social capital,\nsocial mobility, and health. This is based on regression discontinuity analyses\nwe perform on novel data, which may be of independent empirical interest in\nshowing a causal relationship between sunset time and social capital.\n","authors":["Ibtihal Ferwana","Suyoung Park","Ting-Yi Wu","Lav R. Varshney"],"pdf_url":"https://arxiv.org/pdf/2305.08559v2.pdf","comment":"A short version is accepted in Neural Compression ICML Worksop July\n 19th, 2023"},{"id":"http://arxiv.org/abs/2308.11027v1","updated":"2023-08-21T20:30:51Z","published":"2023-08-21T20:30:51Z","title":"Split Learning for Distributed Collaborative Training of Deep Learning\n Models in Health Informatics","summary":" Deep learning continues to rapidly evolve and is now demonstrating remarkable\npotential for numerous medical prediction tasks. However, realizing deep\nlearning models that generalize across healthcare organizations is challenging.\nThis is due, in part, to the inherent siloed nature of these organizations and\npatient privacy requirements. 
To address this problem, we illustrate how split\nlearning can enable collaborative training of deep learning models across\ndisparate and privately maintained health datasets, while keeping the original\nrecords and model parameters private. We introduce a new privacy-preserving\ndistributed learning framework that offers a higher level of privacy compared\nto conventional federated learning. We use several biomedical imaging and\nelectronic health record (EHR) datasets to show that deep learning models\ntrained via split learning can achieve highly similar performance to their\ncentralized and federated counterparts while greatly improving computational\nefficiency and reducing privacy risks.\n","authors":["Zhuohang Li","Chao Yan","Xinmeng Zhang","Gharib Gharibi","Zhijun Yin","Xiaoqian Jiang","Bradley A. Malin"],"pdf_url":"https://arxiv.org/pdf/2308.11027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11022v1","updated":"2023-08-21T20:23:23Z","published":"2023-08-21T20:23:23Z","title":"Extreme Multilabel Classification for Specialist Doctor Recommendation\n with Implicit Feedback and Limited Patient Metadata","summary":" Recommendation Systems (RS) are often used to address the issue of medical\ndoctor referrals. However, these systems require access to patient feedback and\nmedical records, which may not always be available in real-world scenarios. Our\nresearch focuses on medical referrals and aims to predict recommendations in\ndifferent specialties of physicians for both new patients and those with a\nconsultation history. We use Extreme Multilabel Classification (XML), commonly\nemployed in text-based classification tasks, to encode available features and\nexplore different scenarios. While its potential for recommendation tasks has\noften been suggested, this has not been thoroughly explored in the literature.\nMotivated by the doctor referral case, we show how to recast a traditional\nrecommender setting into a multilabel classification problem that current XML\nmethods can solve. Further, we propose a unified model leveraging patient\nhistory across different specialties. Compared to state-of-the-art RS using the\nsame features, our approach consistently improves standard recommendation\nmetrics up to approximately $10\\%$ for patients with a previous consultation\nhistory. For new patients, XML proves better at exploiting available features,\noutperforming the benchmark in favorable scenarios, with particular emphasis on\nrecall metrics. Thus, our approach brings us one step closer to creating more\neffective and personalized doctor referral systems. Additionally, it highlights\nXML as a promising alternative to current hybrid or content-based RS, while\nidentifying key aspects to take into account when using XML for recommendation\ntasks.\n","authors":["Filipa Valdeira","Stevo Racković","Valeria Danalachi","Qiwei Han","Cláudia Soares"],"pdf_url":"https://arxiv.org/pdf/2308.11022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11021v1","updated":"2023-08-21T20:22:51Z","published":"2023-08-21T20:22:51Z","title":"Multi-Task Hypergraphs for Semi-supervised Learning using Earth\n Observations","summary":" There are many ways of interpreting the world and they are highly\ninterdependent. 
We exploit such complex dependencies and introduce a powerful\nmulti-task hypergraph, in which every node is a task and different paths\nthrough the hypergraph reaching a given task become unsupervised teachers, by\nforming ensembles that learn to generate reliable pseudolabels for that task.\nEach hyperedge is part of an ensemble teacher for a given task and it is also a\nstudent of the self-supervised hypergraph system. We apply our model to one of\nthe most important problems of our times, that of Earth Observation, which is\nhighly multi-task and it often suffers from missing ground-truth data. By\nperforming extensive experiments on the NASA NEO Dataset, spanning a period of\n22 years, we demonstrate the value of our multi-task semi-supervised approach,\nby consistent improvements over strong baselines and recent work. We also show\nthat the hypergraph can adapt unsupervised to gradual data distribution shifts\nand reliably recover, through its multi-task self-supervision process, the\nmissing data for several observational layers for up to seven years.\n","authors":["Mihai Pirvu","Alina Marcu","Alexandra Dobrescu","Nabil Belbachir","Marius Leordeanu"],"pdf_url":"https://arxiv.org/pdf/2308.11021v1.pdf","comment":"Accepted in ICCV 2023 Workshops"},{"id":"http://arxiv.org/abs/2308.11019v1","updated":"2023-08-21T20:15:35Z","published":"2023-08-21T20:15:35Z","title":"Instance-based Learning with Prototype Reduction for Real-Time\n Proportional Myocontrol: A Randomized User Study Demonstrating\n Accuracy-preserving Data Reduction for Prosthetic Embedded Systems","summary":" This work presents the design, implementation and validation of learning\ntechniques based on the kNN scheme for gesture detection in prosthetic control.\nTo cope with high computational demands in instance-based prediction, methods\nof dataset reduction are evaluated considering real-time determinism to allow\nfor the reliable integration into battery-powered portable devices. The\ninfluence of parameterization and varying proportionality schemes is analyzed,\nutilizing an eight-channel-sEMG armband. Besides offline cross-validation\naccuracy, success rates in real-time pilot experiments (online target\nachievement tests) are determined. Based on the assessment of specific dataset\nreduction techniques' adequacy for embedded control applications regarding\naccuracy and timing behaviour, Decision Surface Mapping (DSM) proves itself\npromising when applying kNN on the reduced set. A randomized, double-blind user\nstudy was conducted to evaluate the respective methods (kNN and kNN with\nDSM-reduction) against Ridge Regression (RR) and RR with Random Fourier\nFeatures (RR-RFF). The kNN-based methods performed significantly better\n(p<0.0005) than the regression techniques. Between DSM-kNN and kNN, there was\nno statistically significant difference (significance level 0.05). This is\nremarkable in consideration of only one sample per class in the reduced set,\nthus yielding a reduction rate of over 99% while preserving success rate. The\nsame behaviour could be confirmed in an extended user study. 
With k=1, which\nturned out to be an excellent choice, the runtime complexity of both kNN (in\nevery prediction step) as well as DSM-kNN (in the training phase) becomes\nlinear concerning the number of original samples, favouring dependable wearable\nprosthesis applications.\n","authors":["Tim Sziburis","Markus Nowak","Davide Brunelli"],"pdf_url":"https://arxiv.org/pdf/2308.11019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16474v2","updated":"2023-08-21T20:09:24Z","published":"2023-05-25T21:07:20Z","title":"FairDP: Certified Fairness with Differential Privacy","summary":" This paper introduces FairDP, a novel mechanism designed to achieve certified\nfairness with differential privacy (DP). FairDP independently trains models for\ndistinct individual groups, using group-specific clipping terms to assess and\nbound the disparate impacts of DP. Throughout the training process, the\nmechanism progressively integrates knowledge from group models to formulate a\ncomprehensive model that balances privacy, utility, and fairness in downstream\ntasks. Extensive theoretical and empirical analyses validate the efficacy of\nFairDP and improved trade-offs between model utility, privacy, and fairness\ncompared with existing methods.\n","authors":["Khang Tran","Ferdinando Fioretto","Issa Khalil","My T. Thai","NhatHai Phan"],"pdf_url":"https://arxiv.org/pdf/2305.16474v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11013v1","updated":"2023-08-21T20:03:16Z","published":"2023-08-21T20:03:16Z","title":"Personalized Event Prediction for Electronic Health Records","summary":" Clinical event sequences consist of hundreds of clinical events that\nrepresent records of patient care in time. Developing accurate predictive\nmodels of such sequences is of a great importance for supporting a variety of\nmodels for interpreting/classifying the current patient condition, or\npredicting adverse clinical events and outcomes, all aimed to improve patient\ncare. One important challenge of learning predictive models of clinical\nsequences is their patient-specific variability. Based on underlying clinical\nconditions, each patient's sequence may consist of different sets of clinical\nevents (observations, lab results, medications, procedures). Hence, simple\npopulation-wide models learned from event sequences for many different patients\nmay not accurately predict patient-specific dynamics of event sequences and\ntheir differences. To address the problem, we propose and investigate multiple\nnew event sequence prediction models and methods that let us better adjust the\nprediction for individual patients and their specific conditions. The methods\ndeveloped in this work pursue refinement of population-wide models to\nsubpopulations, self-adaptation, and a meta-level model switching that is able\nto adaptively select the model with the best chance to support the immediate\nprediction. We analyze and test the performance of these models on clinical\nevent sequences of patients in MIMIC-III database.\n","authors":["Jeong Min Lee","Milos Hauskrecht"],"pdf_url":"https://arxiv.org/pdf/2308.11013v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2104.01787"},{"id":"http://arxiv.org/abs/2308.11006v1","updated":"2023-08-21T19:45:48Z","published":"2023-08-21T19:45:48Z","title":"Using language models in the implicit automated assessment of\n mathematical short answer items","summary":" We propose a new way to assess certain short constructed responses to\nmathematics items. 
Our approach uses a pipeline that identifies the key values\nspecified by the student in their response. This allows us to determine the\ncorrectness of the response, as well as identify any misconceptions. The\ninformation from the value identification pipeline can then be used to provide\nfeedback to the teacher and student. The value identification pipeline consists\nof two fine-tuned language models. The first model determines if a value is\nimplicit in the student response. The second model identifies where in the\nresponse the key value is specified. We consider both a generic model that can\nbe used for any prompt and value, as well as models that are specific to each\nprompt and value. The value identification pipeline is a more accurate and\ninformative way to assess short constructed responses than traditional\nrubric-based scoring. It can be used to provide more targeted feedback to\nstudents, which can help them improve their understanding of mathematics.\n","authors":["Christopher Ormerod"],"pdf_url":"https://arxiv.org/pdf/2308.11006v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.11003v1","updated":"2023-08-21T19:36:50Z","published":"2023-08-21T19:36:50Z","title":"Autonomous Detection of Methane Emissions in Multispectral Satellite\n Data Using Deep Learning","summary":" Methane is one of the most potent greenhouse gases, and its short atmospheric\nhalf-life makes it a prime target to rapidly curb global warming. However,\ncurrent methane emission monitoring techniques primarily rely on approximate\nemission factors or self-reporting, which have been shown to often dramatically\nunderestimate emissions. Although initially designed to monitor surface\nproperties, satellite multispectral data has recently emerged as a powerful\nmethod to analyze atmospheric content. However, the spectral resolution of\nmultispectral instruments is poor, and methane measurements are typically very\nnoisy. Methane data products are also sensitive to absorption by the surface\nand other atmospheric gases (water vapor in particular) and therefore provide\nnoisy maps of potential methane plumes, that typically require extensive human\nanalysis. Here, we show that the image recognition capabilities of deep\nlearning methods can be leveraged to automatize the detection of methane leaks\nin Sentinel-2 satellite multispectral data, with dramatically reduced false\npositive rates compared with state-of-the-art multispectral methane data\nproducts, and without the need for a priori knowledge of potential leak sites.\nOur proposed approach paves the way for the automated, high-definition and\nhigh-frequency monitoring of point-source methane emissions across the world.\n","authors":["Bertrand Rouet-Leduc","Thomas Kerdreux","Alexandre Tuel","Claudia Hulbert"],"pdf_url":"https://arxiv.org/pdf/2308.11003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03212v2","updated":"2023-08-21T18:54:56Z","published":"2023-08-06T21:23:22Z","title":"Average-Hard Attention Transformers are Constant-Depth Uniform Threshold\n Circuits","summary":" Transformers have emerged as a widely used neural network model for various\nnatural language processing tasks. Previous research explored their\nrelationship with constant-depth threshold circuits, making two assumptions:\naverage-hard attention and logarithmic precision for internal computations\nrelative to input length. Merrill et al. 
(2022) prove that average-hard\nattention transformers recognize languages that fall within the complexity\nclass TC0, denoting the set of languages that can be recognized by\nconstant-depth polynomial-size threshold circuits. Likewise, Merrill and\nSabharwal (2023) show that log-precision transformers recognize languages\nwithin the class of uniform TC0. This shows that both transformer models can be\nsimulated by constant-depth threshold circuits, with the latter being more\nrobust due to generating a uniform circuit family. Our paper shows that the\nfirst result can be extended to yield uniform circuits as well.\n","authors":["Lena Strobl"],"pdf_url":"https://arxiv.org/pdf/2308.03212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03266v2","updated":"2023-08-21T18:53:31Z","published":"2023-07-06T20:00:52Z","title":"Empirical Analysis of a Segmentation Foundation Model in Prostate\n Imaging","summary":" Most state-of-the-art techniques for medical image segmentation rely on\ndeep-learning models. These models, however, are often trained on\nnarrowly-defined tasks in a supervised fashion, which requires expensive\nlabeled datasets. Recent advances in several machine learning domains, such as\nnatural language generation have demonstrated the feasibility and utility of\nbuilding foundation models that can be customized for various downstream tasks\nwith little to no labeled data. This likely represents a paradigm shift for\nmedical imaging, where we expect that foundation models may shape the future of\nthe field. In this paper, we consider a recently developed foundation model for\nmedical image segmentation, UniverSeg. We conduct an empirical evaluation study\nin the context of prostate imaging and compare it against the conventional\napproach of training a task-specific segmentation model. Our results and\ndiscussion highlight several important factors that will likely be important in\nthe development and adoption of foundation models for medical image\nsegmentation.\n","authors":["Heejong Kim","Victor Ion Butoi","Adrian V. Dalca","Daniel J. A. Margolis","Mert R. Sabuncu"],"pdf_url":"https://arxiv.org/pdf/2307.03266v2.pdf","comment":"Accepted"},{"id":"http://arxiv.org/abs/2308.10973v1","updated":"2023-08-21T18:41:11Z","published":"2023-08-21T18:41:11Z","title":"SupEuclid: Extremely Simple, High Quality OoD Detection with Supervised\n Contrastive Learning and Euclidean Distance","summary":" Out-of-Distribution (OoD) detection has developed substantially in the past\nfew years, with available methods approaching, and in a few cases achieving,\nperfect data separation on standard benchmarks. These results generally involve\nlarge or complex models, pretraining, exposure to OoD examples or extra\nhyperparameter tuning. Remarkably, it is possible to achieve results that can\nexceed many of these state-of-the-art methods with a very simple method. We\ndemonstrate that ResNet18 trained with Supervised Contrastive Learning (SCL)\nproduces state-of-the-art results out-of-the-box on near and far OoD detection\nbenchmarks using only Euclidean distance as a scoring rule. 
This may obviate\nthe need in some cases for more sophisticated methods or larger models, and at\nthe very least provides a very strong, easy to use baseline for further\nexperimentation and analysis.\n","authors":["Jarrod Haas"],"pdf_url":"https://arxiv.org/pdf/2308.10973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10968v1","updated":"2023-08-21T18:26:35Z","published":"2023-08-21T18:26:35Z","title":"MRI Field-transfer Reconstruction with Limited Data: Regularization by\n Neural Style Transfer","summary":" Recent works have demonstrated success in MRI reconstruction using deep\nlearning-based models. However, most reported approaches require training on a\ntask-specific, large-scale dataset. Regularization by denoising (RED) is a\ngeneral pipeline which embeds a denoiser as a prior for image reconstruction.\nThe potential of RED has been demonstrated for multiple image-related tasks\nsuch as denoising, deblurring and super-resolution. In this work, we propose a\nregularization by neural style transfer (RNST) method to further leverage the\npriors from the neural transfer and denoising engine. This enables RNST to\nreconstruct a high-quality image from a noisy low-quality image with different\nimage styles and limited data. We validate RNST with clinical MRI scans from\n1.5T and 3T and show that RNST can significantly boost image quality. Our\nresults highlight the capability of the RNST framework for MRI reconstruction\nand the potential for reconstruction tasks with limited data.\n","authors":["Guoyao Shen","Yancheng Zhu","Hernan Jara","Sean B. Andersson","Chad W. Farris","Stephan Anderson","Xin Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10968v1.pdf","comment":"30 pages, 8 figures, 2 tables, 1 algorithm chart"},{"id":"http://arxiv.org/abs/2308.05525v2","updated":"2023-08-21T18:21:24Z","published":"2023-08-10T12:06:03Z","title":"Critical Points ++: An Agile Point Cloud Importance Measure for Robust\n Classification, Adversarial Defense and Explainable AI","summary":" The ability to cope accurately and fast with Out-Of-Distribution (OOD)\nsamples is crucial in real-world safety demanding applications. In this work we\nfirst study the interplay between critical points of 3D point clouds and OOD\nsamples. Our findings are that common corruptions and outliers are often\ninterpreted as critical points. We generalize the notion of critical points\ninto importance measures. We show that training a classification network based\nonly on less important points dramatically improves robustness, at a cost of\nminor performance loss on the clean set. We observe that normalized entropy is\nhighly informative for corruption analysis. An adaptive threshold based on\nnormalized entropy is suggested for selecting the set of uncritical points. Our\nproposed importance measure is extremely fast to compute. We show it can be\nused for a variety of applications, such as Explainable AI (XAI), Outlier\nRemoval, Uncertainty Estimation, Robust Classification and Adversarial Defense.\nWe reach SOTA results on the two latter tasks. 
Code is available at:\nhttps://github.com/yossilevii100/critical_points2\n","authors":["Meir Yossef Levi","Guy Gilboa"],"pdf_url":"https://arxiv.org/pdf/2308.05525v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2306.13592v2","updated":"2023-08-21T16:37:46Z","published":"2023-06-23T16:28:12Z","title":"TACOformer:Token-channel compounded Cross Attention for Multimodal\n Emotion Recognition","summary":" Recently, emotion recognition based on physiological signals has emerged as a\nfield with intensive research. The utilization of multi-modal, multi-channel\nphysiological signals has significantly improved the performance of emotion\nrecognition systems, due to their complementarity. However, effectively\nintegrating emotion-related semantic information from different modalities and\ncapturing inter-modal dependencies remains a challenging issue. Many existing\nmultimodal fusion methods ignore either token-to-token or channel-to-channel\ncorrelations of multichannel signals from different modalities, which limits\nthe classification capability of the models to some extent. In this paper, we\npropose a comprehensive perspective of multimodal fusion that integrates\nchannel-level and token-level cross-modal interactions. Specifically, we\nintroduce a unified cross attention module called Token-chAnnel COmpound (TACO)\nCross Attention to perform multimodal fusion, which simultaneously models\nchannel-level and token-level dependencies between modalities. Additionally, we\npropose a 2D position encoding method to preserve information about the spatial\ndistribution of EEG signal channels, then we use two transformer encoders ahead\nof the fusion module to capture long-term temporal dependencies from the EEG\nsignal and the peripheral physiological signal, respectively.\nSubject-independent experiments on emotional dataset DEAP and Dreamer\ndemonstrate that the proposed model achieves state-of-the-art performance.\n","authors":["Xinda Li"],"pdf_url":"https://arxiv.org/pdf/2306.13592v2.pdf","comment":"Accepted by IJCAI 2023- AI4TS workshop"},{"id":"http://arxiv.org/abs/2308.09357v2","updated":"2023-08-21T11:58:14Z","published":"2023-08-18T07:38:30Z","title":"Multi-scale Target-Aware Framework for Constrained Image Splicing\n Detection and Localization","summary":" Constrained image splicing detection and localization (CISDL) is a\nfundamental task of multimedia forensics, which detects splicing operation\nbetween two suspected images and localizes the spliced region on both images.\nRecent works regard it as a deep matching problem and have made significant\nprogress. However, existing frameworks typically perform feature extraction and\ncorrelation matching as separate processes, which may hinder the model's\nability to learn discriminative features for matching and can be susceptible to\ninterference from ambiguous background pixels. In this work, we propose a\nmulti-scale target-aware framework to couple feature extraction and correlation\nmatching in a unified pipeline. In contrast to previous methods, we design a\ntarget-aware attention mechanism that jointly learns features and performs\ncorrelation matching between the probe and donor images. Our approach can\neffectively promote the collaborative learning of related patches, and perform\nmutual promotion of feature learning and correlation matching. 
Additionally, in\norder to handle scale transformations, we introduce a multi-scale projection\nmethod, which can be readily integrated into our target-aware framework that\nenables the attention process to be conducted between tokens containing\ninformation of varying scales. Our experiments demonstrate that our model,\nwhich uses a unified pipeline, outperforms state-of-the-art methods on several\nbenchmark datasets and is robust against scale transformations.\n","authors":["Yuxuan Tan","Yuanman Li","Limin Zeng","Jiaxiong Ye","Wei wang","Xia Li"],"pdf_url":"https://arxiv.org/pdf/2308.09357v2.pdf","comment":"accepted by ACMMM2023"},{"id":"http://arxiv.org/abs/2305.09381v6","updated":"2023-08-21T09:04:44Z","published":"2023-05-16T12:09:30Z","title":"AMD: Autoregressive Motion Diffusion","summary":" Human motion generation aims to produce plausible human motion sequences\naccording to various conditional inputs, such as text or audio. Despite the\nfeasibility of existing methods in generating motion based on short prompts and\nsimple motion patterns, they encounter difficulties when dealing with long\nprompts or complex motions. The challenges are two-fold: 1) the scarcity of\nhuman motion-captured data for long prompts and complex motions. 2) the high\ndiversity of human motions in the temporal domain and the substantial\ndivergence of distributions from conditional modalities, leading to a\nmany-to-many mapping problem when generating motion with complex and long\ntexts. In this work, we address these gaps by 1) elaborating the first dataset\npairing long textual descriptions and 3D complex motions (HumanLong3D), and 2)\nproposing an autoregressive motion diffusion model (AMD). Specifically, AMD\nintegrates the text prompt at the current timestep with the text prompt and\naction sequences at the previous timestep as conditional information to predict\nthe current action sequences in an iterative manner. Furthermore, we present\nits generalization for X-to-Motion with \"No Modality Left Behind\", enabling the\ngeneration of high-definition and high-fidelity human motions based on\nuser-defined modality input.\n","authors":["Bo Han","Hao Peng","Minjing Dong","Yi Ren","Yixuan Shen","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2305.09381v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09300v2","updated":"2023-08-21T07:51:00Z","published":"2023-08-18T04:49:38Z","title":"V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by\n Connecting Foundation Models","summary":" Building artificial intelligence (AI) systems on top of a set of foundation\nmodels (FMs) is becoming a new paradigm in AI research. Their representative\nand generative abilities learnt from vast amounts of data can be easily adapted\nand transferred to a wide range of downstream tasks without extra training from\nscratch. However, leveraging FMs in cross-modal generation remains\nunder-researched when audio modality is involved. On the other hand,\nautomatically generating semantically-relevant sound from visual input is an\nimportant problem in cross-modal generation studies. To solve this\nvision-to-audio (V2A) generation problem, existing methods tend to design and\nbuild complex systems from scratch using modestly sized datasets. In this\npaper, we propose a lightweight solution to this problem by leveraging\nfoundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate\nthe domain gap between the latent space of the visual CLIP and the auditory\nCLAP models. 
Then we propose a simple yet effective mapper mechanism\n(V2A-Mapper) to bridge the domain gap by translating the visual input between\nCLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained\naudio generative FM AudioLDM is adopted to produce high-fidelity and\nvisually-aligned sound. Compared to previous approaches, our method only\nrequires a quick training of the V2A-Mapper. We further analyze and conduct\nextensive experiments on the choice of the V2A-Mapper and show that a\ngenerative mapper is better at fidelity and variability (FD) while a regression\nmapper is slightly better at relevance (CS). Both objective and subjective\nevaluation on two V2A datasets demonstrate the superiority of our proposed\nmethod compared to current state-of-the-art approaches - trained with 86% fewer\nparameters but achieving 53% and 19% improvement in FD and CS, respectively.\n","authors":["Heng Wang","Jianbo Ma","Santiago Pascual","Richard Cartwright","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2308.09300v2.pdf","comment":"13 pages, 10 figures. Demo page: https://v2a-mapper.github.io/"},{"id":"http://arxiv.org/abs/2306.08306v2","updated":"2023-08-21T05:26:45Z","published":"2023-06-14T07:23:36Z","title":"Towards Balanced Active Learning for Multimodal Classification","summary":" Training multimodal networks requires a vast amount of data due to their\nlarger parameter space compared to unimodal networks. Active learning is a\nwidely used technique for reducing data annotation costs by selecting only\nthose samples that could contribute to improving model performance. However,\ncurrent active learning strategies are mostly designed for unimodal tasks, and\nwhen applied to multimodal data, they often result in biased sample selection\nfrom the dominant modality. This unfairness hinders balanced multimodal\nlearning, which is crucial for achieving optimal performance. To address this\nissue, we propose three guidelines for designing a more balanced multimodal\nactive learning strategy. Following these guidelines, a novel approach is\nproposed to achieve more fair data selection by modulating the gradient\nembedding with the dominance degree among modalities. Our studies demonstrate\nthat the proposed method achieves more balanced multimodal learning by avoiding\ngreedy sample selection from the dominant modality. Our approach outperforms\nexisting active learning strategies on a variety of multimodal classification\ntasks. Overall, our work highlights the importance of balancing sample\nselection in multimodal active learning and provides a practical solution for\nachieving more balanced active learning for multimodal classification.\n","authors":["Meng Shen","Yizheng Huang","Jianxiong Yin","Heqing Zou","Deepu Rajan","Simon See"],"pdf_url":"https://arxiv.org/pdf/2306.08306v2.pdf","comment":"12 pages, accepted by ACMMM 2023"},{"id":"http://arxiv.org/abs/2308.10917v1","updated":"2023-08-21T03:54:21Z","published":"2023-08-21T03:54:21Z","title":"PACS: Prediction and analysis of cancer subtypes from multi-omics data\n based on a multi-head attention mechanism model","summary":" Due to the high heterogeneity and clinical characteristics of cancer, there\nare significant differences in multi-omic data and clinical characteristics\namong different cancer subtypes. Therefore, accurate classification of cancer\nsubtypes can help doctors choose the most appropriate treatment options,\nimprove treatment outcomes, and provide more accurate patient survival\npredictions. 
In this study, we propose a supervised multi-head attention\nmechanism model (SMA) to classify cancer subtypes successfully. The attention\nmechanism and feature sharing module of the SMA model can successfully learn\nthe global and local feature information of multi-omics data. Second, it\nenriches the parameters of the model by deeply fusing multi-head attention\nencoders from Siamese through the fusion module. Validated by extensive\nexperiments, the SMA model achieves the highest accuracy, F1 macroscopic, F1\nweighted, and accurate classification of cancer subtypes in simulated,\nsingle-cell, and cancer multiomics datasets compared to AE, CNN, and GNN-based\nmodels. Therefore, we contribute to future research on multiomics data using\nour attention-based approach.\n","authors":["Liangrui Pan","Dazheng Liu","Zhichao Feng","Wenjuan Liu","Shaoliang Peng"],"pdf_url":"https://arxiv.org/pdf/2308.10917v1.pdf","comment":"Submitted to BIBM2023"}]},"2023-08-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.10390v1","updated":"2023-08-20T23:47:23Z","published":"2023-08-20T23:47:23Z","title":"LibriSQA: Pioneering Free-form and Open-ended Spoken Question Answering\n with a Novel Dataset and Framework","summary":" While Large Language Models (LLMs) have demonstrated commendable performance\nacross a myriad of domains and tasks, existing LLMs still exhibit a palpable\ndeficit in handling multimodal functionalities, especially for the Spoken\nQuestion Answering (SQA) task which necessitates precise alignment and deep\ninteraction between speech and text features. To address the SQA challenge on\nLLMs, we initially curated the free-form and open-ended LibriSQA dataset from\nLibrispeech, comprising Part I with natural conversational formats and Part II\nencompassing multiple-choice questions followed by answers and analytical\nsegments. Both parts collectively include 107k SQA pairs that cover various\ntopics. Given the evident paucity of existing speech-text LLMs, we propose a\nlightweight, end-to-end framework to execute the SQA task on the LibriSQA,\nwitnessing significant results. By reforming ASR into the SQA format, we\nfurther substantiate our framework's capability in handling ASR tasks. Our\nempirical findings bolster the LLMs' aptitude for aligning and comprehending\nmultimodal information, paving the way for the development of universal\nmultimodal LLMs. The dataset and demo can be found at\nhttps://github.com/ZihanZhaoSJTU/LibriSQA.\n","authors":["Zihan Zhao","Yiyang Jiang","Heyang Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10380v1","updated":"2023-08-20T22:42:04Z","published":"2023-08-20T22:42:04Z","title":"A Human-on-the-Loop Optimization Autoformalism Approach for\n Sustainability","summary":" This paper outlines a natural conversational approach to solving personalized\nenergy-related problems using large language models (LLMs). We focus on\ncustomizable optimization problems that necessitate repeated solving with\nslight variations in modeling and are user-specific, hence posing a challenge\nto devising a one-size-fits-all model. We put forward a strategy that augments\nan LLM with an optimization solver, enhancing its proficiency in understanding\nand responding to user specifications and preferences while providing nonlinear\nreasoning capabilities. 
Our approach pioneers the novel concept of human-guided\noptimization autoformalism, translating a natural language task specification\nautomatically into an optimization instance. This enables LLMs to analyze,\nexplain, and tackle a variety of instance-specific energy-related problems,\npushing beyond the limits of current prompt-based techniques.\n Our research encompasses various commonplace tasks in the energy sector, from\nelectric vehicle charging and Heating, Ventilation, and Air Conditioning (HVAC)\ncontrol to long-term planning problems such as cost-benefit evaluations for\ninstalling rooftop solar photovoltaics (PVs) or heat pumps. This pilot study\nmarks an essential stride towards the context-based formulation of optimization\nusing LLMs, with the potential to democratize optimization processes. As a\nresult, stakeholders are empowered to optimize their energy consumption,\npromoting sustainable energy practices customized to personal needs and\npreferences.\n","authors":["Ming Jin","Bilgehan Sel","Fnu Hardeep","Wotao Yin"],"pdf_url":"https://arxiv.org/pdf/2308.10380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10379v1","updated":"2023-08-20T22:36:23Z","published":"2023-08-20T22:36:23Z","title":"Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language\n Models","summary":" Current literature, aiming to surpass the \"Chain-of-Thought\" approach, often\nresorts to an external modus operandi involving halting, modifying, and then\nresuming the generation process to boost Large Language Models' (LLMs)\nreasoning capacities. This mode escalates the number of query requests, leading\nto increased costs, memory, and computational overheads. Addressing this, we\npropose the Algorithm of Thoughts -- a novel strategy that propels LLMs through\nalgorithmic reasoning pathways, pioneering a new mode of in-context learning.\nBy employing algorithmic examples, we exploit the innate recurrence dynamics of\nLLMs, expanding their idea exploration with merely one or a few queries. Our\ntechnique outperforms earlier single-query methods and stands on par with a\nrecent multi-query strategy that employs an extensive tree search algorithm.\nIntriguingly, our results suggest that instructing an LLM using an algorithm\ncan lead to performance surpassing that of the algorithm itself, hinting at\nLLM's inherent ability to weave its intuition into optimized searches. We probe\ninto the underpinnings of our method's efficacy and its nuances in application.\n","authors":["Bilgehan Sel","Ahmad Al-Tawaha","Vanshaj Khattar","Lu Wang","Ruoxi Jia","Ming Jin"],"pdf_url":"https://arxiv.org/pdf/2308.10379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10370v1","updated":"2023-08-20T21:30:34Z","published":"2023-08-20T21:30:34Z","title":"cantnlp@LT-EDI@RANLP-2023: Homophobia/Transphobia Detection in Social\n Media Comments using Spatio-Temporally Retrained Language Models","summary":" This paper describes our multiclass classification system developed as part\nof the LTEDI@RANLP-2023 shared task. We used a BERT-based language model to\ndetect homophobic and transphobic content in social media comments across five\nlanguage conditions: English, Spanish, Hindi, Malayalam, and Tamil. We\nretrained a transformer-based crosslanguage pretrained language model,\nXLMRoBERTa, with spatially and temporally relevant social media language data.\nWe also retrained a subset of models with simulated script-mixed social media\nlanguage data with varied performance. 
We developed the best-performing\nseven-label classification system for Malayalam based on weighted macro\naveraged F1 score (ranked first out of six) with variable performance for other\nlanguage and class-label conditions. We found the inclusion of this\nspatio-temporal data improved the classification performance for all language\nand task conditions when compared with the baseline. The results suggest that\ntransformer-based language classification systems are sensitive to\nregister-specific and language-specific retraining.\n","authors":["Sidney G. -J. Wong","Matthew Durward","Benjamin Adams","Jonathan Dunn"],"pdf_url":"https://arxiv.org/pdf/2308.10370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10354v1","updated":"2023-08-20T20:10:55Z","published":"2023-08-20T20:10:55Z","title":"Imaginations of WALL-E : Reconstructing Experiences with an\n Imagination-Inspired Module for Advanced AI Systems","summary":" In this paper, we introduce a novel Artificial Intelligence (AI) system\ninspired by the philosophical and psychoanalytical concept of imagination as a\n``Re-construction of Experiences\". Our AI system is equipped with an\nimagination-inspired module that bridges the gap between textual inputs and\nother modalities, enriching the derived information based on previously learned\nexperiences. A unique feature of our system is its ability to formulate\nindependent perceptions of inputs. This leads to unique interpretations of a\nconcept that may differ from human interpretations but are equally valid, a\nphenomenon we term as ``Interpretable Misunderstanding\". We employ large-scale\nmodels, specifically a Multimodal Large Language Model (MLLM), enabling our\nproposed system to extract meaningful information across modalities while\nprimarily remaining unimodal. We evaluated our system against other large\nlanguage models across multiple tasks, including emotion recognition and\nquestion-answering, using a zero-shot methodology to ensure an unbiased\nscenario, avoiding the bias that may arise from fine-tuning. Significantly, our system outperformed\nthe best Large Language Models (LLMs) on the MELD, IEMOCAP, and CoQA datasets,\nachieving Weighted F1 (WF1) scores of 46.74%, 25.23%, and Overall F1 (OF1)\nscore of 17%, respectively, compared to 22.89%, 12.28%, and 7% from the\nwell-performing LLM. The goal is to go beyond the statistical view of language\nprocessing and tie it to human concepts such as philosophy and psychoanalysis.\nThis work represents a significant advancement in the development of\nimagination-inspired AI systems, opening new possibilities for AI to generate\ndeep and interpretable information across modalities, thereby enhancing\nhuman-AI interaction.\n","authors":["Zeinab Sadat Taghavi","Soroush Gooran","Seyed Arshan Dalili","Hamidreza Amirzadeh","Mohammad Jalal Nematbakhsh","Hossein Sameti"],"pdf_url":"https://arxiv.org/pdf/2308.10354v1.pdf","comment":"18 pages,"},{"id":"http://arxiv.org/abs/2308.10335v1","updated":"2023-08-20T18:36:28Z","published":"2023-08-20T18:36:28Z","title":"A Study on Robustness and Reliability of Large Language Model Code\n Generation","summary":" Recently, large language models (LLMs) have shown extraordinary ability\nin understanding natural language and generating programming code. It has been\na common practice of software engineers to consult LLMs when encountering\ncoding questions. 
Although efforts have been made to avoid syntax errors and\nalign the code with the intended semantics, the reliability and robustness of\nthe code generation from LLMs have not yet been thoroughly studied. The\nexecutable code is not equivalent to reliable and robust code, especially\nin the context of real-world software development. The misuse of APIs in the\ngenerated code could lead to severe problems, such as resource leaks, program\ncrashes, etc. To make things worse, the users of LLM code generation services\nare actually the developers that are most vulnerable to this code that seems\nright -- they are always novice developers that are not familiar with the APIs\nthat LLMs generate code for them. Therefore, they could hardly tell the misuse\nin the code generated by LLMs, which further facilitates the incorrect code\nbeing applied in real-world software. Existing code evaluation benchmarks and datasets\nfocus on crafting small tasks such as programming questions in coding\ninterviews, which however deviates from the problems that developers would ask\nLLMs about for real-world coding help. To fill the missing piece, in this work, we\npropose a dataset RobustAPI for evaluating the reliability and robustness of\ncode generated by LLMs. We collect 1208 coding questions from StackOverflow on\n24 representative Java APIs. We summarize the common misuse patterns of these\nAPIs and evaluate them on current popular LLMs. The evaluation results show that\neven for GPT-4, 62% of the generated code contains API misuses, which would cause\nunexpected consequences if the code is introduced into real-world software.\n","authors":["Li Zhong","Zilong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10335v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10652v2","updated":"2023-08-20T16:32:56Z","published":"2023-07-20T07:33:30Z","title":"Exploring the Landscape of Natural Language Processing Research","summary":" As an efficient approach to understand, generate, and process natural\nlanguage texts, research in natural language processing (NLP) has exhibited a\nrapid spread and wide adoption in recent years. Given the increasing research\nwork in this area, several NLP-related approaches have been surveyed in the\nresearch community. However, a comprehensive study that categorizes established\ntopics, identifies trends, and outlines areas for future research remains\nabsent. Contributing to closing this gap, we have systematically classified and\nanalyzed research papers in the ACL Anthology. As a result, we present a\nstructured overview of the research landscape, provide a taxonomy of fields of\nstudy in NLP, analyze recent developments in NLP, summarize our findings, and\nhighlight directions for future work.\n","authors":["Tim Schopf","Karim Arabi","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.10652v2.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2307.03104v4","updated":"2023-08-20T16:01:30Z","published":"2023-07-06T16:26:34Z","title":"Efficient Domain Adaptation of Sentence Embeddings Using Adapters","summary":" Sentence embeddings enable us to capture the semantic similarity of short\ntexts. Most sentence embedding models are trained for general semantic textual\nsimilarity tasks. Therefore, to use sentence embeddings in a particular domain,\nthe model must be adapted to it in order to achieve good results. 
Usually, this\nis done by fine-tuning the entire sentence embedding model for the domain of\ninterest. While this approach yields state-of-the-art results, all of the\nmodel's weights are updated during fine-tuning, making this method\nresource-intensive. Therefore, instead of fine-tuning entire sentence embedding\nmodels for each target domain individually, we propose to train lightweight\nadapters. These domain-specific adapters do not require fine-tuning all\nunderlying sentence embedding model parameters. Instead, we only train a small\nnumber of additional parameters while keeping the weights of the underlying\nsentence embedding model fixed. Training domain-specific adapters allows always\nusing the same base model and only exchanging the domain-specific adapters to\nadapt sentence embeddings to a specific domain. We show that using adapters for\nparameter-efficient domain adaptation of sentence embeddings yields competitive\nperformance within 1% of a domain-adapted, entirely fine-tuned sentence\nembedding model while only training approximately 3.6% of the parameters.\n","authors":["Tim Schopf","Dennis N. Schneider","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.03104v4.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2308.10304v1","updated":"2023-08-20T16:00:53Z","published":"2023-08-20T16:00:53Z","title":"Economic Policy Uncertainty: A Review on Applications and Measurement\n Methods with Focus on Text Mining Methods","summary":" Economic Policy Uncertainty (EPU) represents the uncertainty realized by the\ninvestors during economic policy alterations. EPU is a critical indicator in\neconomic studies to predict future investments, the unemployment rate, and\nrecessions. EPU values can be estimated based on financial parameters directly\nor implied uncertainty indirectly using the text mining methods. Although EPU\nis a well-studied topic within the economy, the methods utilized to measure it\nare understudied. In this article, we define the EPU briefly and review the\nmethods used to measure the EPU, and survey the areas influenced by the changes\nin EPU level. We divide the EPU measurement methods into three major groups\nwith respect to their input data. Examples of each group of methods are\nenlisted, and the pros and cons of the groups are discussed. Among the EPU\nmeasures, text mining-based ones are dominantly studied. These methods measure\nthe realized uncertainty by taking into account the uncertainty represented in\nthe news and publicly available sources of financial information. Finally, we\nsurvey the research areas that rely on measuring the EPU index with the hope\nthat studying the impacts of uncertainty would attract further attention of\nresearchers from various research fields. In addition, we propose a list of\nfuture research approaches focusing on measuring EPU using textual material.\n","authors":["Fatemeh Kaveh-Yazdy","Sajjad Zarifzadeh"],"pdf_url":"https://arxiv.org/pdf/2308.10304v1.pdf","comment":"JEL Classification: C53, C38, A13, O38, H50"},{"id":"http://arxiv.org/abs/2212.13939v3","updated":"2023-08-20T15:32:29Z","published":"2022-12-28T16:38:43Z","title":"Data Augmentation using Transformers and Similarity Measures for\n Improving Arabic Text Classification","summary":" The performance of learning models heavily relies on the availability and\nadequacy of training data. 
To address the dataset adequacy issue, researchers\nhave extensively explored data augmentation (DA) as a promising approach. DA\ngenerates new data instances through transformations applied to the available\ndata, thereby increasing dataset size and variability. This approach has\nenhanced model performance and accuracy, particularly in addressing class\nimbalance problems in classification tasks. However, few studies have explored\nDA for the Arabic language, relying on traditional approaches such as\nparaphrasing or noising-based techniques. In this paper, we propose a new\nArabic DA method that employs the recent powerful modeling technique, namely\nthe AraGPT-2, for the augmentation process. The generated sentences are\nevaluated in terms of context, semantics, diversity, and novelty using the\nEuclidean, cosine, Jaccard, and BLEU distances. Finally, the AraBERT\ntransformer is used on sentiment classification tasks to evaluate the\nclassification performance of the augmented Arabic dataset. The experiments\nwere conducted on four sentiment Arabic datasets: AraSarcasm, ASTD, ATT, and\nMOVIE. The selected datasets vary in size, number of labels, and degree of class\nimbalance. The results show that the proposed methodology enhanced the Arabic\nsentiment text classification on all datasets with an increase in F1 score by\n4% in AraSarcasm, 6% in ASTD, 9% in ATT, and 13% in MOVIE.\n","authors":["Dania Refai","Saleh Abo-Soud","Mohammad Abdel-Rahman"],"pdf_url":"https://arxiv.org/pdf/2212.13939v3.pdf","comment":"15 pages, 16 Figures, this work has been submitted to the IEEE Access\n Journal for possible publication"},{"id":"http://arxiv.org/abs/2308.10278v1","updated":"2023-08-20T14:24:26Z","published":"2023-08-20T14:24:26Z","title":"CharacterChat: Learning towards Conversational AI with Personalized\n Social Support","summary":" In our modern, fast-paced, and interconnected world, the importance of mental\nwell-being has grown into a matter of great urgency. However, traditional\nmethods such as Emotional Support Conversations (ESC) face challenges in\neffectively addressing a diverse range of individual personalities. In\nresponse, we introduce the Social Support Conversation (S2Conv) framework. It\ncomprises a series of support agents and the interpersonal matching mechanism,\nlinking individuals with persona-compatible virtual supporters. Utilizing\npersona decomposition based on the MBTI (Myers-Briggs Type Indicator), we have\ncreated the MBTI-1024 Bank, a group of virtual characters with distinct\nprofiles. Through improved role-playing prompts with behavior preset and\ndynamic memory, we facilitate the development of the MBTI-S2Conv dataset, which\ncontains conversations between the characters in the MBTI-1024 Bank. Building\nupon these foundations, we present CharacterChat, a comprehensive S2Conv\nsystem, which includes a conversational model driven by personas and memories,\nalong with an interpersonal matching plugin model that dispatches the optimal\nsupporters from the MBTI-1024 Bank for individuals with specific personas.\nEmpirical results indicate the remarkable efficacy of CharacterChat in\nproviding personalized social support and highlight the substantial advantages\nderived from interpersonal matching. 
The source code is available in\n\\url{https://github.com/morecry/CharacterChat}.\n","authors":["Quan Tu","Chuanqi Chen","Jinpeng Li","Yanran Li","Shuo Shang","Dongyan Zhao","Ran Wang","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2308.10278v1.pdf","comment":"10 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.10263v1","updated":"2023-08-20T13:20:54Z","published":"2023-08-20T13:20:54Z","title":"Scaled-up Discovery of Latent Concepts in Deep NLP Models","summary":" Pre-trained language models (pLMs) learn intricate patterns and contextual\ndependencies via unsupervised learning on vast text data, driving breakthroughs\nacross NLP tasks. Despite these achievements, these models remain black boxes,\nnecessitating research into understanding their decision-making processes.\nRecent studies explore representation analysis by clustering latent spaces\nwithin pre-trained models. However, these approaches are limited in terms of\nscalability and the scope of interpretation because of high computation costs\nof clustering algorithms. This study focuses on comparing clustering algorithms\nfor the purpose of scaling encoded concept discovery of representations from\npLMs. Specifically, we compare three algorithms in their capacity to unveil the\nencoded concepts through their alignment to human-defined ontologies:\nAgglomerative Hierarchical Clustering, Leaders Algorithm, and K-Means\nClustering. Our results show that K-Means has the potential to scale to very\nlarge datasets, allowing rich latent concept discovery, both on the word and\nphrase level.\n","authors":["Majd Hawasly","Fahim Dalvi","Nadir Durrani"],"pdf_url":"https://arxiv.org/pdf/2308.10263v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10261v1","updated":"2023-08-20T13:15:18Z","published":"2023-08-20T13:15:18Z","title":"How Good Are Large Language Models at Out-of-Distribution Detection?","summary":" Out-of-distribution (OOD) detection plays a vital role in enhancing the\nreliability of machine learning (ML) models. The emergence of large language\nmodels (LLMs) has catalyzed a paradigm shift within the ML community,\nshowcasing their exceptional capabilities across diverse natural language\nprocessing tasks. While existing research has probed OOD detection with smaller\nencoder-based Transformers like BERT and RoBERTa, the stark differences in\nscales, pre-training objectives, and inference paradigms call into question the\napplicability of these findings to LLMs. This paper embarks on a pioneering\nempirical investigation of OOD detection in the domain of LLMs, focusing on\nLLaMA series ranging from 7B to 65B in size. We thoroughly evaluate\ncommonly-used OOD detectors, scrutinizing their performance in both zero-grad\nand fine-tuning scenarios. Notably, we alter previous discriminative\nin-distribution fine-tuning into generative fine-tuning, aligning the\npre-training objective of LLMs with downstream tasks. Our findings unveil that\na simple cosine distance OOD detector demonstrates superior efficacy,\noutperforming other OOD detectors. We provide an intriguing explanation for\nthis phenomenon by highlighting the isotropic nature of the embedding spaces of\nLLMs, which distinctly contrasts with the anisotropic property observed in\nsmaller BERT family models. 
The new insight enhances our understanding of how\nLLMs detect OOD data, thereby enhancing their adaptability and reliability in\ndynamic environments.\n","authors":["Bo Liu","Liming Zhan","Zexin Lu","Yujie Feng","Lei Xue","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10261v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.10253v1","updated":"2023-08-20T12:43:52Z","published":"2023-08-20T12:43:52Z","title":"StableLLaVA: Enhanced Visual Instruction Tuning with Synthesized\n Image-Dialogue Data","summary":" The remarkable multimodal capabilities demonstrated by OpenAI's GPT-4 have\nsparked significant interest in the development of multimodal Large Language\nModels (LLMs). A primary research objective of such models is to align visual\nand textual modalities effectively while comprehending human instructions.\nCurrent methodologies often rely on annotations derived from benchmark datasets\nto construct image-dialogue datasets for training purposes, akin to instruction\ntuning in LLMs. However, these datasets often exhibit domain bias, potentially\nconstraining the generative capabilities of the models. In an effort to\nmitigate these limitations, we propose a novel data collection methodology that\nsynchronously synthesizes images and dialogues for visual instruction tuning.\nThis approach harnesses the power of generative models, marrying the abilities\nof ChatGPT and text-to-image generative models to yield a diverse and\ncontrollable dataset with varied image content. This not only provides greater\nflexibility compared to existing methodologies but also significantly enhances\nseveral model capabilities. Our research includes comprehensive experiments\nconducted on various datasets using the open-source LLAVA model as a testbed\nfor our proposed pipeline. Our results underscore marked enhancements across\nmore than ten commonly assessed capabilities,\n","authors":["Yanda Li","Chi Zhang","Gang Yu","Zhibin Wang","Bin Fu","Guosheng Lin","Chunhua Shen","Ling Chen","Yunchao Wei"],"pdf_url":"https://arxiv.org/pdf/2308.10253v1.pdf","comment":"Project page: https://github.com/icoz69/StableLLAVA"},{"id":"http://arxiv.org/abs/2308.10252v1","updated":"2023-08-20T12:42:19Z","published":"2023-08-20T12:42:19Z","title":"LMTuner: An user-friendly and highly-integrable Training Framework for\n fine-tuning Large Language Models","summary":" With the burgeoning development in the realm of large language models (LLMs),\nthe demand for efficient incremental training tailored to specific industries\nand domains continues to increase. Currently, the predominantly employed\nframeworks lack modular design, it often takes a lot of coding work to\nkickstart the training of LLM. To address this, we present \"LMTuner\", a highly\nusable, integrable, and scalable system for training LLMs expeditiously and\nwith minimal user-input. LMTuner comprises three main modules - the\nInteraction, Training, and Inference Modules. We advocate that LMTuner's\nusability and integrality alleviate the complexities in training large language\nmodels. Remarkably, even a novice user could commence training large language\nmodels within five minutes. Furthermore, it integrates DeepSpeed frameworks and\nsupports Efficient Fine-Tuning methodologies like Low Rank Adaptation (LoRA),\nQuantized LoRA (QLoRA), etc., enabling the training of language models scaling\nfrom 300M to a whopping 130B parameters using a single server. 
LMTuner's\nhomepage (https://wengsyx.github.io/LMTuner/) and screencast video\n(https://youtu.be/nsXmWOmN3rE) are now publicly available.\n","authors":["Yixuan Weng","Zhiqi Wang","Huanxuan Liao","Shizhu He","Shengping Liu","Kang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.10252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10248v1","updated":"2023-08-20T12:21:05Z","published":"2023-08-20T12:21:05Z","title":"Activation Addition: Steering Language Models Without Optimization","summary":" Reliably controlling the behavior of large language models (LLMs) is a\npressing open problem. Existing methods include supervised finetuning,\nreinforcement learning from human feedback (RLHF), prompt engineering and\nguided decoding. We instead investigate activation engineering: modifying\nactivations at inference time to predictably alter model behavior. In\nparticular, we bias the forward pass with an added 'steering vector' implicitly\nspecified through natural language.\n Unlike past work which learned these steering vectors (Subramani, Suresh, and\nPeters 2022; Hernandez, Li, and Andreas 2023), our Activation Addition (ActAdd)\nmethod computes them by taking the activation differences that result from\npairs of prompts. We demonstrate ActAdd on GPT-2 on OpenWebText and ConceptNet.\nOur inference-time approach yields control over high-level properties of output\nand preserves off-target model performance. It involves far less compute and\nimplementation effort compared to finetuning or RLHF, allows users to provide\nnatural language specifications, and its overhead scales naturally with model\nsize.\n","authors":["Alex Turner","Lisa Thiergart","David Udell","Gavin Leech","Ulisse Mini","Monte MacDiarmid"],"pdf_url":"https://arxiv.org/pdf/2308.10248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13833v2","updated":"2023-08-20T08:42:07Z","published":"2023-05-23T08:53:33Z","title":"Reducing Sensitivity on Speaker Names for Text Generation from Dialogues","summary":" Changing speaker names consistently throughout a dialogue should not affect\nits meaning and corresponding outputs for text generation from dialogues.\nHowever, pre-trained language models, serving as the backbone for\ndialogue-processing tasks, have been shown to be sensitive to nuances. This may\nresult in unfairness in real-world applications. No comprehensive analysis of\nthis problem has been done in the past. In this work, we propose to\nquantitatively measure a model's sensitivity on speaker names, and\ncomprehensively evaluate a number of known methods for reducing speaker name\nsensitivity, including a novel approach of our own. Extensive experiments on\nmultiple datasets provide a benchmark for this problem and show the favorable\nperformance of our approach in sensitivity reduction and quality of generation.\n","authors":["Qi Jia","Haifeng Tang","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.13833v2.pdf","comment":"findings of ACL'23"},{"id":"http://arxiv.org/abs/2308.10195v1","updated":"2023-08-20T07:56:34Z","published":"2023-08-20T07:56:34Z","title":"WMFormer++: Nested Transformer for Visible Watermark Removal via Implict\n Joint Learning","summary":" Watermarking serves as a widely adopted approach to safeguard media\ncopyright. In parallel, the research focus has extended to watermark removal\ntechniques, offering an adversarial means to enhance watermark robustness and\nfoster advancements in the watermarking field. 
Existing watermark removal\nmethods often rely on UNet architectures with multiple decoder branches -- one\nfor watermark localization and the other for background image restoration.\nThese methods involve complex module designs to guide information flow for\nrespective tasks, which can lead to suboptimal performance and an overly\ncumbersome model. To simplify the existing framework, we propose a novel\nTransformer-based approach with a unified decoder branch, treating watermark\nextraction and background restoration as a single task and allowing the network\nto learn information flow between them without artificial design patterns.\nAdditionally, we utilize nested structures to facilitate multi-scale feature\nfusion, forming a parallel ensemble of nested structures that constitute the\nUNet. Supervision is applied to UNets with varying depths to facilitate\nknowledge learning across all levels. Extensive experiments are conducted on\nvarious challenging benchmarks to validate the effectiveness of our proposed\nmethod. The results demonstrate that our approach achieves state-of-the-art\nperformance and produces high-quality images.\n","authors":["Dongjian Huo","Zehong Zhang","Hanjing Su","Guanbin Li","Chaowei Fang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10173v1","updated":"2023-08-20T05:58:33Z","published":"2023-08-20T05:58:33Z","title":"FoodGPT: A Large Language Model in Food Testing Domain with Incremental\n Pre-training and Knowledge Graph Prompt","summary":" Currently, the construction of large language models in specific domains is\ndone by fine-tuning on a base model. Some models also incorporate knowledge\nbases without the need for pre-training. This is because the base model already\ncontains domain-specific knowledge during the pre-training process. We build a\nlarge language model for food testing. Unlike the above approach, a significant\namount of data in this domain exists as scans of domain standard\ndocuments. In addition, there is a large amount of untrained structured\nknowledge. Therefore, we introduce an incremental pre-training step to inject\nthis knowledge into a large language model. In this paper, we propose a method\nfor handling structured knowledge and scanned documents in incremental\npre-training. To overcome the problem of machine hallucination, we construct a\nknowledge graph to serve as an external knowledge base for supporting retrieval\nin the large language model. It is worth mentioning that this paper is a\ntechnical report of our pre-release version, and we will report our specific\nexperimental data in future versions.\n","authors":["Zhixiao Qi","Yijiong Yu","Meiqi Tu","Junyi Tan","Yongfeng Huang"],"pdf_url":"https://arxiv.org/pdf/2308.10173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10170v1","updated":"2023-08-20T05:44:18Z","published":"2023-08-20T05:44:18Z","title":"FashionNTM: Multi-turn Fashion Image Retrieval via Cascaded Memory","summary":" Multi-turn textual feedback-based fashion image retrieval focuses on a\nreal-world setting, where users can iteratively provide information to refine\nretrieval results until they find an item that fits all their requirements. In\nthis work, we present a novel memory-based method, called FashionNTM, for such\na multi-turn system. 
Our framework incorporates a new Cascaded Memory Neural\nTuring Machine (CM-NTM) approach for implicit state management, thereby\nlearning to integrate information across all past turns to retrieve new images,\nfor a given turn. Unlike vanilla Neural Turing Machine (NTM), our CM-NTM\noperates on multiple inputs, which interact with their respective memories via\nindividual read and write heads, to learn complex relationships. Extensive\nevaluation results show that our proposed method outperforms the previous\nstate-of-the-art algorithm by 50.5%, on Multi-turn FashionIQ -- the only\nexisting multi-turn fashion dataset currently, in addition to having a relative\nimprovement of 12.6% on Multi-turn Shoes -- an extension of the single-turn\nShoes dataset that we created in this work. Further analysis of the model in a\nreal-world interactive setting demonstrates two important capabilities of our\nmodel -- memory retention across turns, and agnosticity to turn order for\nnon-contradictory feedback. Finally, user study results show that images\nretrieved by FashionNTM were favored by 83.1% over other multi-turn models.\nProject page: https://sites.google.com/eng.ucsd.edu/fashionntm\n","authors":["Anwesan Pal","Sahil Wadhwa","Ayush Jaiswal","Xu Zhang","Yue Wu","Rakesh Chada","Pradeep Natarajan","Henrik I. Christensen"],"pdf_url":"https://arxiv.org/pdf/2308.10170v1.pdf","comment":"Paper accepted at ICCV-2023"},{"id":"http://arxiv.org/abs/2308.10168v1","updated":"2023-08-20T05:31:03Z","published":"2023-08-20T05:31:03Z","title":"Head-to-Tail: How Knowledgeable are Large Language Models (LLM)? A.K.A.\n Will LLMs Replace Knowledge Graphs?","summary":" Since the recent prosperity of Large Language Models (LLMs), there have been\ninterleaved discussions regarding how to reduce hallucinations from LLM\nresponses, how to increase the factuality of LLMs, and whether Knowledge Graphs\n(KGs), which store the world knowledge in a symbolic form, will be replaced\nwith LLMs. In this paper, we try to answer these questions from a new angle:\nHow knowledgeable are LLMs?\n To answer this question, we constructed Head-to-Tail, a benchmark that\nconsists of 18K question-answer (QA) pairs regarding head, torso, and tail\nfacts in terms of popularity. We designed an automated evaluation method and a\nset of metrics that closely approximate the knowledge an LLM confidently\ninternalizes. Through a comprehensive evaluation of 14 publicly available LLMs,\nwe show that existing LLMs are still far from being perfect in terms of their\ngrasp of factual knowledge, especially for facts of torso-to-tail entities.\n","authors":["Kai Sun","Yifan Ethan Xu","Hanwen Zha","Yue Liu","Xin Luna Dong"],"pdf_url":"https://arxiv.org/pdf/2308.10168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09438v2","updated":"2023-08-20T04:54:10Z","published":"2023-05-16T13:50:24Z","title":"MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with\n Transformers","summary":" Automatic source-to-source parallelization of serial code for shared and\ndistributed memory systems is a challenging task in high-performance computing.\nWhile many attempts were made to translate serial code into parallel code for a\nshared memory environment (usually using OpenMP), none has managed to do so for\na distributed memory environment. 
In this paper, we propose a novel approach,\ncalled MPI-rical, for automated MPI code generation using a transformer-based\nmodel trained on approximately 25,000 serial code snippets and their\ncorresponding parallelized MPI code out of more than 50,000 code snippets in\nour corpus (MPICodeCorpus). To evaluate the performance of the model, we first\nbreak down the serial code to MPI-based parallel code translation problem into\ntwo sub-problems and develop two research objectives: code completion defined\nas given a location in the source code, predict the MPI function for that\nlocation, and code translation defined as predicting an MPI function as well as\nits location in the source code. We evaluate MPI-rical on MPICodeCorpus dataset\nand on real-world scientific code benchmarks and compare its performance\nbetween the code completion and translation tasks. Our experimental results\nshow that while MPI-rical performs better on the code completion task than the\ncode translation task, the latter is better suited for real-world programming\nassistance, in which the tool suggests the need for an MPI function regardless\nof prior knowledge. Overall, our approach represents a significant step forward\nin automating the parallelization of serial code for distributed memory\nsystems, which can save valuable time and resources for software developers and\nresearchers. The source code used in this work, as well as other relevant\nsources, are available at:\nhttps://github.com/Scientific-Computing-Lab-NRCN/MPI-rical\n","authors":["Nadav Schneider","Tal Kadosh","Niranjan Hasabnis","Timothy Mattson","Yuval Pinter","Gal Oren"],"pdf_url":"https://arxiv.org/pdf/2305.09438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10149v1","updated":"2023-08-20T03:30:22Z","published":"2023-08-20T03:30:22Z","title":"A Survey on Fairness in Large Language Models","summary":" Large language models (LLMs) have shown powerful performance and development\nprospect and are widely deployed in the real world. However, LLMs can capture\nsocial biases from unprocessed training data and propagate the biases to\ndownstream tasks. Unfair LLM systems have undesirable social impacts and\npotential harms. In this paper, we provide a comprehensive review of related\nresearch on fairness in LLMs. First, for medium-scale LLMs, we introduce\nevaluation metrics and debiasing methods from the perspectives of intrinsic\nbias and extrinsic bias, respectively. Then, for large-scale LLMs, we introduce\nrecent fairness research, including fairness evaluation, reasons for bias, and\ndebiasing methods. Finally, we discuss and provide insight on the challenges\nand future directions for the development of fairness in LLMs.\n","authors":["Yingji Li","Mengnan Du","Rui Song","Xin Wang","Ying Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10149v1.pdf","comment":"12 pages, 2 figures, 101 references"},{"id":"http://arxiv.org/abs/2308.10144v1","updated":"2023-08-20T03:03:34Z","published":"2023-08-20T03:03:34Z","title":"ExpeL: LLM Agents Are Experiential Learners","summary":" The recent surge in research interest in applying large language models\n(LLMs) to decision-making tasks has flourished by leveraging the extensive\nworld knowledge embedded in LLMs. 
While there is a growing demand to tailor\nLLMs for custom decision-making tasks, finetuning them for specific tasks is\nresource-intensive and may diminish the model's generalization capabilities.\nMoreover, state-of-the-art language models like GPT-4 and Claude are primarily\naccessible through API calls, with their parametric weights remaining\nproprietary and unavailable to the public. This scenario emphasizes the growing\nneed for new methodologies that allow learning from agent experiences without\nrequiring parametric updates. To address these problems, we introduce the\nExperiential Learning (ExpeL) agent. Our agent autonomously gathers experiences\nand extracts knowledge using natural language from a collection of training\ntasks. At inference, the agent recalls its extracted insights and past\nexperiences to make informed decisions. Our empirical results highlight the\nrobust learning efficacy of the ExpeL agent, indicating a consistent\nenhancement in its performance as it accumulates experiences. We further\nexplore the emerging capabilities and transfer learning potential of the ExpeL\nagent through qualitative observations and additional experiments.\n","authors":["Andrew Zhao","Daniel Huang","Quentin Xu","Matthieu Lin","Yong-Jin Liu","Gao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.10144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05534v3","updated":"2023-08-20T00:25:44Z","published":"2022-09-12T18:29:55Z","title":"PreSTU: Pre-Training for Scene-Text Understanding","summary":" The ability to recognize and reason about text embedded in visual inputs is\noften lacking in vision-and-language (V&L) models, perhaps because V&L\npre-training methods have often failed to include such an ability in their\ntraining objective. In this paper, we propose PreSTU, a novel pre-training\nrecipe dedicated to scene-text understanding (STU). PreSTU introduces OCR-aware\npre-training objectives that encourage the model to recognize text from an\nimage and connect it to the rest of the image content. We implement PreSTU\nusing a simple transformer-based encoder-decoder architecture, combined with\nlarge-scale image-text datasets with scene text obtained from an off-the-shelf\nOCR system. We empirically demonstrate the effectiveness of this pre-training\napproach on eight visual question answering and four image captioning\nbenchmarks.\n","authors":["Jihyung Kil","Soravit Changpinyo","Xi Chen","Hexiang Hu","Sebastian Goodman","Wei-Lun Chao","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2209.05534v3.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11462v1","updated":"2023-08-20T22:08:03Z","published":"2023-08-20T22:08:03Z","title":"LegalBench: A Collaboratively Built Benchmark for Measuring Legal\n Reasoning in Large Language Models","summary":" The advent of large language models (LLMs) and their adoption by the legal\ncommunity has given rise to the question: what types of legal reasoning can\nLLMs perform? To enable greater study of this question, we present LegalBench:\na collaboratively constructed legal reasoning benchmark consisting of 162 tasks\ncovering six different types of legal reasoning. LegalBench was built through\nan interdisciplinary process, in which we collected tasks designed and\nhand-crafted by legal professionals. Because these subject matter experts took\na leading role in construction, tasks either measure legal reasoning\ncapabilities that are practically useful, or measure reasoning skills that\nlawyers find interesting. 
To enable cross-disciplinary conversations about LLMs\nin the law, we additionally show how popular legal frameworks for describing\nlegal reasoning -- which distinguish between its many forms -- correspond to\nLegalBench tasks, thus giving lawyers and LLM developers a common vocabulary.\nThis paper describes LegalBench, presents an empirical evaluation of 20\nopen-source and commercial LLMs, and illustrates the types of research\nexplorations LegalBench enables.\n","authors":["Neel Guha","Julian Nyarko","Daniel E. Ho","Christopher Ré","Adam Chilton","Aditya Narayana","Alex Chohlas-Wood","Austin Peters","Brandon Waldon","Daniel N. Rockmore","Diego Zambrano","Dmitry Talisman","Enam Hoque","Faiz Surani","Frank Fagan","Galit Sarfaty","Gregory M. Dickinson","Haggai Porat","Jason Hegland","Jessica Wu","Joe Nudell","Joel Niklaus","John Nay","Jonathan H. Choi","Kevin Tobia","Margaret Hagan","Megan Ma","Michael Livermore","Nikon Rasumov-Rahe","Nils Holzenberger","Noam Kolt","Peter Henderson","Sean Rehaag","Sharad Goel","Shang Gao","Spencer Williams","Sunny Gandhi","Tom Zur","Varun Iyer","Zehua Li"],"pdf_url":"https://arxiv.org/pdf/2308.11462v1.pdf","comment":"143 pages, 79 tables, 4 figures"},{"id":"http://arxiv.org/abs/2308.11589v1","updated":"2023-08-20T09:59:40Z","published":"2023-08-20T09:59:40Z","title":"Indonesian Automatic Speech Recognition with XLSR-53","summary":" This study focuses on the development of Indonesian Automatic Speech\nRecognition (ASR) using the XLSR-53 pre-trained model, the XLSR stands for\ncross-lingual speech representations. The use of this XLSR-53 pre-trained model\nis to significantly reduce the amount of training data in non-English languages\nrequired to achieve a competitive Word Error Rate (WER). The total amount of\ndata used in this study is 24 hours, 18 minutes, and 1 second: (1) TITML-IDN 14\nhours and 31 minutes; (2) Magic Data 3 hours and 33 minutes; and (3) Common\nVoice 6 hours, 14 minutes, and 1 second. With a WER of 20%, the model built in\nthis study can compete with similar models using the Common Voice dataset split\ntest. WER can be decreased by around 8% using a language model, resulted in WER\nfrom 20% to 12%. Thus, the results of this study have succeeded in perfecting\nprevious research in contributing to the creation of a better Indonesian ASR\nwith a smaller amount of data.\n","authors":["Panji Arisaputra","Amalia Zahra"],"pdf_url":"https://arxiv.org/pdf/2308.11589v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.10392v1","updated":"2023-08-20T23:50:22Z","published":"2023-08-20T23:50:22Z","title":"Towards Generalizable Morph Attack Detection with Consistency\n Regularization","summary":" Though recent studies have made significant progress in morph attack\ndetection by virtue of deep neural networks, they often fail to generalize well\nto unseen morph attacks. With numerous morph attacks emerging frequently,\ngeneralizable morph attack detection has gained significant attention. This\npaper focuses on enhancing the generalization capability of morph attack\ndetection from the perspective of consistency regularization. Consistency\nregularization operates under the premise that generalizable morph attack\ndetection should output consistent predictions irrespective of the possible\nvariations that may occur in the input space. 
In this work, to reach this\nobjective, two simple yet effective morph-wise augmentations are proposed to\nexplore a wide space of realistic morph transformations in our consistency\nregularization. Then, the model is regularized to learn consistently at the\nlogit as well as embedding levels across a wide range of morph-wise augmented\nimages. The proposed consistency regularization aligns the abstraction in the\nhidden layers of our model across the morph attack images which are generated\nfrom diverse domains in the wild. Experimental results demonstrate the superior\ngeneralization and robustness performance of our proposed method compared to\nthe state-of-the-art studies.\n","authors":["Hossein Kashiani","Niloufar Alipour Talemi","Mohammad Saeed Ebrahimi Saadabadi","Nasser M. Nasrabadi"],"pdf_url":"https://arxiv.org/pdf/2308.10392v1.pdf","comment":"Accepted to the IEEE International Joint Conference on Biometrics\n (IJCB), 2023"},{"id":"http://arxiv.org/abs/2301.09253v2","updated":"2023-08-20T23:34:39Z","published":"2023-01-23T03:32:57Z","title":"CircNet: Meshing 3D Point Clouds with Circumcenter Detection","summary":" Reconstructing 3D point clouds into triangle meshes is a key problem in\ncomputational geometry and surface reconstruction. Point cloud triangulation\nsolves this problem by providing edge information to the input points. Since no\nvertex interpolation is involved, it is beneficial to preserve sharp details on\nthe surface. Taking advantage of learning-based techniques in triangulation,\nexisting methods enumerate the complete combinations of candidate triangles,\nwhich is both complex and inefficient. In this paper, we leverage the duality\nbetween a triangle and its circumcenter, and introduce a deep neural network\nthat detects the circumcenters to achieve point cloud triangulation.\nSpecifically, we introduce multiple anchor priors to divide the neighborhood\nspace of each point. The neural network then learns to predict the presences\nand locations of circumcenters under the guidance of those anchors. We extract\nthe triangles dual to the detected circumcenters to form a primitive mesh, from\nwhich an edge-manifold mesh is produced via simple post-processing. Unlike\nexisting learning-based triangulation methods, the proposed method bypasses an\nexhaustive enumeration of triangle combinations and local surface\nparameterization. We validate the efficiency, generalization, and robustness of\nour method on prominent datasets of both watertight and open surfaces. The code\nand trained models are provided at https://github.com/EnyaHermite/CircNet.\n","authors":["Huan Lei","Ruitao Leng","Liang Zheng","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2301.09253v2.pdf","comment":"accepted to ICLR2023"},{"id":"http://arxiv.org/abs/2308.10382v1","updated":"2023-08-20T23:01:46Z","published":"2023-08-20T23:01:46Z","title":"False Negative/Positive Control for SAM on Noisy Medical Images","summary":" The Segment Anything Model (SAM) is a recently developed all-range foundation\nmodel for image segmentation. It can use sparse manual prompts such as bounding\nboxes to generate pixel-level segmentation in natural images but struggles in\nmedical images such as low-contrast, noisy ultrasound images. We propose a\nrefined test-phase prompt augmentation technique designed to improve SAM's\nperformance in medical image segmentation. 
The method couples multi-box prompt\naugmentation and an aleatoric uncertainty-based false-negative (FN) and\nfalse-positive (FP) correction (FNPC) strategy. We evaluate the method on two\nultrasound datasets and show improvement in SAM's performance and robustness to\ninaccurate prompts, without the necessity for further training or tuning.\nMoreover, we present the Single-Slice-to-Volume (SS2V) method, enabling 3D\npixel-level segmentation using only the bounding box annotation from a single\n2D slice. Our results allow efficient use of SAM in even noisy, low-contrast\nmedical images. The source code will be released soon.\n","authors":["Xing Yao","Han Liu","Dewei Hu","Daiwei Lu","Ange Lou","Hao Li","Ruining Deng","Gabriel Arenas","Baris Oguz","Nadav Schwartz","Brett C Byram","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2308.10382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01921v3","updated":"2023-08-20T22:57:46Z","published":"2023-05-03T06:38:35Z","title":"DiffFacto: Controllable Part-Based 3D Point Cloud Generation with Cross\n Diffusion","summary":" While the community of 3D point cloud generation has witnessed a big growth\nin recent years, there still lacks an effective way to enable intuitive user\ncontrol in the generation process, hence limiting the general utility of such\nmethods. Since an intuitive way of decomposing a shape is through its parts, we\npropose to tackle the task of controllable part-based point cloud generation.\nWe introduce DiffFacto, a novel probabilistic generative model that learns the\ndistribution of shapes with part-level control. We propose a factorization that\nmodels independent part style and part configuration distributions and presents\na novel cross-diffusion network that enables us to generate coherent and\nplausible shapes under our proposed factorization. Experiments show that our\nmethod is able to generate novel shapes with multiple axes of control. It\nachieves state-of-the-art part-level generation quality and generates plausible\nand coherent shapes while enabling various downstream editing applications such\nas shape interpolation, mixing, and transformation editing. Project website:\nhttps://difffacto.github.io/\n","authors":["Kiyohiro Nakayama","Mikaela Angelina Uy","Jiahui Huang","Shi-Min Hu","Ke Li","Leonidas J Guibas"],"pdf_url":"https://arxiv.org/pdf/2305.01921v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10373v1","updated":"2023-08-20T21:47:54Z","published":"2023-08-20T21:47:54Z","title":"HoSNN: Adversarially-Robust Homeostatic Spiking Neural Networks with\n Adaptive Firing Thresholds","summary":" Spiking neural networks (SNNs) offer promise for efficient and powerful\nneurally inspired computation. Common to other types of neural networks,\nhowever, SNNs face the severe issue of vulnerability to adversarial attacks. We\npresent the first study that draws inspiration from neural homeostasis to\ndevelop a bio-inspired solution that counters the susceptibilities of SNNs to\nadversarial onslaughts. At the heart of our approach is a novel\nthreshold-adapting leaky integrate-and-fire (TA-LIF) neuron model, which we\nadopt to construct the proposed adversarially robust homeostatic SNN (HoSNN).\nDistinct from traditional LIF models, our TA-LIF model incorporates a\nself-stabilizing dynamic thresholding mechanism, curtailing adversarial noise\npropagation and safeguarding the robustness of HoSNNs in an unsupervised\nmanner. 
Theoretical analysis is presented to shed light on the stability and\nconvergence properties of the TA-LIF neurons, underscoring their superior\ndynamic robustness under input distributional shifts over traditional LIF\nneurons. Remarkably, without explicit adversarial training, our HoSNNs\ndemonstrate inherent robustness on CIFAR-10, with accuracy improvements to\n72.6% and 54.19% against FGSM and PGD attacks, up from 20.97% and 0.6%,\nrespectively. Furthermore, with minimal FGSM adversarial training, our HoSNNs\nsurpass previous models by 29.99% under FGSM and 47.83% under PGD attacks on\nCIFAR-10. Our findings offer a new perspective on harnessing biological\nprinciples for bolstering SNNs adversarial robustness and defense, paving the\nway to more resilient neuromorphic computing.\n","authors":["Hejia Geng","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2308.10373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10372v1","updated":"2023-08-20T21:46:05Z","published":"2023-08-20T21:46:05Z","title":"Developing a Machine Learning-Based Clinical Decision Support Tool for\n Uterine Tumor Imaging","summary":" Uterine leiomyosarcoma (LMS) is a rare but aggressive malignancy. On imaging,\nit is difficult to differentiate LMS from, for example, degenerated leiomyoma\n(LM), a prevalent but benign condition. We curated a data set of 115 axial\nT2-weighted MRI images from 110 patients (mean [range] age=45 [17-81] years)\nwith UTs that included five different tumor types. These data were randomly\nsplit stratifying on tumor volume into training (n=85) and test sets (n=30). An\nindependent second reader (reader 2) provided manual segmentations for all test\nset images. To automate segmentation, we applied nnU-Net and explored the\neffect of training set size on performance by randomly generating subsets with\n25, 45, 65 and 85 training set images. We evaluated the ability of radiomic\nfeatures to distinguish between types of UT individually and when combined\nthrough feature selection and machine learning. Using the entire training set\nthe mean [95% CI] fibroid DSC was measured as 0.87 [0.59-1.00] and the\nagreement between the two readers was 0.89 [0.77-1.0] on the test set. When\nclassifying degenerated LM from LMS we achieve a test set F1-score of 0.80.\nClassifying UTs based on radiomic features we identify classifiers achieving\nF1-scores of 0.53 [0.45, 0.61] and 0.80 [0.80, 0.80] on the test set for the\nbenign versus malignant, and degenerated LM versus LMS tasks. We show that it\nis possible to develop an automated method for 3D segmentation of the uterus\nand UT that is close to human-level performance with fewer than 150 annotated\nimages. For distinguishing UT types, while we train models that merit further\ninvestigation with additional data, reliable automatic differentiation of UTs\nremains a challenge.\n","authors":["Darryl E. Wright","Adriana V. Gregory","Deema Anaam","Sepideh Yadollahi","Sumana Ramanathan","Kafayat A. Oyemade","Reem Alsibai","Heather Holmes","Harrison Gottlich","Cherie-Akilah G. Browne","Sarah L. Cohen Rassier","Isabel Green","Elizabeth A. Stewart","Hiroaki Takahashi","Bohyun Kim","Shannon Laughlin-Tommaso","Timothy L. 
Kline"],"pdf_url":"https://arxiv.org/pdf/2308.10372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10368v1","updated":"2023-08-20T21:26:37Z","published":"2023-08-20T21:26:37Z","title":"Prediction of Pneumonia and COVID-19 Using Deep Neural Networks","summary":" Pneumonia, caused by bacteria and viruses, is a rapidly spreading viral\ninfection with global implications. Prompt identification of infected\nindividuals is crucial for containing its transmission. This study explores the\npotential of medical image analysis to address this challenge. We propose\nmachine-learning techniques for predicting Pneumonia from chest X-ray images.\nChest X-ray imaging is vital for Pneumonia diagnosis due to its accessibility\nand cost-effectiveness. However, interpreting X-rays for Pneumonia detection\ncan be complex, as radiographic features can overlap with other respiratory\nconditions. We evaluate the performance of different machine learning models,\nincluding DenseNet121, Inception Resnet-v2, Inception Resnet-v3, Resnet50, and\nXception, using chest X-ray images of pneumonia patients. Performance measures\nand confusion matrices are employed to assess and compare the models. The\nfindings reveal that DenseNet121 outperforms other models, achieving an\naccuracy rate of 99.58%. This study underscores the significance of machine\nlearning in the accurate detection of Pneumonia, leveraging chest X-ray images.\nOur study offers insights into the potential of technology to mitigate the\nspread of pneumonia through precise diagnostics.\n","authors":["M. S. Haque","M. S. Taluckder","S. B. Shawkat","M. A. Shahriyar","M. A. Sayed","C. Modak"],"pdf_url":"https://arxiv.org/pdf/2308.10368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10577v3","updated":"2023-08-20T21:24:13Z","published":"2023-07-20T04:41:39Z","title":"Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced\n Perception based on Joint-Embedding & Contextual Label Affinity","summary":" Traditional computer vision models often necessitate extensive data\nacquisition, annotation, and validation. These models frequently struggle in\nreal-world applications, resulting in high false positive and negative rates,\nand exhibit poor adaptability to new scenarios, often requiring costly\nretraining. To address these issues, we present Ethosight, a flexible and\nadaptable zero-shot video analytics system. Ethosight begins from a clean slate\nbased on user-defined video analytics, specified through natural language or\nkeywords, and leverages joint embedding models and reasoning mechanisms\ninformed by ontologies such as WordNet and ConceptNet. Ethosight operates\neffectively on low-cost edge devices and supports enhanced runtime adaptation,\nthereby offering a new approach to continuous learning without catastrophic\nforgetting. We provide empirical validation of Ethosight's promising\neffectiveness across diverse and complex use cases, while highlighting areas\nfor further improvement. A significant contribution of this work is the release\nof all source code and datasets to enable full reproducibility and to foster\nfurther innovation in both the research and commercial domains.\n","authors":["Hugo Latapie","Shan Yu","Patrick Hammer","Kristinn R. 
Thorisson","Vahagn Petrosyan","Brandon Kynoch","Alind Khare","Payman Behnam","Alexey Tumanov","Aksheit Saxena","Anish Aralikatti","Hanning Chen","Mohsen Imani","Mike Archbold","Tangrui Li","Pei Wang","Justin Hart"],"pdf_url":"https://arxiv.org/pdf/2307.10577v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.12667v3","updated":"2023-08-20T21:19:13Z","published":"2023-01-30T05:08:05Z","title":"NeSyFOLD: Neurosymbolic Framework for Interpretable Image Classification","summary":" Deep learning models such as CNNs have surpassed human performance in\ncomputer vision tasks such as image classification. However, despite their\nsophistication, these models lack interpretability which can lead to biased\noutcomes reflecting existing prejudices in the data. We aim to make predictions\nmade by a CNN interpretable. Hence, we present a novel framework called\nNeSyFOLD to create a neurosymbolic (NeSy) model for image classification tasks.\nThe model is a CNN with all layers following the last convolutional layer\nreplaced by a stratified answer set program (ASP). A rule-based machine\nlearning algorithm called FOLD-SE-M is used to derive the stratified answer set\nprogram from binarized filter activations of the last convolutional layer. The\nanswer set program can be viewed as a rule-set, wherein the truth value of each\npredicate depends on the activation of the corresponding kernel in the CNN. The\nrule-set serves as a global explanation for the model and is interpretable. A\njustification for the predictions made by the NeSy model can be obtained using\nan ASP interpreter. We also use our NeSyFOLD framework with a CNN that is\ntrained using a sparse kernel learning technique called Elite BackProp (EBP).\nThis leads to a significant reduction in rule-set size without compromising\naccuracy or fidelity thus improving scalability of the NeSy model and\ninterpretability of its rule-set. Evaluation is done on datasets with varied\ncomplexity and sizes. To make the rule-set more intuitive to understand, we\npropose a novel algorithm for labelling each kernel's corresponding predicate\nin the rule-set with the semantic concept(s) it learns. We evaluate the\nperformance of our \"semantic labelling algorithm\" to quantify the efficacy of\nthe semantic labelling for both the NeSy model and the NeSy-EBP model.\n","authors":["Parth Padalkar","Huaduo Wang","Gopal Gupta"],"pdf_url":"https://arxiv.org/pdf/2301.12667v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10362v1","updated":"2023-08-20T20:43:11Z","published":"2023-08-20T20:43:11Z","title":"Vehicle Cameras Guide mmWave Beams: Approach and Real-World V2V\n Demonstration","summary":" Accurately aligning millimeter-wave (mmWave) and terahertz (THz) narrow beams\nis essential to satisfy reliability and high data rates of 5G and beyond\nwireless communication systems. However, achieving this objective is difficult,\nespecially in vehicle-to-vehicle (V2V) communication scenarios, where both\ntransmitter and receiver are constantly mobile. Recently, additional sensing\nmodalities, such as visual sensors, have attracted significant interest due to\ntheir capability to provide accurate information about the wireless\nenvironment. To that end, in this paper, we develop a deep learning solution\nfor V2V scenarios to predict future beams using images from a 360 camera\nattached to the vehicle. The developed solution is evaluated on a real-world\nmulti-modal mmWave V2V communication dataset comprising co-existing 360 camera\nand mmWave beam training data. 
The proposed vision-aided solution achieves\n$\\approx 85\\%$ top-5 beam prediction accuracy while significantly reducing the\nbeam training overhead. This highlights the potential of utilizing vision for\nenabling highly-mobile V2V communications.\n","authors":["Tawfik Osman","Gouranga Charan","Ahmed Alkhateeb"],"pdf_url":"https://arxiv.org/pdf/2308.10362v1.pdf","comment":"Dataset and code files are available on the DeepSense 6G website\n https://deepsense6g.net/"},{"id":"http://arxiv.org/abs/2301.01520v2","updated":"2023-08-20T20:37:31Z","published":"2023-01-04T10:17:16Z","title":"Towards Explainable Land Cover Mapping: a Counterfactual-based Strategy","summary":" Counterfactual explanations are an emerging tool to enhance interpretability\nof deep learning models. Given a sample, these methods seek to find and display\nto the user similar samples across the decision boundary. In this paper, we\npropose a generative adversarial counterfactual approach for satellite image\ntime series in a multi-class setting for the land cover classification task.\nOne of the distinctive features of the proposed approach is the lack of prior\nassumption on the targeted class for a given counterfactual explanation. This\ninherent flexibility allows for the discovery of interesting information on the\nrelationship between land cover classes. The other feature consists of\nencouraging the counterfactual to differ from the original sample only in a\nsmall and compact temporal segment. These time-contiguous perturbations allow\nfor a much sparser and, thus, interpretable solution. Furthermore,\nplausibility/realism of the generated counterfactual explanations is enforced\nvia the proposed adversarial learning strategy.\n","authors":["Cassio F. Dantas","Diego Marcos","Dino Ienco"],"pdf_url":"https://arxiv.org/pdf/2301.01520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05784v2","updated":"2023-08-20T20:21:44Z","published":"2023-08-10T16:33:59Z","title":"High-performance Data Management for Whole Slide Image Analysis in\n Digital Pathology","summary":" When dealing with giga-pixel digital pathology in whole-slide imaging, a\nnotable proportion of data records holds relevance during each analysis\noperation. For instance, when deploying an image analysis algorithm on\nwhole-slide images (WSI), the computational bottleneck often lies in the\ninput-output (I/O) system. This is particularly notable as patch-level\nprocessing introduces a considerable I/O load onto the computer system.\nHowever, this data management process could be further paralleled, given the\ntypical independence of patch-level image processes across different patches.\nThis paper details our endeavors in tackling this data access challenge by\nimplementing the Adaptable IO System version 2 (ADIOS2). Our focus has been\nconstructing and releasing a digital pathology-centric pipeline using ADIOS2,\nwhich facilitates streamlined data management across WSIs. Additionally, we've\ndeveloped strategies aimed at curtailing data retrieval times. The performance\nevaluation encompasses two key scenarios: (1) a pure CPU-based image analysis\nscenario (\"CPU scenario\"), and (2) a GPU-based deep learning framework scenario\n(\"GPU scenario\"). Our findings reveal noteworthy outcomes. Under the CPU\nscenario, ADIOS2 showcases an impressive two-fold speed-up compared to the\nbrute-force approach. In the GPU scenario, its performance stands on par with\nthe cutting-edge GPU I/O acceleration framework, NVIDIA Magnum IO GPU Direct\nStorage (GDS). 
From what we know, this appears to be among the initial\ninstances, if any, of utilizing ADIOS2 within the field of digital pathology.\nThe source code has been made publicly available at\nhttps://github.com/hrlblab/adios.\n","authors":["Haoju Leng","Ruining Deng","Shunxing Bao","Dazheng Fang","Bryan A. Millis","Yucheng Tang","Haichun Yang","Xiao Wang","Yifan Peng","Lipeng Wan","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2308.05784v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.10347v1","updated":"2023-08-20T19:45:00Z","published":"2023-08-20T19:45:00Z","title":"Enhancing Transformers without Self-supervised Learning: A Loss\n Landscape Perspective in Sequential Recommendation","summary":" Transformer and its variants are a powerful class of architectures for\nsequential recommendation, owing to their ability of capturing a user's dynamic\ninterests from their past interactions. Despite their success,\nTransformer-based models often require the optimization of a large number of\nparameters, making them difficult to train from sparse data in sequential\nrecommendation. To address the problem of data sparsity, previous studies have\nutilized self-supervised learning to enhance Transformers, such as pre-training\nembeddings from item attributes or contrastive data augmentations. However,\nthese approaches encounter several training issues, including initialization\nsensitivity, manual data augmentations, and large batch-size memory\nbottlenecks.\n In this work, we investigate Transformers from the perspective of loss\ngeometry, aiming to enhance the models' data efficiency and generalization in\nsequential recommendation. We observe that Transformers (e.g., SASRec) can\nconverge to extremely sharp local minima if not adequately regularized.\nInspired by the recent Sharpness-Aware Minimization (SAM), we propose SAMRec,\nwhich significantly improves the accuracy and robustness of sequential\nrecommendation. SAMRec performs comparably to state-of-the-art self-supervised\nTransformers, such as S$^3$Rec and CL4SRec, without the need for pre-training\nor strong data augmentations.\n","authors":["Vivian Lai","Huiyuan Chen","Chin-Chia Michael Yeh","Minghua Xu","Yiwei Cai","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2308.10347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10220v1","updated":"2023-08-20T10:08:55Z","published":"2023-08-20T10:08:55Z","title":"Designing and Evaluating Presentation Strategies for Fact-Checked\n Content","summary":" With the rapid growth of online misinformation, it is crucial to have\nreliable fact-checking methods. Recent research on finding check-worthy claims\nand automated fact-checking have made significant advancements. However,\nlimited guidance exists regarding the presentation of fact-checked content to\neffectively convey verified information to users. We address this research gap\nby exploring the critical design elements in fact-checking reports and\ninvestigating whether credibility and presentation-based design improvements\ncan enhance users' ability to interpret the report accurately. We co-developed\npotential content presentation strategies through a workshop involving\nfact-checking professionals, communication experts, and researchers. The\nworkshop examined the significance and utility of elements such as veracity\nindicators and explored the feasibility of incorporating interactive components\nfor enhanced information disclosure. 
Building on the workshop outcomes, we\nconducted an online experiment involving 76 crowd workers to assess the\nefficacy of different design strategies. The results indicate that proposed\nstrategies significantly improve users' ability to accurately interpret the\nverdict of fact-checking articles. Our findings underscore the critical role of\neffective presentation of fact reports in addressing the spread of\nmisinformation. By adopting appropriate design enhancements, the effectiveness\nof fact-checking reports can be maximized, enabling users to make informed\njudgments.\n","authors":["Danula Hettiachchi","Kaixin Ji","Jenny Kennedy","Anthony McCosker","Flora Dylis Salim","Mark Sanderson","Falk Scholer","Damiano Spina"],"pdf_url":"https://arxiv.org/pdf/2308.10220v1.pdf","comment":"Accepted to the 32nd ACM International Conference on Information and\n Knowledge Management (CIKM '23)"},{"id":"http://arxiv.org/abs/2308.10191v1","updated":"2023-08-20T07:42:09Z","published":"2023-08-20T07:42:09Z","title":"Offline Pseudo Relevance Feedback for Efficient and Effective\n Single-pass Dense Retrieval","summary":" Dense retrieval has made significant advancements in information retrieval\n(IR) by achieving high levels of effectiveness while maintaining online\nefficiency during a single-pass retrieval process. However, the application of\npseudo relevance feedback (PRF) to further enhance retrieval effectiveness\nresults in a doubling of online latency. To address this challenge, this paper\npresents a single-pass dense retrieval framework that shifts the PRF process\noffline through the utilization of pre-generated pseudo-queries. As a result,\nonline retrieval is reduced to a single matching with the pseudo-queries, hence\nproviding faster online retrieval. The effectiveness of the proposed approach\nis evaluated on the standard TREC DL and HARD datasets, and the results\ndemonstrate its promise. Our code is openly available at\nhttps://github.com/Rosenberg37/OPRF.\n","authors":["Xueru Wen","Xiaoyang Chen","Xuanang Chen","Ben He","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2308.10191v1.pdf","comment":"Accepted at SIGIR2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.10386v1","updated":"2023-08-20T23:14:52Z","published":"2023-08-20T23:14:52Z","title":"Unsupervised Opinion Aggregation -- A Statistical Perspective","summary":" Complex decision-making systems rarely have direct access to the current\nstate of the world and they instead rely on opinions to form an understanding\nof what the ground truth could be. Even in problems where experts provide\nopinions without any intention to manipulate the decision maker, it is\nchallenging to decide which expert's opinion is more reliable -- a challenge\nthat is further amplified when decision-maker has limited, delayed, or no\naccess to the ground truth after the fact. This paper explores a statistical\napproach to infer the competence of each expert based on their opinions without\nany need for the ground truth. Echoing the logic behind what is commonly\nreferred to as \\textit{the wisdom of crowds}, we propose measuring the\ncompetence of each expert by their likeliness to agree with their peers. We\nfurther show that the more reliable an expert is the more likely it is that\nthey agree with their peers. We leverage this fact to propose a completely\nunsupervised version of the na\\\"{i}ve Bayes classifier and show that the\nproposed technique is asymptotically optimal for a large class of problems. 
In\naddition to aggregating a large block of opinions, we further apply our\ntechnique for online opinion aggregation and for decision-making based on a\nlimited the number of opinions.\n","authors":["Noyan C. Sevuktekin","Andrew C. Singer"],"pdf_url":"https://arxiv.org/pdf/2308.10386v1.pdf","comment":"This research was conducted during Noyan Sevuktekin's time at\n University of Illinois at Urbana-Champaign and the results were first\n presented in Chapter 3 of his dissertation, entitled \"Learning From\n Opinions\". Permalink: https://hdl.handle.net/2142/110814"},{"id":"http://arxiv.org/abs/2308.10373v1","updated":"2023-08-20T21:47:54Z","published":"2023-08-20T21:47:54Z","title":"HoSNN: Adversarially-Robust Homeostatic Spiking Neural Networks with\n Adaptive Firing Thresholds","summary":" Spiking neural networks (SNNs) offer promise for efficient and powerful\nneurally inspired computation. Common to other types of neural networks,\nhowever, SNNs face the severe issue of vulnerability to adversarial attacks. We\npresent the first study that draws inspiration from neural homeostasis to\ndevelop a bio-inspired solution that counters the susceptibilities of SNNs to\nadversarial onslaughts. At the heart of our approach is a novel\nthreshold-adapting leaky integrate-and-fire (TA-LIF) neuron model, which we\nadopt to construct the proposed adversarially robust homeostatic SNN (HoSNN).\nDistinct from traditional LIF models, our TA-LIF model incorporates a\nself-stabilizing dynamic thresholding mechanism, curtailing adversarial noise\npropagation and safeguarding the robustness of HoSNNs in an unsupervised\nmanner. Theoretical analysis is presented to shed light on the stability and\nconvergence properties of the TA-LIF neurons, underscoring their superior\ndynamic robustness under input distributional shifts over traditional LIF\nneurons. Remarkably, without explicit adversarial training, our HoSNNs\ndemonstrate inherent robustness on CIFAR-10, with accuracy improvements to\n72.6% and 54.19% against FGSM and PGD attacks, up from 20.97% and 0.6%,\nrespectively. Furthermore, with minimal FGSM adversarial training, our HoSNNs\nsurpass previous models by 29.99% under FGSM and 47.83% under PGD attacks on\nCIFAR-10. Our findings offer a new perspective on harnessing biological\nprinciples for bolstering SNNs adversarial robustness and defense, paving the\nway to more resilient neuromorphic computing.\n","authors":["Hejia Geng","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2308.10373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10372v1","updated":"2023-08-20T21:46:05Z","published":"2023-08-20T21:46:05Z","title":"Developing a Machine Learning-Based Clinical Decision Support Tool for\n Uterine Tumor Imaging","summary":" Uterine leiomyosarcoma (LMS) is a rare but aggressive malignancy. On imaging,\nit is difficult to differentiate LMS from, for example, degenerated leiomyoma\n(LM), a prevalent but benign condition. We curated a data set of 115 axial\nT2-weighted MRI images from 110 patients (mean [range] age=45 [17-81] years)\nwith UTs that included five different tumor types. These data were randomly\nsplit stratifying on tumor volume into training (n=85) and test sets (n=30). An\nindependent second reader (reader 2) provided manual segmentations for all test\nset images. To automate segmentation, we applied nnU-Net and explored the\neffect of training set size on performance by randomly generating subsets with\n25, 45, 65 and 85 training set images. 
We evaluated the ability of radiomic\nfeatures to distinguish between types of UT individually and when combined\nthrough feature selection and machine learning. Using the entire training set\nthe mean [95% CI] fibroid DSC was measured as 0.87 [0.59-1.00] and the\nagreement between the two readers was 0.89 [0.77-1.0] on the test set. When\nclassifying degenerated LM from LMS we achieve a test set F1-score of 0.80.\nClassifying UTs based on radiomic features we identify classifiers achieving\nF1-scores of 0.53 [0.45, 0.61] and 0.80 [0.80, 0.80] on the test set for the\nbenign versus malignant, and degenerated LM versus LMS tasks. We show that it\nis possible to develop an automated method for 3D segmentation of the uterus\nand UT that is close to human-level performance with fewer than 150 annotated\nimages. For distinguishing UT types, while we train models that merit further\ninvestigation with additional data, reliable automatic differentiation of UTs\nremains a challenge.\n","authors":["Darryl E. Wright","Adriana V. Gregory","Deema Anaam","Sepideh Yadollahi","Sumana Ramanathan","Kafayat A. Oyemade","Reem Alsibai","Heather Holmes","Harrison Gottlich","Cherie-Akilah G. Browne","Sarah L. Cohen Rassier","Isabel Green","Elizabeth A. Stewart","Hiroaki Takahashi","Bohyun Kim","Shannon Laughlin-Tommaso","Timothy L. Kline"],"pdf_url":"https://arxiv.org/pdf/2308.10372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.03776v4","updated":"2023-08-20T21:27:52Z","published":"2022-03-07T23:38:01Z","title":"A Trainable Approach to Zero-delay Smoothing Spline Interpolation","summary":" The task of reconstructing smooth signals from streamed data in the form of\nsignal samples arises in various applications. This work addresses such a task\nsubject to a zero-delay response; that is, the smooth signal must be\nreconstructed sequentially as soon as a data sample is available and without\nhaving access to subsequent data. State-of-the-art approaches solve this\nproblem by interpolating consecutive data samples using splines. Here, each\ninterpolation step yields a piece that ensures a smooth signal reconstruction\nwhile minimizing a cost metric, typically a weighted sum between the squared\nresidual and a derivative-based measure of smoothness. As a result, a\nzero-delay interpolation is achieved in exchange for an almost certainly higher\ncumulative cost as compared to interpolating all data samples together. This\npaper presents a novel approach to further reduce this cumulative cost on\naverage. First, we formulate a zero-delay smoothing spline interpolation\nproblem from a sequential decision-making perspective, allowing us to model the\nfuture impact of each interpolated piece on the average cumulative cost. Then,\nan interpolation method is proposed to exploit the temporal dependencies\nbetween the streamed data samples. Our method is assisted by a recurrent neural\nnetwork and accordingly trained to reduce the accumulated cost on average over\na set of example data samples collected from the same signal source generating\nthe signal to be reconstructed. 
Finally, we present extensive experimental\nresults for synthetic and real data showing how our approach outperforms the\nabovementioned state-of-the-art.\n","authors":["Emilio Ruiz-Moreno","Luis Miguel López-Ramos","Baltasar Beferull-Lozano"],"pdf_url":"https://arxiv.org/pdf/2203.03776v4.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2301.12667v3","updated":"2023-08-20T21:19:13Z","published":"2023-01-30T05:08:05Z","title":"NeSyFOLD: Neurosymbolic Framework for Interpretable Image Classification","summary":" Deep learning models such as CNNs have surpassed human performance in\ncomputer vision tasks such as image classification. However, despite their\nsophistication, these models lack interpretability which can lead to biased\noutcomes reflecting existing prejudices in the data. We aim to make predictions\nmade by a CNN interpretable. Hence, we present a novel framework called\nNeSyFOLD to create a neurosymbolic (NeSy) model for image classification tasks.\nThe model is a CNN with all layers following the last convolutional layer\nreplaced by a stratified answer set program (ASP). A rule-based machine\nlearning algorithm called FOLD-SE-M is used to derive the stratified answer set\nprogram from binarized filter activations of the last convolutional layer. The\nanswer set program can be viewed as a rule-set, wherein the truth value of each\npredicate depends on the activation of the corresponding kernel in the CNN. The\nrule-set serves as a global explanation for the model and is interpretable. A\njustification for the predictions made by the NeSy model can be obtained using\nan ASP interpreter. We also use our NeSyFOLD framework with a CNN that is\ntrained using a sparse kernel learning technique called Elite BackProp (EBP).\nThis leads to a significant reduction in rule-set size without compromising\naccuracy or fidelity thus improving scalability of the NeSy model and\ninterpretability of its rule-set. Evaluation is done on datasets with varied\ncomplexity and sizes. To make the rule-set more intuitive to understand, we\npropose a novel algorithm for labelling each kernel's corresponding predicate\nin the rule-set with the semantic concept(s) it learns. We evaluate the\nperformance of our \"semantic labelling algorithm\" to quantify the efficacy of\nthe semantic labelling for both the NeSy model and the NeSy-EBP model.\n","authors":["Parth Padalkar","Huaduo Wang","Gopal Gupta"],"pdf_url":"https://arxiv.org/pdf/2301.12667v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10364v1","updated":"2023-08-20T20:49:15Z","published":"2023-08-20T20:49:15Z","title":"SE(3) Equivariant Augmented Coupling Flows","summary":" Coupling normalizing flows allow for fast sampling and density evaluation,\nmaking them the tool of choice for probabilistic modeling of physical systems.\nHowever, the standard coupling architecture precludes endowing flows that\noperate on the Cartesian coordinates of atoms with the SE(3) and permutation\ninvariances of physical systems. This work proposes a coupling flow that\npreserves SE(3) and permutation equivariance by performing coordinate splits\nalong additional augmented dimensions. At each layer, the flow maps atoms'\npositions into learned SE(3) invariant bases, where we apply standard flow\ntransformations, such as monotonic rational-quadratic splines, before returning\nto the original basis. 
Crucially, our flow preserves fast sampling and density\nevaluation, and may be used to produce unbiased estimates of expectations with\nrespect to the target distribution via importance sampling. When trained on the\nDW4, LJ13 and QM9-positional datasets, our flow is competitive with equivariant\ncontinuous normalizing flows, while allowing sampling two orders of magnitude\nfaster. Moreover, to the best of our knowledge, we are the first to learn the\nfull Boltzmann distribution of alanine dipeptide by only modeling the Cartesian\npositions of its atoms. Lastly, we demonstrate that our flow can be trained to\napproximately sample from the Boltzmann distribution of the DW4 and LJ13\nparticle systems using only their energy functions.\n","authors":["Laurence I. Midgley","Vincent Stimper","Javier Antorán","Emile Mathieu","Bernhard Schölkopf","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2308.10364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.01520v2","updated":"2023-08-20T20:37:31Z","published":"2023-01-04T10:17:16Z","title":"Towards Explainable Land Cover Mapping: a Counterfactual-based Strategy","summary":" Counterfactual explanations are an emerging tool to enhance interpretability\nof deep learning models. Given a sample, these methods seek to find and display\nto the user similar samples across the decision boundary. In this paper, we\npropose a generative adversarial counterfactual approach for satellite image\ntime series in a multi-class setting for the land cover classification task.\nOne of the distinctive features of the proposed approach is the lack of prior\nassumption on the targeted class for a given counterfactual explanation. This\ninherent flexibility allows for the discovery of interesting information on the\nrelationship between land cover classes. The other feature consists of\nencouraging the counterfactual to differ from the original sample only in a\nsmall and compact temporal segment. These time-contiguous perturbations allow\nfor a much sparser and, thus, interpretable solution. Furthermore,\nplausibility/realism of the generated counterfactual explanations is enforced\nvia the proposed adversarial learning strategy.\n","authors":["Cassio F. Dantas","Diego Marcos","Dino Ienco"],"pdf_url":"https://arxiv.org/pdf/2301.01520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16699v2","updated":"2023-08-20T20:20:15Z","published":"2023-06-29T05:49:07Z","title":"Rapid-INR: Storage Efficient CPU-free DNN Training Using Implicit Neural\n Representation","summary":" Implicit Neural Representation (INR) is an innovative approach for\nrepresenting complex shapes or objects without explicitly defining their\ngeometry or surface structure. Instead, INR represents objects as continuous\nfunctions. Previous research has demonstrated the effectiveness of using neural\nnetworks as INR for image compression, showcasing comparable performance to\ntraditional methods such as JPEG. However, INR holds potential for various\napplications beyond image compression. This paper introduces Rapid-INR, a novel\napproach that utilizes INR for encoding and compressing images, thereby\naccelerating neural network training in computer vision tasks. Our methodology\ninvolves storing the whole dataset directly in INR format on a GPU, mitigating\nthe significant data communication overhead between the CPU and GPU during\ntraining. Additionally, the decoding process from INR to RGB format is highly\nparallelized and executed on-the-fly. 
To further enhance compression, we\npropose iterative and dynamic pruning, as well as layer-wise quantization,\nbuilding upon previous work. We evaluate our framework on the image\nclassification task, utilizing the ResNet-18 backbone network and three\ncommonly used datasets with varying image sizes. Rapid-INR reduces memory\nconsumption to only 5% of the original dataset size and achieves a maximum\n6$\\times$ speedup over the PyTorch training pipeline, as well as a maximum 1.2x\nspeedup over the DALI training pipeline, with only a marginal decrease in\naccuracy. Importantly, Rapid-INR can be readily applied to other computer\nvision tasks and backbone networks with reasonable engineering efforts. Our\nimplementation code is publicly available at\nhttps://github.com/sharc-lab/Rapid-INR.\n","authors":["Hanqiu Chen","Hang Yang","Stephen Fitzmeyer","Cong Hao"],"pdf_url":"https://arxiv.org/pdf/2306.16699v2.pdf","comment":"Accepted by ICCAD 2023"},{"id":"http://arxiv.org/abs/2307.04345v2","updated":"2023-08-20T19:58:49Z","published":"2023-07-10T05:06:41Z","title":"Continual Learning as Computationally Constrained Reinforcement Learning","summary":" An agent that efficiently accumulates knowledge to develop increasingly\nsophisticated skills over a long lifetime could advance the frontier of\nartificial intelligence capabilities. The design of such agents, which remains\na long-standing challenge of artificial intelligence, is addressed by the\nsubject of continual learning. This monograph clarifies and formalizes concepts\nof continual learning, introducing a framework and set of tools to stimulate\nfurther research.\n","authors":["Saurabh Kumar","Henrik Marklund","Ashish Rao","Yifan Zhu","Hong Jun Jeon","Yueyang Liu","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2307.04345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.08794v2","updated":"2023-08-20T19:40:45Z","published":"2021-11-16T21:48:09Z","title":"Investigating Conversion from Mild Cognitive Impairment to Alzheimer's\n Disease using Latent Space Manipulation","summary":" Alzheimer's disease is the most common cause of dementia that affects\nmillions of lives worldwide. Investigating the underlying causes and risk\nfactors of Alzheimer's disease is essential to prevent its progression. Mild\nCognitive Impairment (MCI) is considered an intermediate stage before\nAlzheimer's disease. Early prediction of the conversion from the MCI to\nAlzheimer's is crucial to take necessary precautions for decelerating the\nprogression and developing suitable treatments. In this study, we propose a\ndeep learning framework to discover the variables which are identifiers of the\nconversion from MCI to Alzheimer's disease. In particular, the latent space of\na variational auto-encoder network trained with the MCI and Alzheimer's\npatients is manipulated to obtain the significant attributes and decipher their\nbehavior that leads to the conversion from MCI to Alzheimer's disease. By\nutilizing a generative decoder and the dimensions that lead to the Alzheimer's\ndiagnosis, we generate synthetic dementia patients from MCI patients in the\ndataset. Experimental results show promising quantitative and qualitative\nresults on one of the most extensive and commonly used Alzheimer's disease\nneuroimaging datasets in literature.\n","authors":["Deniz Sezin Ayvaz","Inci M. 
Baytas"],"pdf_url":"https://arxiv.org/pdf/2111.08794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10345v1","updated":"2023-08-20T19:33:12Z","published":"2023-08-20T19:33:12Z","title":"Can Large Language Models Find And Fix Vulnerable Software?","summary":" In this study, we evaluated the capability of Large Language Models (LLMs),\nparticularly OpenAI's GPT-4, in detecting software vulnerabilities, comparing\ntheir performance against traditional static code analyzers like Snyk and\nFortify. Our analysis covered numerous repositories, including those from NASA\nand the Department of Defense. GPT-4 identified approximately four times the\nvulnerabilities than its counterparts. Furthermore, it provided viable fixes\nfor each vulnerability, demonstrating a low rate of false positives. Our tests\nencompassed 129 code samples across eight programming languages, revealing the\nhighest vulnerabilities in PHP and JavaScript. GPT-4's code corrections led to\na 90% reduction in vulnerabilities, requiring only an 11% increase in code\nlines. A critical insight was LLMs' ability to self-audit, suggesting fixes for\ntheir identified vulnerabilities and underscoring their precision. Future\nresearch should explore system-level vulnerabilities and integrate multiple\nstatic code analyzers for a holistic perspective on LLMs' potential.\n","authors":["David Noever"],"pdf_url":"https://arxiv.org/pdf/2308.10345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09580v2","updated":"2023-08-20T19:31:29Z","published":"2023-02-19T14:12:48Z","title":"Non-separable Covariance Kernels for Spatiotemporal Gaussian Processes\n based on a Hybrid Spectral Method and the Harmonic Oscillator","summary":" Gaussian processes provide a flexible, non-parametric framework for the\napproximation of functions in high-dimensional spaces. The covariance kernel is\nthe main engine of Gaussian processes, incorporating correlations that underpin\nthe predictive distribution. For applications with spatiotemporal datasets,\nsuitable kernels should model joint spatial and temporal dependence. Separable\nspace-time covariance kernels offer simplicity and computational efficiency.\nHowever, non-separable kernels include space-time interactions that better\ncapture observed correlations. Most non-separable kernels that admit explicit\nexpressions are based on mathematical considerations (admissibility conditions)\nrather than first-principles derivations. We present a hybrid spectral approach\nfor generating covariance kernels which is based on physical arguments. We use\nthis approach to derive a new class of physically motivated, non-separable\ncovariance kernels which have their roots in the stochastic, linear, damped,\nharmonic oscillator (LDHO). The new kernels incorporate functions with both\nmonotonic and oscillatory decay of space-time correlations. The LDHO covariance\nkernels involve space-time interactions which are introduced by dispersion\nrelations that modulate the oscillator coefficients. We derive explicit\nrelations for the spatiotemporal covariance kernels in the three oscillator\nregimes (underdamping, critical damping, overdamping) and investigate their\nproperties.\n","authors":["Dionissios T. 
Hristopulos"],"pdf_url":"https://arxiv.org/pdf/2302.09580v2.pdf","comment":"56 pages, 12 figures, five appendices"},{"id":"http://arxiv.org/abs/2308.10328v1","updated":"2023-08-20T17:52:02Z","published":"2023-08-20T17:52:02Z","title":"A Comprehensive Empirical Evaluation on Online Continual Learning","summary":" Online continual learning aims to get closer to a live learning experience by\nlearning directly on a stream of data with temporally shifting distribution and\nby storing a minimum amount of data from that stream. In this empirical\nevaluation, we evaluate various methods from the literature that tackle online\ncontinual learning. More specifically, we focus on the class-incremental\nsetting in the context of image classification, where the learner must learn\nnew classes incrementally from a stream of data. We compare these methods on\nthe Split-CIFAR100 and Split-TinyImagenet benchmarks, and measure their average\naccuracy, forgetting, stability, and quality of the representations, to\nevaluate various aspects of the algorithm at the end but also during the whole\ntraining period. We find that most methods suffer from stability and\nunderfitting issues. However, the learned representations are comparable to\ni.i.d. training under the same computational budget. No clear winner emerges\nfrom the results and basic experience replay, when properly tuned and\nimplemented, is a very strong baseline. We release our modular and extensible\ncodebase at https://github.com/AlbinSou/ocl_survey based on the avalanche\nframework to reproduce our results and encourage future research.\n","authors":["Albin Soutif--Cormerais","Antonio Carta","Andrea Cossu","Julio Hurtado","Vincenzo Lomonaco","Joost Van de Weijer","Hamed Hemati"],"pdf_url":"https://arxiv.org/pdf/2308.10328v1.pdf","comment":"ICCV Visual Continual Learning Workshop 2023 accepted paper"},{"id":"http://arxiv.org/abs/2308.10327v1","updated":"2023-08-20T17:51:24Z","published":"2023-08-20T17:51:24Z","title":"Quantum State Tomography using Quantum Machine Learning","summary":" Quantum State Tomography (QST) is a fundamental technique in Quantum\nInformation Processing (QIP) for reconstructing unknown quantum states.\nHowever, the conventional QST methods are limited by the number of measurements\nrequired, which makes them impractical for large-scale quantum systems. To\novercome this challenge, we propose the integration of Quantum Machine Learning\n(QML) techniques to enhance the efficiency of QST. In this paper, we conduct a\ncomprehensive investigation into various approaches for QST, encompassing both\nclassical and quantum methodologies; We also implement different QML approaches\nfor QST and demonstrate their effectiveness on various simulated and\nexperimental quantum systems, including multi-qubit networks. 
Our results show\nthat our QML-based QST approach can achieve high fidelity (98%) with\nsignificantly fewer measurements than conventional methods, making it a\npromising tool for practical QIP applications.\n","authors":["Nouhaila Innan","Owais Ishtiaq Siddiqui","Shivang Arora","Tamojit Ghosh","Yasemin Poyraz Koçak","Dominic Paragas","Abdullah Al Omar Galib","Muhammad Al-Zafar Khan","Mohamed Bennai"],"pdf_url":"https://arxiv.org/pdf/2308.10327v1.pdf","comment":"18 pages, 19 figures"},{"id":"http://arxiv.org/abs/2308.10322v1","updated":"2023-08-20T17:17:27Z","published":"2023-08-20T17:17:27Z","title":"Homogenising SoHO/EIT and SDO/AIA 171Å$~$ Images: A Deep Learning\n Approach","summary":" Extreme Ultraviolet images of the Sun are becoming an integral part of space\nweather prediction tasks. However, having different surveys requires the\ndevelopment of instrument-specific prediction algorithms. As an alternative, it\nis possible to combine multiple surveys to create a homogeneous dataset. In\nthis study, we utilize the temporal overlap of SoHO/EIT and SDO/AIA 171~\\AA\n~surveys to train an ensemble of deep learning models for creating a single\nhomogeneous survey of EUV images for 2 solar cycles. Prior applications of deep\nlearning have focused on validating the homogeneity of the output while\noverlooking the systematic estimation of uncertainty. We use an approach called\n`Approximate Bayesian Ensembling' to generate an ensemble of models whose\nuncertainty mimics that of a fully Bayesian neural network at a fraction of the\ncost. We find that ensemble uncertainty goes down as the training set size\nincreases. Additionally, we show that the model ensemble adds immense value to\nthe prediction by showing higher uncertainty in test data that are not well\nrepresented in the training data.\n","authors":["Subhamoy Chatterjee","Andrés Muñoz-Jaramillo","Maher Dayeh","Hazel M. Bain","Kimberly Moreland"],"pdf_url":"https://arxiv.org/pdf/2308.10322v1.pdf","comment":"20 pages, 8 figures, accepted for publication in ApJS"},{"id":"http://arxiv.org/abs/2305.01154v2","updated":"2023-08-20T17:05:10Z","published":"2023-05-02T02:04:19Z","title":"FedAVO: Improving Communication Efficiency in Federated Learning with\n African Vultures Optimizer","summary":" Federated Learning (FL), a distributed machine learning technique has\nrecently experienced tremendous growth in popularity due to its emphasis on\nuser data privacy. However, the distributed computations of FL can result in\nconstrained communication and drawn-out learning processes, necessitating the\nclient-server communication cost optimization. The ratio of chosen clients and\nthe quantity of local training passes are two hyperparameters that have a\nsignificant impact on FL performance. Due to different training preferences\nacross various applications, it can be difficult for FL practitioners to\nmanually select such hyperparameters. In our research paper, we introduce\nFedAVO, a novel FL algorithm that enhances communication effectiveness by\nselecting the best hyperparameters leveraging the African Vulture Optimizer\n(AVO). Our research demonstrates that the communication costs associated with\nFL operations can be substantially reduced by adopting AVO for FL\nhyperparameter adjustment. Through extensive evaluations of FedAVO on benchmark\ndatasets, we show that FedAVO achieves significant improvement in terms of\nmodel accuracy and communication round, particularly with realistic cases of\nNon-IID datasets. 
Our extensive evaluation of the FedAVO algorithm identifies\nthe optimal hyperparameters that are appropriately fitted for the benchmark\ndatasets, eventually increasing global model accuracy by 6% in comparison to\nthe state-of-the-art FL algorithms (such as FedAvg, FedProx, FedPSO, etc.).\n","authors":["Md Zarif Hossain","Ahmed Imteaj"],"pdf_url":"https://arxiv.org/pdf/2305.01154v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.10317v1","updated":"2023-08-20T16:35:21Z","published":"2023-08-20T16:35:21Z","title":"Towards Sustainable Development: A Novel Integrated Machine Learning\n Model for Holistic Environmental Health Monitoring","summary":" Urbanization enables economic growth but also harms the environment through\ndegradation. Traditional methods of detecting environmental issues have proven\ninefficient. Machine learning has emerged as a promising tool for tracking\nenvironmental deterioration by identifying key predictive features. Recent\nresearch focused on developing a predictive model using pollutant levels and\nparticulate matter as indicators of environmental state in order to outline\nchallenges. Machine learning was employed to identify patterns linking areas\nwith worse conditions. This research aims to assist governments in identifying\nintervention points, improving planning and conservation efforts, and\nultimately contributing to sustainable development.\n","authors":["Anirudh Mazumder","Sarthak Engala","Aditya Nallaparaju"],"pdf_url":"https://arxiv.org/pdf/2308.10317v1.pdf","comment":"5 pages, 3 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.10388v1","updated":"2023-08-20T23:30:27Z","published":"2023-08-20T23:30:27Z","title":"Neural Architectures Learning Fourier Transforms, Signal Processing and\n Much More....","summary":" This report will explore and answer fundamental questions about taking\nFourier Transforms and tying it with recent advances in AI and neural\narchitecture. One interpretation of the Fourier Transform is decomposing a\nsignal into its constituent components by projecting them onto complex\nexponentials. Variants exist, such as discrete cosine transform that does not\noperate on the complex domain and projects an input signal to only cosine\nfunctions oscillating at different frequencies. However, this is a fundamental\nlimitation, and it needs to be more suboptimal. The first one is that all\nkernels are sinusoidal: What if we could have some kernels adapted or learned\naccording to the problem? What if we can use neural architectures for this? We\nshow how one can learn these kernels from scratch for audio signal processing\napplications. We find that the neural architecture not only learns sinusoidal\nkernel shapes but discovers all kinds of incredible signal-processing\nproperties. E.g., windowing functions, onset detectors, high pass filters, low\npass filters, modulations, etc. Further, upon analysis of the filters, we find\nthat the neural architecture has a comb filter-like structure on top of the\nlearned kernels. Comb filters that allow harmonic frequencies to pass through\nare one of the core building blocks/types of filters similar to high-pass,\nlow-pass, and band-pass filters of various traditional signal processing\nalgorithms. Further, we can also use the convolution operation with a signal to\nbe learned from scratch, and we will explore papers in the literature that uses\nthis with that robust Transformer architectures. 
Further, we would also explore\nmaking the learned kernel's content adaptive, i.e., learning different kernels\nfor different inputs.\n","authors":["Prateek Verma"],"pdf_url":"https://arxiv.org/pdf/2308.10388v1.pdf","comment":"12 pages, 6 figures. Technical Report at Stanford University;\n Presented on 14th August 2023"},{"id":"http://arxiv.org/abs/2308.07056v3","updated":"2023-08-20T14:39:06Z","published":"2023-08-14T10:31:29Z","title":"VoxBlink: X-Large Speaker Verification Dataset on Camera","summary":" In this paper, we contribute a novel and extensive dataset for speaker\nverification, which contains noisy 38k identities/1.45M utterances (VoxBlink)\nand relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for\ntraining. Firstly, we accumulate a 60K+ users' list with their avatars and\ndownload their short videos on YouTube. We then established an automatic and\nscalable pipeline to extract relevant speech and video segments from these\nvideos. To our knowledge, the VoxBlink dataset is one of the largest speaker\nrecognition datasets available. Secondly, we conduct a series of experiments\nbased on different backbones trained on a mix of the VoxCeleb2 and the\nVoxBlink-Clean. Our findings highlight a notable performance improvement,\nranging from 13% to 30%, across different backbone architectures upon\nintegrating our dataset for training. The dataset will be made publicly\navailable shortly.\n","authors":["Yuke Lin","Xiaoyi Qin","Ming Cheng","Ning Jiang","Guoqing Zhao","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v3.pdf","comment":"submit to ICASSP2023"},{"id":"http://arxiv.org/abs/2308.10195v1","updated":"2023-08-20T07:56:34Z","published":"2023-08-20T07:56:34Z","title":"WMFormer++: Nested Transformer for Visible Watermark Removal via Implict\n Joint Learning","summary":" Watermarking serves as a widely adopted approach to safeguard media\ncopyright. In parallel, the research focus has extended to watermark removal\ntechniques, offering an adversarial means to enhance watermark robustness and\nfoster advancements in the watermarking field. Existing watermark removal\nmethods often rely on UNet architectures with multiple decoder branches -- one\nfor watermark localization and the other for background image restoration.\nThese methods involve complex module designs to guide information flow for\nrespective tasks, which can lead to suboptimal performance and an overly\ncumbersome model. To simplify the existing framework, we propose a novel\nTransformer-based approach with a unified decoder branch, treating watermark\nextraction and background restoration as a single task and allowing thenetwork\nto learn information flow between them without artificial design patterns.\nAdditionally, we utilize nested structures to facilitate multi-scale feature\nfusion, forming a parallel ensemble of nested structures that constitute the\nUNet. Supervision is applied to UNets with varying depths to facilitate\nknowledge learning across all levels. Extensive experiments are conducted on\nvarious challenging benchmarks to validate the effectiveness of our proposed\nmethod. 
The results demonstrate that our approach achieves state-of-the-art\nperformance and produces high-quality images.\n","authors":["Dongjian Huo","Zehong Zhang","Hanjing Su","Guanbin Li","Chaowei Fang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10195v1.pdf","comment":null}]},"2023-08-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2307.09702v4","updated":"2023-08-19T21:27:51Z","published":"2023-07-19T01:14:49Z","title":"Efficient Guided Generation for Large Language Models","summary":" In this article we show how the problem of neural text generation can be\nconstructively reformulated in terms of transitions between the states of a\nfinite-state machine. This framework leads to an efficient approach to guiding\ntext generation with regular expressions and context-free grammars by allowing\nthe construction of an index over a language model's vocabulary. The approach\nis model agnostic, allows one to enforce domain-specific knowledge and\nconstraints, and enables the construction of reliable interfaces by\nguaranteeing the structure of the generated text. It adds little overhead to\nthe token sequence generation process and significantly outperforms existing\nsolutions. An implementation is provided in the open source Python library\nOutlines\n","authors":["Brandon T. Willard","Rémi Louf"],"pdf_url":"https://arxiv.org/pdf/2307.09702v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10107v1","updated":"2023-08-19T20:48:16Z","published":"2023-08-19T20:48:16Z","title":"Bayes Risk Transducer: Transducer with Controllable Alignment Prediction","summary":" Automatic speech recognition (ASR) based on transducers is widely used. In\ntraining, a transducer maximizes the summed posteriors of all paths. The path\nwith the highest posterior is commonly defined as the predicted alignment\nbetween the speech and the transcription. While the vanilla transducer does not\nhave a prior preference for any of the valid paths, this work intends to\nenforce the preferred paths and achieve controllable alignment prediction.\nSpecifically, this work proposes Bayes Risk Transducer (BRT), which uses a\nBayes risk function to set lower risk values to the preferred paths so that the\npredicted alignment is more likely to satisfy specific desired properties. We\nfurther demonstrate that these predicted alignments with intentionally designed\nproperties can provide practical advantages over the vanilla transducer.\nExperimentally, the proposed BRT saves inference cost by up to 46% for\nnon-streaming ASR and reduces overall system latency by 41% for streaming ASR.\n","authors":["Jinchuan Tian","Jianwei Yu","Hangting Chen","Brian Yan","Chao Weng","Dong Yu","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2308.10107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10103v1","updated":"2023-08-19T20:18:15Z","published":"2023-08-19T20:18:15Z","title":"ASPIRE: Language-Guided Augmentation for Robust Image Classification","summary":" Neural image classifiers can often learn to make predictions by overly\nrelying on non-predictive features that are spuriously correlated with the\nclass labels in the training data. This leads to poor performance in real-world\natypical scenarios where such features are absent. Supplementing the training\ndataset with images without such spurious features can aid robust learning\nagainst spurious correlations via better generalization. 
This paper presents\nASPIRE (Language-guided data Augmentation for SPurIous correlation REmoval), a\nsimple yet effective solution for expanding the training dataset with synthetic\nimages without spurious features. ASPIRE, guided by language, generates these\nimages without requiring any form of additional supervision or existing\nexamples. Precisely, we employ LLMs to first extract foreground and background\nfeatures from textual descriptions of an image, followed by advanced\nlanguage-guided image editing to discover the features that are spuriously\ncorrelated with the class label. Finally, we personalize a text-to-image\ngeneration model to generate diverse in-domain images without spurious\nfeatures. We demonstrate the effectiveness of ASPIRE on 4 datasets, including\nthe very challenging Hard ImageNet dataset, and 9 baselines and show that\nASPIRE improves the classification accuracy of prior methods by 1% - 38%. Code\nsoon at: https://github.com/Sreyan88/ASPIRE.\n","authors":["Sreyan Ghosh","Chandra Kiran Reddy Evuru","Sonal Kumar","Utkarsh Tyagi","Sakshi Singh","Sanjoy Chowdhury","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2308.10103v1.pdf","comment":"Pre-print Under Review"},{"id":"http://arxiv.org/abs/2308.10092v1","updated":"2023-08-19T18:58:32Z","published":"2023-08-19T18:58:32Z","title":"Open, Closed, or Small Language Models for Text Classification?","summary":" Recent advancements in large language models have demonstrated remarkable\ncapabilities across various NLP tasks. But many questions remain, including\nwhether open-source models match closed ones, why these models excel or\nstruggle with certain tasks, and what types of practical procedures can improve\nperformance. We address these questions in the context of classification by\nevaluating three classes of models using eight datasets across three distinct\ntasks: named entity recognition, political party prediction, and misinformation\ndetection. While larger LLMs often lead to improved performance, open-source\nmodels can rival their closed-source counterparts by fine-tuning. Moreover,\nsupervised smaller models, like RoBERTa, can achieve similar or even greater\nperformance in many datasets compared to generative LLMs. On the other hand,\nclosed models maintain an advantage in hard tasks that demand the most\ngeneralizability. This study underscores the importance of model selection\nbased on task requirements\n","authors":["Hao Yu","Zachary Yang","Kellin Pelrine","Jean Francois Godbout","Reihaneh Rabbany"],"pdf_url":"https://arxiv.org/pdf/2308.10092v1.pdf","comment":"14 pages, 15 Tables, 1 Figure"},{"id":"http://arxiv.org/abs/2308.10088v1","updated":"2023-08-19T18:47:44Z","published":"2023-08-19T18:47:44Z","title":"PACE: Improving Prompt with Actor-Critic Editing for Large Language\n Model","summary":" Large language models (LLMs) have showcased remarkable potential across\nvarious tasks by conditioning on prompts. However, the quality of different\nhuman-written prompts leads to substantial discrepancies in LLMs' performance,\nand improving prompts usually necessitates considerable human effort and\nexpertise. To this end, this paper proposes Prompt with Actor-Critic Editing\n(PACE) for LLMs to enable automatic prompt editing. Drawing inspiration from\nthe actor-critic algorithm in reinforcement learning, PACE leverages LLMs as\nthe dual roles of actors and critics, conceptualizing prompt as a type of\npolicy. 
PACE refines prompt, taking into account the feedback from both actors\nperforming prompt and critics criticizing response. This process helps LLMs\nbetter align prompt to a specific task, thanks to real responses and thinking\nfrom LLMs. We conduct extensive experiments on 24 instruction induction tasks\nand 21 big-bench tasks. Experimental results indicate that PACE elevates the\nrelative performance of medium/low-quality human-written prompts by up to 98\\%,\nwhich has comparable performance to high-quality human-written prompts.\nMoreover, PACE also exhibits notable efficacy for prompt generation.\n","authors":["Yihong Dong","Kangcheng Luo","Xue Jiang","Zhi Jin","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2308.10088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10045v1","updated":"2023-08-19T15:08:10Z","published":"2023-08-19T15:08:10Z","title":"An Empirical Study of CLIP for Text-based Person Search","summary":" Text-based Person Search (TBPS) aims to retrieve the person images using\nnatural language descriptions. Recently, Contrastive Language Image Pretraining\n(CLIP), a universal large cross-modal vision-language pre-training model, has\nremarkably performed over various cross-modal downstream tasks due to its\npowerful cross-modal semantic learning capacity. TPBS, as a fine-grained\ncross-modal retrieval task, is also facing the rise of research on the\nCLIP-based TBPS. In order to explore the potential of the visual-language\npre-training model for downstream TBPS tasks, this paper makes the first\nattempt to conduct a comprehensive empirical study of CLIP for TBPS and thus\ncontribute a straightforward, incremental, yet strong TBPS-CLIP baseline to the\nTBPS community. We revisit critical design considerations under CLIP, including\ndata augmentation and loss function. The model, with the aforementioned designs\nand practical training tricks, can attain satisfactory performance without any\nsophisticated modules. Also, we conduct the probing experiments of TBPS-CLIP in\nmodel generalization and model compression, demonstrating the effectiveness of\nTBPS-CLIP from various aspects. This work is expected to provide empirical\ninsights and highlight future CLIP-based TBPS research.\n","authors":["Min Cao","Yang Bai","Ziyin Zeng","Mang Ye","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10045v1.pdf","comment":"13 pages, 5 fiugres and 17 tables. Code is available at\n https://github.com/Flame-Chasers/TBPS-CLIP"},{"id":"http://arxiv.org/abs/2308.10032v1","updated":"2023-08-19T14:33:40Z","published":"2023-08-19T14:33:40Z","title":"GameEval: Evaluating LLMs on Conversational Games","summary":" The rapid advancements in large language models (LLMs) have presented\nchallenges in evaluating those models. Existing evaluation methods are either\nreference-based or preference based, which inevitably need human intervention\nor introduce test bias caused by evaluator models. In this paper, we propose\nGameEval, a novel approach to evaluating LLMs through goal-driven\nconversational games, overcoming the limitations of previous methods. GameEval\ntreats LLMs as game players and assigns them distinct roles with specific goals\nachieved by launching conversations of various forms, including discussion,\nquestion answering, and voting. 
We design three unique games with cooperative\nor adversarial objectives, accompanied by corresponding evaluation metrics, to\nshow how this new paradigm comprehensively evaluates model performance.Through\nextensive experiments, we show that GameEval can effectively differentiate the\ncapabilities of various LLMs, providing a comprehensive assessment of their\nintegrated abilities to solve complex problems. Our public anonymous code is\navailable at https://github.com/GameEval/GameEval.\n","authors":["Dan Qiao","Chenfei Wu","Yaobo Liang","Juntao Li","Nan Duan"],"pdf_url":"https://arxiv.org/pdf/2308.10032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10025v1","updated":"2023-08-19T14:17:57Z","published":"2023-08-19T14:17:57Z","title":"ControlRetriever: Harnessing the Power of Instructions for Controllable\n Retrieval","summary":" Recent studies have shown that dense retrieval models, lacking dedicated\ntraining data, struggle to perform well across diverse retrieval tasks, as\ndifferent retrieval tasks often entail distinct search intents. To address this\nchallenge, in this work we introduce ControlRetriever, a generic and efficient\napproach with a parameter isolated architecture, capable of controlling dense\nretrieval models to directly perform varied retrieval tasks, harnessing the\npower of instructions that explicitly describe retrieval intents in natural\nlanguage. Leveraging the foundation of ControlNet, which has proven powerful in\ntext-to-image generation, ControlRetriever imbues different retrieval models\nwith the new capacity of controllable retrieval, all while being guided by\ntask-specific instructions. Furthermore, we propose a novel LLM guided\nInstruction Synthesizing and Iterative Training strategy, which iteratively\ntunes ControlRetriever based on extensive automatically-generated retrieval\ndata with diverse instructions by capitalizing the advancement of large\nlanguage models. Extensive experiments show that in the BEIR benchmark, with\nonly natural language descriptions of specific retrieval intent for each task,\nControlRetriever, as a unified multi-task retrieval system without\ntask-specific tuning, significantly outperforms baseline methods designed with\ntask-specific retrievers and also achieves state-of-the-art zero-shot\nperformance.\n","authors":["Kaihang Pan","Juncheng Li","Hongye Song","Hao Fei","Wei Ji","Shuo Zhang","Jun Lin","Xiaozhong Liu","Siliang Tang"],"pdf_url":"https://arxiv.org/pdf/2308.10025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15247v2","updated":"2023-08-19T14:04:41Z","published":"2023-03-27T14:31:25Z","title":"Zero-Shot Composed Image Retrieval with Textual Inversion","summary":" Composed Image Retrieval (CIR) aims to retrieve a target image based on a\nquery composed of a reference image and a relative caption that describes the\ndifference between the two images. The high effort and cost required for\nlabeling datasets for CIR hamper the widespread usage of existing methods, as\nthey rely on supervised learning. In this work, we propose a new task,\nZero-Shot CIR (ZS-CIR), that aims to address CIR without requiring a labeled\ntraining dataset. Our approach, named zero-Shot composEd imAge Retrieval with\ntextuaL invErsion (SEARLE), maps the visual features of the reference image\ninto a pseudo-word token in CLIP token embedding space and integrates it with\nthe relative caption. 
To support research on ZS-CIR, we introduce an\nopen-domain benchmarking dataset named Composed Image Retrieval on Common\nObjects in context (CIRCO), which is the first dataset for CIR containing\nmultiple ground truths for each query. The experiments show that SEARLE\nexhibits better performance than the baselines on the two main datasets for CIR\ntasks, FashionIQ and CIRR, and on the proposed CIRCO. The dataset, the code and\nthe model are publicly available at https://github.com/miccunifi/SEARLE.\n","authors":["Alberto Baldrati","Lorenzo Agnolucci","Marco Bertini","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2303.15247v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.09985v1","updated":"2023-08-19T11:31:45Z","published":"2023-08-19T11:31:45Z","title":"HICL: Hashtag-Driven In-Context Learning for Social Media Natural\n Language Understanding","summary":" Natural language understanding (NLU) is integral to various social media\napplications. However, existing NLU models rely heavily on context for semantic\nlearning, resulting in compromised performance when faced with short and noisy\nsocial media content. To address this issue, we leverage in-context learning\n(ICL), wherein language models learn to make inferences by conditioning on a\nhandful of demonstrations to enrich the context and propose a novel\nhashtag-driven in-context learning (HICL) framework. Concretely, we pre-train a\nmodel #Encoder, which employs #hashtags (user-annotated topic labels) to drive\nBERT-based pre-training through contrastive learning. Our objective here is to\nenable #Encoder to gain the ability to incorporate topic-related semantic\ninformation, which allows it to retrieve topic-related posts to enrich contexts\nand enhance social media NLU with noisy contexts. To further integrate the\nretrieved context with the source text, we employ a gradient-based method to\nidentify trigger terms useful in fusing information from both sources. For\nempirical studies, we collected 45M tweets to set up an in-context NLU\nbenchmark, and the experimental results on seven downstream tasks show that\nHICL substantially advances the previous state-of-the-art results. Furthermore,\nwe conducted extensive analyzes and found that: (1) combining source input with\na top-retrieved post from #Encoder is more effective than using semantically\nsimilar posts; (2) trigger words can largely benefit in merging context from\nthe source and retrieved posts.\n","authors":["Hanzhuo Tan","Chunpu Xu","Jing Li","Yuqun Zhang","Zeyang Fang","Zeyu Chen","Baohua Lai"],"pdf_url":"https://arxiv.org/pdf/2308.09985v1.pdf","comment":"https://github.com/albertan017/HICL"},{"id":"http://arxiv.org/abs/2308.09975v1","updated":"2023-08-19T10:38:00Z","published":"2023-08-19T10:38:00Z","title":"FinEval: A Chinese Financial Domain Knowledge Evaluation Benchmark for\n Large Language Models","summary":" Large language models (LLMs) have demonstrated exceptional performance in\nvarious natural language processing tasks, yet their efficacy in more\nchallenging and domain-specific tasks remains largely unexplored. This paper\npresents FinEval, a benchmark specifically designed for the financial domain\nknowledge in the LLMs. FinEval is a collection of high-quality multiple-choice\nquestions covering Finance, Economy, Accounting, and Certificate. It includes\n4,661 questions spanning 34 different academic subjects. 
To ensure a\ncomprehensive model performance evaluation, FinEval employs a range of prompt\ntypes, including zero-shot and few-shot prompts, as well as answer-only and\nchain-of-thought prompts. Evaluating state-of-the-art Chinese and English LLMs\non FinEval, the results show that only GPT-4 achieved an accuracy close to 70%\nin different prompt settings, indicating significant growth potential for LLMs\nin financial domain knowledge. Our work offers a more comprehensive\nfinancial knowledge evaluation benchmark, utilizing mock exam data and\ncovering a wide range of evaluated LLMs.\n","authors":["Liwen Zhang","Weige Cai","Zhaowei Liu","Zhi Yang","Wei Dai","Yujie Liao","Qianru Qin","Yifei Li","Xingyu Liu","Zhiqiang Liu","Zhoufan Zhu","Anbo Wu","Xin Guo","Yun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.09975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09970v1","updated":"2023-08-19T10:10:49Z","published":"2023-08-19T10:10:49Z","title":"Tackling Vision Language Tasks Through Learning Inner Monologues","summary":" Visual language tasks require AI models to comprehend and reason with both\nvisual and textual content. Driven by the power of Large Language Models\n(LLMs), two prominent methods have emerged: (1) the hybrid integration between\nLLMs and Vision-Language Models (VLMs), where visual inputs are first\nconverted into language descriptions by VLMs, serving as inputs for LLMs to\ngenerate final answer(s); (2) visual feature alignment in language space, where\nvisual inputs are encoded as embeddings and projected to LLMs' language space\nvia further supervised fine-tuning. The first approach provides low training\ncosts and interpretability but is hard to optimize in an end-to-end\nfashion. The second approach presents decent performance, but feature alignment\nusually requires large amounts of training data and lacks interpretability. To\ntackle this dilemma, we propose a novel approach, Inner Monologue Multi-Modal\nOptimization (IMMO), to solve complex vision language problems by simulating\ninner monologue processes, a cognitive process in which an individual engages\nin silent verbal communication with themselves. We enable LLMs and VLMs to\ninteract through natural language conversation and propose to use a two-stage\ntraining process to learn how to perform the inner monologue (self-asking questions\nand answering questions). IMMO is evaluated on two popular tasks and the\nresults suggest that by emulating the cognitive phenomenon of internal dialogue, our\napproach can enhance reasoning and explanation abilities, contributing to the\nmore effective fusion of vision and language models. More importantly, instead\nof using predefined human-crafted monologues, IMMO learns this process within\nthe deep learning models, promising wider applicability to many different AI\nproblems beyond vision language tasks.\n","authors":["Diji Yang","Kezhen Chen","Jinmeng Rao","Xiaoyuan Guo","Yawen Zhang","Jie Yang","Yi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09957v1","updated":"2023-08-19T09:19:34Z","published":"2023-08-19T09:19:34Z","title":"Data-to-text Generation for Severely Under-Resourced Languages with\n GPT-3.5: A Bit of Help Needed from Google Translate","summary":" LLMs like GPT are great at tasks involving English, which dominates their\ntraining data. 
In this paper, we look at how they cope with tasks involving\nlanguages that are severely under-represented in their training data, in the\ncontext of data-to-text generation for Irish, Maltese, Welsh and Breton. During\nthe prompt-engineering phase we tested a range of prompt types and formats on\nGPT-3.5 and 4 with a small sample of example input/output pairs. We then fully\nevaluated the two most promising prompts in two scenarios: (i) direct\ngeneration into the under-resourced language, and (ii) generation into English\nfollowed by translation into the under-resourced language. We find that\nfew-shot prompting works better for direct generation into under-resourced\nlanguages, but that the difference disappears when pivoting via English. The\nfew-shot + translation system variants were submitted to the WebNLG 2023 shared\ntask where they outperformed competitor systems by substantial margins in all\nlanguages on all metrics. We conclude that good performance on under-resourced\nlanguages can be achieved out of the box with state-of-the-art LLMs. However,\nour best results (for Welsh) remain well below the lowest-ranked English system\nat WebNLG'20.\n","authors":["Michela Lorandi","Anya Belz"],"pdf_url":"https://arxiv.org/pdf/2308.09957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09954v1","updated":"2023-08-19T09:17:19Z","published":"2023-08-19T09:17:19Z","title":"Eva-KELLM: A New Benchmark for Evaluating Knowledge Editing of LLMs","summary":" Large language models (LLMs) possess a wealth of knowledge encoded in their\nparameters. However, this knowledge may become outdated or unsuitable over\ntime. As a result, there has been a growing interest in knowledge editing for\nLLMs and evaluating its effectiveness. Existing studies primarily focus on\nknowledge editing using factual triplets, which not only incur high costs for\ncollection but also struggle to express complex facts. Furthermore, these\nstudies are often limited in their evaluation perspectives. In this paper, we\npropose Eva-KELLM, a new benchmark for evaluating knowledge editing of LLMs.\nThis benchmark includes an evaluation framework and a corresponding dataset.\nUnder our framework, we first ask the LLM to perform knowledge editing using\nraw documents, which provides a more convenient and universal approach compared\nto using factual triplets. We then evaluate the updated LLM from multiple\nperspectives. In addition to assessing the effectiveness of knowledge editing\nand the retention of unrelated knowledge from conventional studies, we further\ntest the LLM's ability in two aspects: 1) Reasoning with the altered knowledge,\naiming for the LLM to genuinely learn the altered knowledge instead of simply\nmemorizing it. 2) Cross-lingual knowledge transfer, where the LLM updated with\nraw documents in one language should be capable of handling queries from\nanother language. To facilitate further research, we construct and release the\ncorresponding dataset. Using this benchmark, we investigate the effectiveness\nof several commonly-used knowledge editing methods. 
Experimental results\nindicate that the current methods for knowledge editing using raw documents are\nnot effective in yielding satisfactory results, particularly when it comes to\nreasoning with altered knowledge and cross-lingual knowledge transfer.\n","authors":["Suhang Wu","Minlong Peng","Yue Chen","Jinsong Su","Mingming Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05974v2","updated":"2023-08-19T07:31:01Z","published":"2022-12-12T15:29:48Z","title":"Federated Few-Shot Learning for Mobile NLP","summary":" Natural language processing (NLP) sees rich mobile applications. To support\nvarious language understanding tasks, a foundation NLP model is often\nfine-tuned in a federated, privacy-preserving setting (FL). This process\ncurrently relies on at least hundreds of thousands of labeled training samples\nfrom mobile clients; yet mobile users often lack willingness or knowledge to\nlabel their data. Such an inadequacy of data labels is known as a few-shot\nscenario; it becomes the key blocker for mobile NLP applications.\n For the first time, this work investigates federated NLP in the few-shot\nscenario (FedFSL). By retrofitting algorithmic advances of pseudo labeling and\nprompt learning, we first establish a training pipeline that delivers\ncompetitive accuracy when only 0.05% (fewer than 100) of the training data is\nlabeled and the remaining is unlabeled. To instantiate the workflow, we further\npresent a system FeS, addressing the high execution cost with novel designs.\n(1) Curriculum pacing, which injects pseudo labels to the training workflow at\na rate commensurate to the learning progress; (2) Representational diversity, a\nmechanism for selecting the most learnable data, only for which pseudo labels\nwill be generated; (3) Co-planning of a model's training depth and layer\ncapacity. Together, these designs reduce the training delay, client energy, and\nnetwork traffic by up to 46.0$\\times$, 41.2$\\times$ and 3000.0$\\times$,\nrespectively. Through algorithm/system co-design, FFNLP demonstrates that FL\ncan apply to challenging settings where most training samples are unlabeled.\n","authors":["Dongqi Cai","Shangguang Wang","Yaozong Wu","Felix Xiaozhu Lin","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2212.05974v2.pdf","comment":"MobiCom 2023"},{"id":"http://arxiv.org/abs/2212.00192v2","updated":"2023-08-19T07:28:53Z","published":"2022-12-01T00:36:48Z","title":"Towards Practical Few-shot Federated NLP","summary":" Transformer-based pre-trained models have emerged as the predominant solution\nfor natural language processing (NLP). Fine-tuning such pre-trained models for\ndownstream tasks often requires a considerable amount of labeled private data.\nIn practice, private data is often distributed across heterogeneous mobile\ndevices and may be prohibited from being uploaded. Moreover, well-curated\nlabeled data is often scarce, presenting an additional challenge. To address\nthese challenges, we first introduce a data generator for federated few-shot\nlearning tasks, which encompasses the quantity and skewness of scarce labeled\ndata in a realistic setting. Subsequently, we propose AUG-FedPrompt, a\nprompt-based federated learning system that exploits abundant unlabeled data\nfor data augmentation. 
Our experiments indicate that AUG-FedPrompt can perform\non par with full-set fine-tuning with a limited amount of labeled data.\nHowever, such competitive performance comes at a significant system cost.\n","authors":["Dongqi Cai","Yaozong Wu","Haitao Yuan","Shangguang Wang","Felix Xiaozhu Lin","Mengwei Xu"],"pdf_url":"https://arxiv.org/pdf/2212.00192v2.pdf","comment":"EuroSys23 workshop"},{"id":"http://arxiv.org/abs/2209.13877v3","updated":"2023-08-19T06:24:28Z","published":"2022-09-28T07:25:04Z","title":"YATO: Yet Another deep learning based Text analysis Open toolkit","summary":" We introduce YATO, an open-source, easy-to-use toolkit for text analysis with\ndeep learning. Different from existing heavily engineered toolkits and\nplatforms, YATO is lightweight and user-friendly for researchers from\ncross-disciplinary areas. Designed in a hierarchical structure, YATO supports\nfree combinations of three types of widely used features including 1)\ntraditional neural networks (CNN, RNN, etc.); 2) pre-trained language models\n(BERT, RoBERTa, ELECTRA, etc.); and 3) user-customized neural features via a\nsimple configurable file. Benefiting from the advantages of flexibility and\nease of use, YATO can facilitate fast reproduction and refinement of\nstate-of-the-art NLP models, and promote the cross-disciplinary applications of\nNLP techniques. The code, examples, and documentation are publicly available at\nhttps://github.com/jiesutd/YATO. A demo video is also available at\nhttps://youtu.be/tSjjf5BzfQg.\n","authors":["Zeqiang Wang","Yile Wang","Jiageng Wu","Zhiyang Teng","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2209.13877v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09892v1","updated":"2023-08-19T03:10:51Z","published":"2023-08-19T03:10:51Z","title":"Utilizing Semantic Textual Similarity for Clinical Survey Data Feature\n Selection","summary":" Survey data can contain a high number of features while having a\ncomparatively low quantity of examples. Machine learning models that attempt to\npredict outcomes from survey data under these conditions can overfit and result\nin poor generalizability. One remedy to this issue is feature selection, which\nattempts to select an optimal subset of features to learn upon. A relatively\nunexplored source of information in the feature selection process is the usage\nof textual names of features, which may be semantically indicative of which\nfeatures are relevant to a target outcome. The relationships between feature\nnames and target names can be evaluated using language models (LMs) to produce\nsemantic textual similarity (STS) scores, which can then be used to select\nfeatures. We examine the performance using STS to select features directly and\nin the minimal-redundancy-maximal-relevance (mRMR) algorithm. The performance\nof STS as a feature selection metric is evaluated against preliminary survey\ndata collected as a part of a clinical study on persistent post-surgical pain\n(PPSP). The results suggest that features selected with STS can result in\nhigher performance models compared to traditional feature selection algorithms.\n","authors":["Benjamin C. 
Warner","Ziqi Xu","Simon Haroutounian","Thomas Kannampallil","Chenyang Lu"],"pdf_url":"https://arxiv.org/pdf/2308.09892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09890v1","updated":"2023-08-19T03:01:45Z","published":"2023-08-19T03:01:45Z","title":"Inductive-bias Learning: Generating Code Models with Large Language\n Model","summary":" Large Language Models(LLMs) have been attracting attention due to a ability\ncalled in-context learning(ICL). ICL, without updating the parameters of a LLM,\nit is possible to achieve highly accurate inference based on rules ``in the\ncontext'' by merely inputting a training data into the prompt. Although ICL is\na developing field with many unanswered questions, LLMs themselves serves as a\ninference model, seemingly realizing inference without explicitly indicate\n``inductive bias''. On the other hand, a code generation is also a highlighted\napplication of LLMs. The accuracy of code generation has dramatically improved,\nenabling even non-engineers to generate code to perform the desired tasks by\ncrafting appropriate prompts. In this paper, we propose a novel ``learning''\nmethod called an ``Inductive-Bias Learning (IBL)'', which combines the\ntechniques of ICL and code generation. An idea of IBL is straightforward. Like\nICL, IBL inputs a training data into the prompt and outputs a code with a\nnecessary structure for inference (we referred to as ``Code Model'') from a\n``contextual understanding''. Despite being a seemingly simple approach, IBL\nencompasses both a ``property of inference without explicit inductive bias''\ninherent in ICL and a ``readability and explainability'' of the code\ngeneration. Surprisingly, generated Code Models have been found to achieve\npredictive accuracy comparable to, and in some cases surpassing, ICL and\nrepresentative machine learning models. Our IBL code is open source:\nhttps://github.com/fuyu-quant/IBLM\n","authors":["Toma Tanaka","Naofumi Emoto","Tsukasa Yumibayashi"],"pdf_url":"https://arxiv.org/pdf/2308.09890v1.pdf","comment":"17 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.04566v3","updated":"2023-08-19T02:20:42Z","published":"2023-08-08T20:29:13Z","title":"Single-Sentence Reader: A Novel Approach for Addressing Answer Position\n Bias","summary":" Machine Reading Comprehension (MRC) models tend to take advantage of spurious\ncorrelations (also known as dataset bias or annotation artifacts in the\nresearch community). Consequently, these models may perform the MRC task\nwithout fully comprehending the given context and question, which is\nundesirable since it may result in low robustness against distribution shift.\nThis paper delves into the concept of answer-position bias, where a significant\npercentage of training questions have answers located solely in the first\nsentence of the context. We propose a Single-Sentence Reader as a new approach\nfor addressing answer position bias in MRC. We implement this approach using\nsix different models and thoroughly analyze their performance. Remarkably, our\nproposed Single-Sentence Readers achieve results that nearly match those of\nmodels trained on conventional training sets, proving their effectiveness. 
Our\nstudy also discusses several challenges our Single-Sentence Readers encounter\nand proposes a potential solution.\n","authors":["Son Quoc Tran","Matt Kretchmar"],"pdf_url":"https://arxiv.org/pdf/2308.04566v3.pdf","comment":"We need to revise our paper"},{"id":"http://arxiv.org/abs/2308.07134v2","updated":"2023-08-19T01:38:31Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of large language\nmodels continues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundation model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLM to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative large language models as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v2.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.09862v1","updated":"2023-08-19T00:39:21Z","published":"2023-08-19T00:39:21Z","title":"Breaking Language Barriers: A Question Answering Dataset for Hindi and\n Marathi","summary":" The recent advances in deep-learning have led to the development of highly\nsophisticated systems with an unquenchable appetite for data. On the other\nhand, building good deep-learning models for low-resource languages remains a\nchallenging task. This paper focuses on developing a Question Answering dataset\nfor two such languages- Hindi and Marathi. Despite Hindi being the 3rd most\nspoken language worldwide, with 345 million speakers, and Marathi being the\n11th most spoken language globally, with 83.2 million speakers, both languages\nface limited resources for building efficient Question Answering systems. To\ntackle the challenge of data scarcity, we have developed a novel approach for\ntranslating the SQuAD 2.0 dataset into Hindi and Marathi. We release the\nlargest Question-Answering dataset available for these languages, with each\ndataset containing 28,000 samples. We evaluate the dataset on various\narchitectures and release the best-performing models for both Hindi and\nMarathi, which will facilitate further research in these languages. 
Leveraging\nsimilarity tools, our method holds the potential to create datasets in diverse\nlanguages, thereby enhancing the understanding of natural language across\nvaried linguistic contexts. Our fine-tuned models, code, and dataset will be\nmade publicly available.\n","authors":["Maithili Sabane","Onkar Litake","Aman Chadha"],"pdf_url":"https://arxiv.org/pdf/2308.09862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09861v1","updated":"2023-08-19T00:24:59Z","published":"2023-08-19T00:24:59Z","title":"Black-box Adversarial Attacks against Dense Retrieval Models: A\n Multi-view Contrastive Learning Method","summary":" Neural ranking models (NRMs) and dense retrieval (DR) models have given rise\nto substantial improvements in overall retrieval performance. In addition to\ntheir effectiveness, and motivated by the proven lack of robustness of deep\nlearning-based approaches in other areas, there is growing interest in the\nrobustness of deep learning-based approaches to the core retrieval problem.\nAdversarial attack methods that have so far been developed mainly focus on\nattacking NRMs, with very little attention being paid to the robustness of DR\nmodels. In this paper, we introduce the adversarial retrieval attack (AREA)\ntask. The AREA task is meant to trick DR models into retrieving a target\ndocument that is outside the initial set of candidate documents retrieved by\nthe DR model in response to a query. We consider the decision-based black-box\nadversarial setting, which is realistic in real-world search engines. To\naddress the AREA task, we first employ existing adversarial attack methods\ndesigned for NRMs. We find that the promising results that have previously been\nreported on attacking NRMs, do not generalize to DR models: these methods\nunderperform a simple term spamming method. We attribute the observed lack of\ngeneralizability to the interaction-focused architecture of NRMs, which\nemphasizes fine-grained relevance matching. DR models follow a different\nrepresentation-focused architecture that prioritizes coarse-grained\nrepresentations. We propose to formalize attacks on DR models as a contrastive\nlearning problem in a multi-view representation space. The core idea is to\nencourage the consistency between each view representation of the target\ndocument and its corresponding viewer via view-wise supervision signals.\nExperimental results demonstrate that the proposed method can significantly\noutperform existing attack strategies in misleading the DR model with small\nindiscernible text perturbations.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Wei Chen","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.09861v1.pdf","comment":"Accept by CIKM2023, 10 pages"},{"id":"http://arxiv.org/abs/2308.11592v1","updated":"2023-08-19T17:32:34Z","published":"2023-08-19T17:32:34Z","title":"UniDoc: A Universal Large Multimodal Model for Simultaneous Text\n Detection, Recognition, Spotting and Understanding","summary":" In the era of Large Language Models (LLMs), tremendous strides have been made\nin the field of multimodal understanding. However, existing advanced algorithms\nare limited to effectively utilizing the immense representation capabilities\nand rich world knowledge inherent to these large pre-trained models, and the\nbeneficial connections among tasks within the context of text-rich scenarios\nhave not been sufficiently explored. 
In this work, we introduce UniDoc, a novel\nmultimodal model equipped with text detection and recognition capabilities,\nwhich are deficient in existing approaches. Moreover, UniDoc capitalizes on the\nbeneficial interactions among tasks to enhance the performance of each\nindividual task. To implement UniDoc, we perform unified multimodal instruct\ntuning on the contributed large-scale instruction following datasets.\nQuantitative and qualitative experimental results show that UniDoc sets\nstate-of-the-art scores across multiple challenging benchmarks. To the best of\nour knowledge, this is the first large multimodal model capable of simultaneous\ntext detection, recognition, spotting, and understanding.\n","authors":["Hao Feng","Zijian Wang","Jingqun Tang","Jinghui Lu","Wengang Zhou","Houqiang Li","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11519v1","updated":"2023-08-19T13:29:15Z","published":"2023-08-19T13:29:15Z","title":"Optimizing Multi-Class Text Classification: A Diverse Stacking Ensemble\n Framework Utilizing Transformers","summary":" Customer reviews play a crucial role in assessing customer satisfaction,\ngathering feedback, and driving improvements for businesses. Analyzing these\nreviews provides valuable insights into customer sentiments, including\ncompliments, comments, and suggestions. Text classification techniques enable\nbusinesses to categorize customer reviews into distinct categories,\nfacilitating a better understanding of customer feedback. However, challenges\nsuch as overfitting and bias limit the effectiveness of a single classifier in\nensuring optimal prediction. This study proposes a novel approach to address\nthese challenges by introducing a stacking ensemble-based multi-text\nclassification method that leverages transformer models. By combining multiple\nsingle transformers, including BERT, ELECTRA, and DistilBERT, as base-level\nclassifiers, and a meta-level classifier based on RoBERTa, an optimal\npredictive model is generated. The proposed stacking ensemble-based multi-text\nclassification method aims to enhance the accuracy and robustness of customer\nreview analysis. Experimental evaluations conducted on a real-world customer\nreview dataset demonstrate the effectiveness and superiority of the proposed\napproach over traditional single classifier models. The stacking ensemble-based\nmulti-text classification method using transformers proves to be a promising\nsolution for businesses seeking to extract valuable insights from customer\nreviews and make data-driven decisions to enhance customer satisfaction and\ndrive continuous improvement.\n","authors":["Anusuya Krishnan"],"pdf_url":"https://arxiv.org/pdf/2308.11519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11585v1","updated":"2023-08-19T13:14:15Z","published":"2023-08-19T13:14:15Z","title":"Causal Intersectionality and Dual Form of Gradient Descent for\n Multimodal Analysis: a Case Study on Hateful Memes","summary":" In the wake of the explosive growth of machine learning (ML) usage,\nparticularly within the context of emerging Large Language Models (LLMs),\ncomprehending the semantic significance rooted in their internal workings is\ncrucial. While causal analyses focus on defining semantics and its\nquantification, the gradient-based approach is central to explainable AI (XAI),\ntackling the interpretation of the black box. 
By synergizing these approaches,\nthe exploration of how a model's internal mechanisms illuminate its causal\neffect has become integral for evidence-based decision-making. A parallel line\nof research has revealed that intersectionality - the combinatory impact of\nmultiple demographics of an individual - can be structured in the form of an\nAveraged Treatment Effect (ATE). Initially, this study illustrates that the\nhateful memes detection problem can be formulated as an ATE, assisted by the\nprinciples of intersectionality, and that a modality-wise summarization of\ngradient-based attention attribution scores can delineate the distinct\nbehaviors of three Transformerbased models concerning ATE. Subsequently, we\nshow that the latest LLM LLaMA2 has the ability to disentangle the\nintersectional nature of memes detection in an in-context learning setting,\nwith their mechanistic properties elucidated via meta-gradient, a secondary\nform of gradient. In conclusion, this research contributes to the ongoing\ndialogue surrounding XAI and the multifaceted nature of ML models.\n","authors":["Yosuke Miyanishi","Minh Le Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.11585v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2306.12235v3","updated":"2023-08-19T18:16:59Z","published":"2023-06-21T12:53:31Z","title":"CompMix: A Benchmark for Heterogeneous Question Answering","summary":" Fact-centric question answering (QA) often requires access to multiple,\nheterogeneous, information sources. By jointly considering several sources like\na knowledge base (KB), a text collection, and tables from the web, QA systems\ncan enhance their answer coverage and confidence. However, existing QA\nbenchmarks are mostly constructed with a single source of knowledge in mind.\nThis limits capabilities of these benchmarks to fairly evaluate QA systems that\ncan tap into more than one information repository. To bridge this gap, we\nrelease CompMix, a crowdsourced QA benchmark which naturally demands the\nintegration of a mixture of input sources. CompMix has a total of 9,410\nquestions, and features several complex intents like joins and temporal\nconditions. Evaluation of a range of QA systems on CompMix highlights the need\nfor further research on leveraging information from heterogeneous sources.\n","authors":["Philipp Christmann","Rishiraj Saha Roy","Gerhard Weikum"],"pdf_url":"https://arxiv.org/pdf/2306.12235v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10053v1","updated":"2023-08-19T15:29:45Z","published":"2023-08-19T15:29:45Z","title":"Large Language Models as Zero-Shot Conversational Recommenders","summary":" In this paper, we present empirical studies on conversational recommendation\ntasks using representative large language models in a zero-shot setting with\nthree primary contributions. (1) Data: To gain insights into model behavior in\n\"in-the-wild\" conversational recommendation scenarios, we construct a new\ndataset of recommendation-related conversations by scraping a popular\ndiscussion website. This is the largest public real-world conversational\nrecommendation dataset to date. (2) Evaluation: On the new dataset and two\nexisting conversational recommendation datasets, we observe that even without\nfine-tuning, large language models can outperform existing fine-tuned\nconversational recommendation models. 
(3) Analysis: We propose various probing\ntasks to investigate the mechanisms behind the remarkable performance of large\nlanguage models in conversational recommendation. We analyze both the large\nlanguage models' behaviors and the characteristics of the datasets, providing a\nholistic understanding of the models' effectiveness, limitations and suggesting\ndirections for the design of future conversational recommenders\n","authors":["Zhankui He","Zhouhang Xie","Rahul Jha","Harald Steck","Dawen Liang","Yesu Feng","Bodhisattwa Prasad Majumder","Nathan Kallus","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2308.10053v1.pdf","comment":"Accepted as CIKM 2023 long paper. Longer version is coming soon\n (e.g., more details about dataset)"},{"id":"http://arxiv.org/abs/2308.10028v1","updated":"2023-08-19T14:25:59Z","published":"2023-08-19T14:25:59Z","title":"Voucher Abuse Detection with Prompt-based Fine-tuning on Graph Neural\n Networks","summary":" Voucher abuse detection is an important anomaly detection problem in\nE-commerce. While many GNN-based solutions have emerged, the supervised\nparadigm depends on a large quantity of labeled data. A popular alternative is\nto adopt self-supervised pre-training using label-free data, and further\nfine-tune on a downstream task with limited labels. Nevertheless, the\n\"pre-train, fine-tune\" paradigm is often plagued by the objective gap between\npre-training and downstream tasks. Hence, we propose VPGNN, a prompt-based\nfine-tuning framework on GNNs for voucher abuse detection. We design a novel\ngraph prompting function to reformulate the downstream task into a similar\ntemplate as the pretext task in pre-training, thereby narrowing the objective\ngap. Extensive experiments on both proprietary and public datasets demonstrate\nthe strength of VPGNN in both few-shot and semi-supervised scenarios. Moreover,\nan online deployment of VPGNN in a production environment shows a 23.4%\nimprovement over two existing deployed models.\n","authors":["Zhihao Wen","Yuan Fang","Yihan Liu","Yang Guo","Shuji Hao"],"pdf_url":"https://arxiv.org/pdf/2308.10028v1.pdf","comment":"7 pages, Accepted by CIKM23 Applied Research Track"},{"id":"http://arxiv.org/abs/2303.15247v2","updated":"2023-08-19T14:04:41Z","published":"2023-03-27T14:31:25Z","title":"Zero-Shot Composed Image Retrieval with Textual Inversion","summary":" Composed Image Retrieval (CIR) aims to retrieve a target image based on a\nquery composed of a reference image and a relative caption that describes the\ndifference between the two images. The high effort and cost required for\nlabeling datasets for CIR hamper the widespread usage of existing methods, as\nthey rely on supervised learning. In this work, we propose a new task,\nZero-Shot CIR (ZS-CIR), that aims to address CIR without requiring a labeled\ntraining dataset. Our approach, named zero-Shot composEd imAge Retrieval with\ntextuaL invErsion (SEARLE), maps the visual features of the reference image\ninto a pseudo-word token in CLIP token embedding space and integrates it with\nthe relative caption. To support research on ZS-CIR, we introduce an\nopen-domain benchmarking dataset named Composed Image Retrieval on Common\nObjects in context (CIRCO), which is the first dataset for CIR containing\nmultiple ground truths for each query. The experiments show that SEARLE\nexhibits better performance than the baselines on the two main datasets for CIR\ntasks, FashionIQ and CIRR, and on the proposed CIRCO. 
The dataset, the code and\nthe model are publicly available at https://github.com/miccunifi/SEARLE.\n","authors":["Alberto Baldrati","Lorenzo Agnolucci","Marco Bertini","Alberto Del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2303.15247v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2308.09976v1","updated":"2023-08-19T10:43:11Z","published":"2023-08-19T10:43:11Z","title":"Explicit Time Embedding Based Cascade Attention Network for Information\n Popularity Prediction","summary":" Predicting information cascade popularity is a fundamental problem in social\nnetworks. Capturing temporal attributes and cascade role information (e.g.,\ncascade graphs and cascade sequences) is necessary for understanding the\ninformation cascade. Current methods rarely focus on unifying this information\nfor popularity predictions, which prevents them from effectively modeling the\nfull properties of cascades to achieve satisfactory prediction performances. In\nthis paper, we propose an explicit Time embedding based Cascade Attention\nNetwork (TCAN) as a novel popularity prediction architecture for large-scale\ninformation networks. TCAN integrates temporal attributes (i.e., periodicity,\nlinearity, and non-linear scaling) into node features via a general time\nembedding approach (TE), and then employs a cascade graph attention encoder\n(CGAT) and a cascade sequence attention encoder (CSAT) to fully learn the\nrepresentation of cascade graphs and cascade sequences. We use two real-world\ndatasets (i.e., Weibo and APS) with tens of thousands of cascade samples to\nvalidate our methods. Experimental results show that TCAN obtains mean\nlogarithm squared errors of 2.007 and 1.201 and running times of 1.76 hours and\n0.15 hours on both datasets, respectively. Furthermore, TCAN outperforms other\nrepresentative baselines by 10.4%, 3.8%, and 10.4% in terms of MSLE, MAE, and\nR-squared on average while maintaining good interpretability.\n","authors":["Xigang Sun","Jingya Zhou","Ling Liu","Wenqi Wei"],"pdf_url":"https://arxiv.org/pdf/2308.09976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09966v1","updated":"2023-08-19T09:47:22Z","published":"2023-08-19T09:47:22Z","title":"Time-aligned Exposure-enhanced Model for Click-Through Rate Prediction","summary":" Click-Through Rate (CTR) prediction, crucial in applications like recommender\nsystems and online advertising, involves ranking items based on the likelihood\nof user clicks. User behavior sequence modeling has marked progress in CTR\nprediction, which extracts users' latent interests from their historical\nbehavior sequences to facilitate accurate CTR prediction. Recent research\nexplores using implicit feedback sequences, like unclicked records, to extract\ndiverse user interests. However, these methods encounter key challenges: 1)\ntemporal misalignment due to disparate sequence time ranges and 2) the lack of\nfine-grained interaction among feedback sequences. To address these challenges,\nwe propose a novel framework called TEM4CTR, which ensures temporal alignment\namong sequences while leveraging auxiliary feedback information to enhance\nclick behavior at the item level through a representation projection mechanism.\nMoreover, this projection-based information transfer module can effectively\nalleviate the negative impact of irrelevant or even potentially detrimental\ncomponents of the auxiliary feedback information on the learning process of\nclick behavior. 
Comprehensive experiments on public and industrial datasets\nconfirm the superiority and effectiveness of TEM4CTR, showcasing the\nsignificance of temporal alignment in multi-feedback modeling.\n","authors":["Hengyu Zhang","Chang Meng","Wei Guo","Huifeng Guo","Jieming Zhu","Guangpeng Zhao","Ruiming Tang","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2308.09966v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.09943v1","updated":"2023-08-19T08:29:15Z","published":"2023-08-19T08:29:15Z","title":"printf: Preference Modeling Based on User Reviews with Item Images and\n Textual Information via Graph Learning","summary":" Nowadays, modern recommender systems usually leverage textual and visual\ncontents as auxiliary information to predict user preference. For textual\ninformation, review texts are one of the most popular contents to model user\nbehaviors. Nevertheless, reviews usually lose their shine when it comes to\ntop-N recommender systems because those that solely utilize textual reviews as\nfeatures struggle to adequately capture the interaction relationships between\nusers and items. As for visual information, it is usually modeled with naive convolutional\nnetworks, which also struggle to capture high-order relationships between users and\nitems. Moreover, previous works did not collaboratively use both texts and\nimages in a proper way. In this paper, we propose printf, preference modeling\nbased on user reviews with item images and textual information via graph\nlearning, to address the above challenges. Specifically, the dimension-based\nattention mechanism directs relations between user reviews and interacted\nitems, allowing each dimension to contribute different importance weights to\nderive user representations. Extensive experiments are conducted on three\npublicly available datasets. The experimental results demonstrate that our\nproposed printf consistently outperforms baseline methods with the relative\nimprovements for NDCG@5 of 26.80%, 48.65%, and 25.74% on Amazon-Grocery,\nAmazon-Tools, and Amazon-Electronics datasets, respectively. The in-depth\nanalysis also indicates the dimensions of review representations definitely\nhave different topics and aspects, supporting the validity of our model design.\n","authors":["Hao-Lun Lin","Jyun-Yu Jiang","Ming-Hao Juan","Pu-Jen Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.09943v1.pdf","comment":"In Proceedings of The 32nd ACM International Conference on\n Information and Knowledge Management (CIKM '23), ACM, 2023"},{"id":"http://arxiv.org/abs/2308.09904v1","updated":"2023-08-19T04:46:01Z","published":"2023-08-19T04:46:01Z","title":"RAH! RecSys-Assistant-Human: A Human-Central Recommendation Framework\n with Large Language Models","summary":" The recommendation ecosystem involves interactions between recommender\nsystems (Computer) and users (Human). Orthogonal to the perspective of\nrecommender systems, we attempt to utilize LLMs from the perspective of users\nand propose a more human-central recommendation framework named RAH, which\nconsists of Recommender system, Assistant and Human. The assistant is an\nLLM-based personal proxy for a human to achieve user satisfaction. The\nassistant plays a non-invasive role and the RAH framework can adapt to\ndifferent recommender systems and user groups. Subsequently, we implement and\nevaluate the RAH framework for learning user personalities and proxy human\nfeedback. 
The experiment shows that (1) using learn-action-critic and\nreflection mechanisms can lead more aligned personality and (2) our assistant\ncan effectively proxy human feedback and help adjust recommender systems.\nFinally, we discuss further strategies in the RAH framework to address\nhuman-central concerns including user control, privacy and fairness.\n","authors":["Yubo Shu","Hansu Gu","Peng Zhang","Haonan Zhang","Tun Lu","Dongsheng Li","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2308.09904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14834v3","updated":"2023-08-19T03:32:53Z","published":"2023-06-26T16:39:39Z","title":"Scalable Neural Contextual Bandit for Recommender Systems","summary":" High-quality recommender systems ought to deliver both innovative and\nrelevant content through effective and exploratory interactions with users.\nYet, supervised learning-based neural networks, which form the backbone of many\nexisting recommender systems, only leverage recognized user interests, falling\nshort when it comes to efficiently uncovering unknown user preferences. While\nthere has been some progress with neural contextual bandit algorithms towards\nenabling online exploration through neural networks, their onerous\ncomputational demands hinder widespread adoption in real-world recommender\nsystems. In this work, we propose a scalable sample-efficient neural contextual\nbandit algorithm for recommender systems. To do this, we design an epistemic\nneural network architecture, Epistemic Neural Recommendation (ENR), that\nenables Thompson sampling at a large scale. In two distinct large-scale\nexperiments with real-world tasks, ENR significantly boosts click-through rates\nand user ratings by at least 9% and 6% respectively compared to\nstate-of-the-art neural contextual bandit algorithms. Furthermore, it achieves\nequivalent performance with at least 29% fewer user interactions compared to\nthe best-performing baseline algorithm. Remarkably, while accomplishing these\nimprovements, ENR demands orders of magnitude fewer computational resources\nthan neural contextual bandit baseline algorithms.\n","authors":["Zheqing Zhu","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2306.14834v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16637v2","updated":"2023-08-19T03:21:34Z","published":"2023-05-26T05:26:00Z","title":"FARA: Future-aware Ranking Algorithm for Fairness Optimization","summary":" Ranking systems are the key components of modern Information Retrieval (IR)\napplications, such as search engines and recommender systems. Besides the\nranking relevance to users, the exposure fairness to item providers has also\nbeen considered an important factor in ranking optimization. Many fair ranking\nalgorithms have been proposed to jointly optimize both ranking relevance and\nfairness. However, we find that most existing fair ranking methods adopt greedy\nalgorithms that only optimize rankings for the next immediate session or\nrequest. As shown in this paper, such a myopic paradigm could limit the upper\nbound of ranking optimization and lead to suboptimal performance in the long\nterm.\n To this end, we propose \\textbf{FARA}, a novel \\textbf{F}uture-\\textbf{A}ware\n\\textbf{R}anking \\textbf{A}lgorithm for ranking relevance and fairness\noptimization. Instead of greedily optimizing rankings for the next immediate\nsession, FARA plans ahead by jointly optimizing multiple ranklists together and\nsaving them for future sessions. 
Specifically, FARA first uses the Taylor\nexpansion to investigate how future ranklists will influence the overall\nfairness of the system. Then, based on the analysis of the Taylor expansion,\nFARA adopts a two-phase optimization algorithm where we first solve an optimal\nfuture exposure planning problem and then construct the optimal ranklists\naccording to the optimal future exposure planning. Theoretically, we show that\nFARA is optimal for ranking relevance and fairness joint optimization.\nEmpirically, our extensive experiments on three semi-synthesized datasets show\nthat FARA is efficient, effective, and can deliver significantly better ranking\nperformance compared to state-of-the-art fair ranking methods. We make our\nimplementation public at\n\\href{https://github.com/Taosheng-ty/QP_fairness/}{https://github.com/Taosheng-ty/QP\\_fairness/}.\n","authors":["Tao Yang","Zhichao Xu","Zhenduo Wang","Qingyao Ai"],"pdf_url":"https://arxiv.org/pdf/2305.16637v2.pdf","comment":"11 pages, four figures, four tables. CIKM2023"},{"id":"http://arxiv.org/abs/2308.07134v2","updated":"2023-08-19T01:38:31Z","published":"2023-08-14T13:41:09Z","title":"Natural Language is All a Graph Needs","summary":" The emergence of large-scale pre-trained language models, such as ChatGPT,\nhas revolutionized various research fields in artificial intelligence.\nTransformers-based large language models (LLMs) have gradually replaced CNNs\nand RNNs to unify fields of computer vision and natural language processing.\nCompared with the data that exists relatively independently such as images,\nvideos or texts, graph is a type of data that contains rich structural and\nrelational information. Meanwhile, natural language, as one of the most\nexpressive mediums, excels in describing complex structures. However, existing\nwork on incorporating graph learning problems into the generative language\nmodeling framework remains very limited. As the importance of large language\nmodels continues to grow, it becomes essential to explore whether LLMs can also\nreplace GNNs as the foundation model for graphs. In this paper, we propose\nInstructGLM (Instruction-finetuned Graph Language Model), systematically design\nhighly scalable prompts based on natural language instructions, and use natural\nlanguage to describe the geometric structure and node features of the graph for\ninstruction tuning an LLM to perform learning and inference on graphs in a\ngenerative manner. Our method exceeds all competitive GNN baselines on\nogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of\nour method and sheds light on generative large language models as the\nfoundation model for graph machine learning.\n","authors":["Ruosong Ye","Caiqi Zhang","Runhui Wang","Shuyuan Xu","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.07134v2.pdf","comment":"21 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.09861v1","updated":"2023-08-19T00:24:59Z","published":"2023-08-19T00:24:59Z","title":"Black-box Adversarial Attacks against Dense Retrieval Models: A\n Multi-view Contrastive Learning Method","summary":" Neural ranking models (NRMs) and dense retrieval (DR) models have given rise\nto substantial improvements in overall retrieval performance. 
In addition to\ntheir effectiveness, and motivated by the proven lack of robustness of deep\nlearning-based approaches in other areas, there is growing interest in the\nrobustness of deep learning-based approaches to the core retrieval problem.\nAdversarial attack methods that have so far been developed mainly focus on\nattacking NRMs, with very little attention being paid to the robustness of DR\nmodels. In this paper, we introduce the adversarial retrieval attack (AREA)\ntask. The AREA task is meant to trick DR models into retrieving a target\ndocument that is outside the initial set of candidate documents retrieved by\nthe DR model in response to a query. We consider the decision-based black-box\nadversarial setting, which is realistic in real-world search engines. To\naddress the AREA task, we first employ existing adversarial attack methods\ndesigned for NRMs. We find that the promising results that have previously been\nreported on attacking NRMs, do not generalize to DR models: these methods\nunderperform a simple term spamming method. We attribute the observed lack of\ngeneralizability to the interaction-focused architecture of NRMs, which\nemphasizes fine-grained relevance matching. DR models follow a different\nrepresentation-focused architecture that prioritizes coarse-grained\nrepresentations. We propose to formalize attacks on DR models as a contrastive\nlearning problem in a multi-view representation space. The core idea is to\nencourage the consistency between each view representation of the target\ndocument and its corresponding viewer via view-wise supervision signals.\nExperimental results demonstrate that the proposed method can significantly\noutperform existing attack strategies in misleading the DR model with small\nindiscernible text perturbations.\n","authors":["Yu-An Liu","Ruqing Zhang","Jiafeng Guo","Maarten de Rijke","Wei Chen","Yixing Fan","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.09861v1.pdf","comment":"Accept by CIKM2023, 10 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.10121v1","updated":"2023-08-19T22:24:00Z","published":"2023-08-19T22:24:00Z","title":"Dronevision: An Experimental 3D Testbed for Flying Light Specks","summary":" Today's robotic laboratories for drones are housed in a large room. At times,\nthey are the size of a warehouse. These spaces are typically equipped with\npermanent devices to localize the drones, e.g., Vicon Infrared cameras.\nSignificant time is invested to fine-tune the localization apparatus to compute\nand control the position of the drones. One may use these laboratories to\ndevelop a 3D multimedia system with miniature sized drones configured with\nlight sources. As an alternative, this brave new idea paper envisions shrinking\nthese room-sized laboratories to the size of a cube or cuboid that sits on a\ndesk and costs less than 10K dollars. The resulting Dronevision (DV) will be\nthe size of a 1990s Television. In addition to light sources, its Flying Light\nSpecks (FLSs) will be network-enabled drones with storage and processing\ncapability to implement decentralized algorithms. The DV will include a\nlocalization technique to expedite development of 3D displays. It will act as a\nhaptic interface for a user to interact with and manipulate the 3D virtual\nilluminations. It will empower an experimenter to design, implement, test,\ndebug, and maintain software and hardware that realize novel algorithms in the\ncomfort of their office without having to reserve a laboratory. 
In addition to\nenhancing productivity, it will improve safety of the experimenter by\nminimizing the likelihood of accidents. This paper introduces the concept of a\nDV, the research agenda one may pursue using this device, and our plans to\nrealize one.\n","authors":["Hamed Alimohammadzadeh","Rohit Bernard","Yang Chen","Trung Phan","Prashant Singh","Shuqin Zhu","Heather Culbertson","Shahram Ghandeharizadeh"],"pdf_url":"https://arxiv.org/pdf/2308.10121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10115v1","updated":"2023-08-19T21:53:52Z","published":"2023-08-19T21:53:52Z","title":"An Evaluation of Three Distance Measurement Technologies for Flying\n Light Specks","summary":" This study evaluates the accuracy of three different types of time-of-flight\nsensors to measure distance. We envision the possible use of these sensors to\nlocalize swarms of flying light specks (FLSs) to illuminate objects and avatars\nof a metaverse. An FLS is a miniature-sized drone configured with RGB light\nsources. It is unable to illuminate a point cloud by itself. However, the\ninter-FLS relationship effect of an organizational framework will compensate\nfor the simplicity of each individual FLS, enabling a swarm of cooperating FLSs\nto illuminate complex shapes and render haptic interactions. Distance between\nFLSs is an important criterion of the inter-FLS relationship. We consider\nsensors that use radio frequency (UWB), infrared light (IR), and sound\n(ultrasonic) to quantify this metric. Obtained results show only one sensor is\nable to measure distances as small as 1 cm with a high accuracy. A sensor may\nrequire a calibration process that impacts its accuracy in measuring distance.\n","authors":["Trung Phan","Hamed Alimohammadzadeh","Heather Culbertson","Shahram Ghandeharizadeh"],"pdf_url":"https://arxiv.org/pdf/2308.10115v1.pdf","comment":"In International Conference on Intelligent Metaverse Technologies and\n Applications (iMETA2023), Tartu, Estonia, September 18-20, 2023"},{"id":"http://arxiv.org/abs/2308.10068v1","updated":"2023-08-19T16:20:59Z","published":"2023-08-19T16:20:59Z","title":"ILCAS: Imitation Learning-Based Configuration-Adaptive Streaming for\n Live Video Analytics with Cross-Camera Collaboration","summary":" The high-accuracy and resource-intensive deep neural networks (DNNs) have\nbeen widely adopted by live video analytics (VA), where camera videos are\nstreamed over the network to resource-rich edge/cloud servers for DNN\ninference. Common video encoding configurations (e.g., resolution and frame\nrate) have been identified with significant impacts on striking the balance\nbetween bandwidth consumption and inference accuracy and therefore their\nadaption scheme has been a focus of optimization. However, previous\nprofiling-based solutions suffer from high profiling cost, while existing deep\nreinforcement learning (DRL) based solutions may achieve poor performance due\nto the usage of fixed reward function for training the agent, which fails to\ncraft the application goals in various scenarios. In this paper, we propose\nILCAS, the first imitation learning (IL) based configuration-adaptive VA\nstreaming system. Unlike DRL-based solutions, ILCAS trains the agent with\ndemonstrations collected from the expert which is designed as an offline\noptimal policy that solves the configuration adaption problem through dynamic\nprogramming. 
To tackle the challenge of video content dynamics, ILCAS derives\nmotion feature maps based on motion vectors which allow ILCAS to visually\n``perceive'' video content changes. Moreover, ILCAS incorporates a cross-camera\ncollaboration scheme to exploit the spatio-temporal correlations of cameras for\nmore proper configuration selection. Extensive experiments confirm the\nsuperiority of ILCAS compared with state-of-the-art solutions, with 2-20.9%\nimprovement of mean accuracy and 19.9-85.3% reduction of chunk upload lag.\n","authors":["Duo Wu","Dayou Zhang","Miao Zhang","Ruoyu Zhang","Fangxin Wang","Shuguang Cui"],"pdf_url":"https://arxiv.org/pdf/2308.10068v1.pdf","comment":"This work has been submitted to the IEEE Transactions on Mobile\n Computing for possible publication. Copyright may be transferred without\n notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2210.11549v2","updated":"2023-08-19T15:17:00Z","published":"2022-10-20T19:31:23Z","title":"H4VDM: H.264 Video Device Matching","summary":" Methods that can determine if two given video sequences are captured by the\nsame device (e.g., mobile telephone or digital camera) can be used in many\nforensics tasks. In this paper we refer to this as \"video device matching\". In\nopen-set video forensics scenarios it is easier to determine if two video\nsequences were captured with the same device than identifying the specific\ndevice. In this paper, we propose a technique for open-set video device\nmatching. Given two H.264 compressed video sequences, our method can determine\nif they are captured by the same device, even if our method has never\nencountered the device in training. We denote our proposed technique as H.264\nVideo Device Matching (H4VDM). H4VDM uses H.264 compression information\nextracted from video sequences to make decisions. It is more robust against\nartifacts that alter camera sensor fingerprints, and it can be used to analyze\nrelatively small fragments of the H.264 sequence. We trained and tested our\nmethod on a publicly available video forensics dataset consisting of 35\ndevices, where our proposed method demonstrated good performance.\n","authors":["Ziyue Xiang","Paolo Bestagini","Stefano Tubaro","Edward J. Delp"],"pdf_url":"https://arxiv.org/pdf/2210.11549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09948v1","updated":"2023-08-19T08:55:40Z","published":"2023-08-19T08:55:40Z","title":"Bamboo: Boosting Training Efficiency for Real-Time Video Streaming via\n Online Grouped Federated Transfer Learning","summary":" Most of the learning-based algorithms for bitrate adaptation are limited to\noffline learning, which inevitably suffers from the simulation-to-reality gap.\nOnline learning can better adapt to dynamic real-time communication scenes but\nstill face the challenge of lengthy training convergence time. In this paper,\nwe propose a novel online grouped federated transfer learning framework named\nBamboo to accelerate training efficiency. 
The preliminary experiments validate\nthat our method remarkably improves online training efficiency by up to 302%\ncompared to other reinforcement learning algorithms in various network\nconditions while ensuring the quality of experience (QoE) of real-time video\ncommunication.\n","authors":["Qianyuan Zheng","Hao Chen","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2308.09948v1.pdf","comment":"This paper will be presented at Apnet 2023"},{"id":"http://arxiv.org/abs/2308.09911v1","updated":"2023-08-19T05:34:13Z","published":"2023-08-19T05:34:13Z","title":"Noisy-Correspondence Learning for Text-to-Image Person Re-identification","summary":" Text-to-image person re-identification (TIReID) is a compelling topic in the\ncross-modal community, which aims to retrieve the target person based on a\ntextual query. Although numerous TIReID methods have been proposed and achieved\npromising performance, they implicitly assume the training image-text pairs are\ncorrectly aligned, which is not always the case in real-world scenarios. In\npractice, the image-text pairs inevitably exist under-correlated or even\nfalse-correlated, a.k.a noisy correspondence (NC), due to the low quality of\nthe images and annotation errors. To address this problem, we propose a novel\nRobust Dual Embedding method (RDE) that can learn robust visual-semantic\nassociations even with NC. Specifically, RDE consists of two main components:\n1) A Confident Consensus Division (CCD) module that leverages the dual-grained\ndecisions of dual embedding modules to obtain a consensus set of clean training\ndata, which enables the model to learn correct and reliable visual-semantic\nassociations. 2) A Triplet-Alignment Loss (TAL) relaxes the conventional\ntriplet-ranking loss with hardest negatives, which tends to rapidly overfit NC,\nto a log-exponential upper bound over all negatives, thus preventing the model\nfrom overemphasizing false image-text pairs. We conduct extensive experiments\non three public benchmarks, namely CUHK-PEDES, ICFG-PEDES, and RSTPReID, to\nevaluate the performance and robustness of our RDE. Our method achieves\nstate-of-the-art results both with and without synthetic noisy correspondences\non all three datasets.\n","authors":["Yang Qin","Yingke Chen","Dezhong Peng","Xi Peng","Joey Tianyi Zhou","Peng Hu"],"pdf_url":"https://arxiv.org/pdf/2308.09911v1.pdf","comment":null}]},"2023-08-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.11606v1","updated":"2023-08-22T17:53:55Z","published":"2023-08-22T17:53:55Z","title":"StoryBench: A Multifaceted Benchmark for Continuous Story Visualization","summary":" Generating video stories from text prompts is a complex task. In addition to\nhaving high visual quality, videos need to realistically adhere to a sequence\nof text prompts whilst being consistent throughout the frames. Creating a\nbenchmark for video generation requires data annotated over time, which\ncontrasts with the single caption used often in video datasets. To fill this\ngap, we collect comprehensive human annotations on three existing datasets, and\nintroduce StoryBench: a new, challenging multi-task benchmark to reliably\nevaluate forthcoming text-to-video models. 
Our benchmark includes three video\ngeneration tasks of increasing difficulty: action execution, where the next\naction must be generated starting from a conditioning video; story\ncontinuation, where a sequence of actions must be executed starting from a\nconditioning video; and story generation, where a video must be generated from\nonly text prompts. We evaluate small yet strong text-to-video baselines, and\nshow the benefits of training on story-like data algorithmically generated from\nexisting video captions. Finally, we establish guidelines for human evaluation\nof video stories, and reaffirm the need of better automatic metrics for video\ngeneration. StoryBench aims at encouraging future research efforts in this\nexciting new area.\n","authors":["Emanuele Bugliarello","Hernan Moraldo","Ruben Villegas","Mohammad Babaeizadeh","Mohammad Taghi Saffar","Han Zhang","Dumitru Erhan","Vittorio Ferrari","Pieter-Jan Kindermans","Paul Voigtlaender"],"pdf_url":"https://arxiv.org/pdf/2308.11606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11601v1","updated":"2023-08-22T17:48:24Z","published":"2023-08-22T17:48:24Z","title":"Tryage: Real-time, intelligent Routing of User Prompts to Large Language\n Model","summary":" The introduction of the transformer architecture and the self-attention\nmechanism has led to an explosive production of language models trained on\nspecific downstream tasks and data domains. With over 200, 000 models in the\nHugging Face ecosystem, users grapple with selecting and optimizing models to\nsuit multifaceted workflows and data domains while addressing computational,\nsecurity, and recency concerns. There is an urgent need for machine learning\nframeworks that can eliminate the burden of model selection and customization\nand unleash the incredible power of the vast emerging model library for end\nusers. Here, we propose a context-aware routing system, Tryage, that leverages\na language model router for optimal selection of expert models from a model\nlibrary based on analysis of individual input prompts. Inspired by the thalamic\nrouter in the brain, Tryage employs a perceptive router to predict down-stream\nmodel performance on prompts and, then, makes a routing decision using an\nobjective function that integrates performance predictions with user goals and\nconstraints that are incorporated through flags (e.g., model size, model\nrecency). Tryage allows users to explore a Pareto front and automatically\ntrade-off between task accuracy and secondary goals including minimization of\nmodel size, recency, security, verbosity, and readability. Across heterogeneous\ndata sets that include code, text, clinical data, and patents, the Tryage\nframework surpasses Gorilla and GPT3.5 turbo in dynamic model selection\nidentifying the optimal model with an accuracy of 50.9% , compared to 23.6% by\nGPT 3.5 Turbo and 10.8% by Gorilla. 
Conceptually, Tryage demonstrates how\nrouting models can be applied to program and control the behavior of\nmulti-model LLM systems to maximize efficient use of the expanding and evolving\nlanguage model ecosystem.\n","authors":["Surya Narayanan Hari","Matt Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.11601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11596v1","updated":"2023-08-22T17:44:18Z","published":"2023-08-22T17:44:18Z","title":"SeamlessM4T-Massively Multilingual & Multimodal Machine Translation","summary":" What does it take to create the Babel Fish, a tool that can help individuals\ntranslate speech between any two languages? While recent breakthroughs in\ntext-based models have pushed machine translation coverage beyond 200\nlanguages, unified speech-to-speech translation models have yet to achieve\nsimilar strides. More specifically, conventional speech-to-speech translation\nsystems rely on cascaded systems that perform translation progressively,\nputting high-performing unified systems out of reach. To address these gaps, we\nintroduce SeamlessM4T, a single model that supports speech-to-speech\ntranslation, speech-to-text translation, text-to-speech translation,\ntext-to-text translation, and automatic speech recognition for up to 100\nlanguages. To build this, we used 1 million hours of open speech audio data to\nlearn self-supervised speech representations with w2v-BERT 2.0. Subsequently,\nwe created a multimodal corpus of automatically aligned speech translations.\nFiltered and combined with human-labeled and pseudo-labeled data, we developed\nthe first multilingual system capable of translating from and into English for\nboth speech and text. On FLEURS, SeamlessM4T sets a new standard for\ntranslations into multiple target languages, achieving an improvement of 20%\nBLEU over the previous SOTA in direct speech-to-text translation. Compared to\nstrong cascaded models, SeamlessM4T improves the quality of into-English\ntranslation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in\nspeech-to-speech. Tested for robustness, our system performs better against\nbackground noises and speaker variations in speech-to-text tasks compared to\nthe current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and\nadded toxicity to assess translation safety. Finally, all contributions in this\nwork are open-sourced at this https\nhttps://github.com/facebookresearch/seamless_communication.\n","authors":["Seamless Communication","Loïc Barrault","Yu-An Chung","Mariano Cora Meglioli","David Dale","Ning Dong","Paul-Ambroise Duquenne","Hady Elsahar","Hongyu Gong","Kevin Heffernan","John Hoffman","Christopher Klaiber","Pengwei Li","Daniel Licht","Jean Maillard","Alice Rakotoarison","Kaushik Ram Sadagopan","Guillaume Wenzek","Ethan Ye","Bapi Akula","Peng-Jen Chen","Naji El Hachem","Brian Ellis","Gabriel Mejia Gonzalez","Justin Haaheim","Prangthip Hansanti","Russ Howes","Bernie Huang","Min-Jae Hwang","Hirofumi Inaguma","Somya Jain","Elahe Kalbassi","Amanda Kallet","Ilia Kulikov","Janice Lam","Daniel Li","Xutai Ma","Ruslan Mavlyutov","Benjamin Peloquin","Mohamed Ramadan","Abinesh Ramakrishnan","Anna Sun","Kevin Tran","Tuan Tran","Igor Tufanov","Vish Vogeti","Carleigh Wood","Yilin Yang","Bokai Yu","Pierre Andrews","Can Balioglu","Marta R. 
Costa-jussà","Onur Celebi","Maha Elbayad","Cynthia Gao","Francisco Guzmán","Justine Kao","Ann Lee","Alexandre Mourachko","Juan Pino","Sravya Popuri","Christophe Ropers","Safiyyah Saleem","Holger Schwenk","Paden Tomasello","Changhan Wang","Jeff Wang","Skyler Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09729v2","updated":"2023-08-22T17:32:16Z","published":"2023-08-17T16:59:50Z","title":"MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large\n Language Models","summary":" LLMs usually exhibit limitations in their ability to incorporate new\nknowledge, the generation of hallucinations, and the transparency of their\ndecision-making process. In this paper, we explore how to prompt LLMs with\nknowledge graphs (KG), working as a remedy to engage LLMs with up-to-date\nknowledge and elicit the reasoning pathways from LLMs. Specifically, we build a\nprompting pipeline that endows LLMs with the capability of comprehending KG\ninputs and inferring with a combined implicit knowledge and the retrieved\nexternal knowledge. In addition, we investigate eliciting the mind map on which\nLLMs perform the reasoning and generate the answers. It is identified that the\nproduced mind map exhibits the reasoning pathways of LLMs grounded on the\nontology of knowledge, hence bringing the prospects of probing and gauging LLM\ninference in production. The experiments on three question & answering datasets\nalso show that MindMap prompting leads to a striking empirical gain. For\ninstance, prompting a GPT-3.5 with MindMap yields an overwhelming performance\nover GPT-4 consistently. We also demonstrate that with structured facts\nretrieved from KG, MindMap can outperform a series of\nprompting-with-document-retrieval methods, benefiting from more accurate,\nconcise, and comprehensive knowledge from KGs.\n","authors":["Yilin Wen","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09729v2.pdf","comment":"7 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2308.11563v1","updated":"2023-08-22T16:59:31Z","published":"2023-08-22T16:59:31Z","title":"Using ChatGPT as a CAT tool in Easy Language translation","summary":" This study sets out to investigate the feasibility of using ChatGPT to\ntranslate citizen-oriented administrative texts into German Easy Language, a\nsimplified, controlled language variety that is adapted to the needs of people\nwith reading impairments. We use ChatGPT to translate selected texts from\nwebsites of German public authorities using two strategies, i.e. linguistic and\nholistic. We analyse the quality of the generated texts based on different\ncriteria, such as correctness, readability, and syntactic complexity. The\nresults indicated that the generated texts are easier than the standard texts,\nbut that they still do not fully meet the established Easy Language standards.\nAdditionally, the content is not always rendered correctly.\n","authors":["Silvana Deilen","Sergio Hernández Garrido","Ekaterina Lapshinova-Koltunski","Christiane Maaß"],"pdf_url":"https://arxiv.org/pdf/2308.11563v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10652v3","updated":"2023-08-22T16:30:45Z","published":"2023-07-20T07:33:30Z","title":"Exploring the Landscape of Natural Language Processing Research","summary":" As an efficient approach to understand, generate, and process natural\nlanguage texts, research in natural language processing (NLP) has exhibited a\nrapid spread and wide adoption in recent years. 
Given the increasing research\nwork in this area, several NLP-related approaches have been surveyed in the\nresearch community. However, a comprehensive study that categorizes established\ntopics, identifies trends, and outlines areas for future research remains\nabsent. Contributing to closing this gap, we have systematically classified and\nanalyzed research papers in the ACL Anthology. As a result, we present a\nstructured overview of the research landscape, provide a taxonomy of fields of\nstudy in NLP, analyze recent developments in NLP, summarize our findings, and\nhighlight directions for future work.\n","authors":["Tim Schopf","Karim Arabi","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.10652v3.pdf","comment":"Accepted to the 14th International Conference on Recent Advances in\n Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2305.03881v2","updated":"2023-08-22T16:09:59Z","published":"2023-05-06T00:24:44Z","title":"Fairness in Image Search: A Study of Occupational Stereotyping in Image\n Retrieval and its Debiasing","summary":" Multi-modal search engines have experienced significant growth and widespread\nuse in recent years, making them the second most common internet use. While\nsearch engine systems offer a range of services, the image search field has\nrecently become a focal point in the information retrieval community, as the\nadage goes, \"a picture is worth a thousand words\". Although popular search\nengines like Google excel at image search accuracy and agility, there is an\nongoing debate over whether their search results can be biased in terms of\ngender, language, demographics, socio-cultural aspects, and stereotypes. This\npotential for bias can have a significant impact on individuals' perceptions\nand influence their perspectives.\n In this paper, we present our study on bias and fairness in web search, with\na focus on keyword-based image search. We first discuss several kinds of biases\nthat exist in search systems and why it is important to mitigate them. We\nnarrow down our study to assessing and mitigating occupational stereotypes in\nimage search, which is a prevalent fairness issue in image retrieval. For the\nassessment of stereotypes, we take gender as an indicator. We explore various\nopen-source and proprietary APIs for gender identification from images. With\nthese, we examine the extent of gender bias in top-ranked image search results\nobtained for several occupational keywords. To mitigate the bias, we then\npropose a fairness-aware re-ranking algorithm that optimizes (a) relevance of\nthe search result with the keyword and (b) fairness w.r.t. genders identified.\nWe experiment on 100 top-ranked images obtained for 10 occupational keywords\nand consider random re-ranking and re-ranking based on relevance as baselines.\nOur experimental results show that the fairness-aware re-ranking algorithm\nproduces rankings with better fairness scores and competitive relevance scores\nthan the baselines.\n","authors":["Swagatika Dash"],"pdf_url":"https://arxiv.org/pdf/2305.03881v2.pdf","comment":"20 Pages, Work uses Proprietary Search Systems from the year 2021"},{"id":"http://arxiv.org/abs/2308.11537v1","updated":"2023-08-22T16:05:18Z","published":"2023-08-22T16:05:18Z","title":"BELB: a Biomedical Entity Linking Benchmark","summary":" Biomedical entity linking (BEL) is the task of grounding entity mentions to a\nknowledge base. It plays a vital role in information extraction pipelines for\nthe life sciences literature. 
We review recent work in the field and find that,\nas the task is absent from existing benchmarks for biomedical text mining,\ndifferent studies adopt different experimental setups making comparisons based\non published numbers problematic. Furthermore, neural systems are tested\nprimarily on instances linked to the broad coverage knowledge base UMLS,\nleaving their performance to more specialized ones, e.g. genes or variants,\nunderstudied. We therefore developed BELB, a Biomedical Entity Linking\nBenchmark, providing access in a unified format to 11 corpora linked to 7\nknowledge bases and spanning six entity types: gene, disease, chemical,\nspecies, cell line and variant. BELB greatly reduces preprocessing overhead in\ntesting BEL systems on multiple corpora offering a standardized testbed for\nreproducible experiments. Using BELB we perform an extensive evaluation of six\nrule-based entity-specific systems and three recent neural approaches\nleveraging pre-trained language models. Our results reveal a mixed picture\nshowing that neural approaches fail to perform consistently across entity\ntypes, highlighting the need of further studies towards entity-agnostic models.\n","authors":["Samuele Garda","Leon Weber-Genzel","Robert Martin","Ulf Leser"],"pdf_url":"https://arxiv.org/pdf/2308.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11531v1","updated":"2023-08-22T15:59:21Z","published":"2023-08-22T15:59:21Z","title":"Empowering Refugee Claimants and their Lawyers: Using Machine Learning\n to Examine Decision-Making in Refugee Law","summary":" Our project aims at helping and supporting stakeholders in refugee status\nadjudications, such as lawyers, judges, governing bodies, and claimants, in\norder to make better decisions through data-driven intelligence and increase\nthe understanding and transparency of the refugee application process for all\ninvolved parties. This PhD project has two primary objectives: (1) to retrieve\npast cases, and (2) to analyze legal decision-making processes on a dataset of\nCanadian cases. In this paper, we present the current state of our work, which\nincludes a completed experiment on part (1) and ongoing efforts related to part\n(2). We believe that NLP-based solutions are well-suited to address these\nchallenges, and we investigate the feasibility of automating all steps\ninvolved. In addition, we introduce a novel benchmark for future NLP research\nin refugee law. Our methodology aims to be inclusive to all end-users and\nstakeholders, with expected benefits including reduced time-to-decision, fairer\nand more transparent outcomes, and improved decision quality.\n","authors":["Claire Barale"],"pdf_url":"https://arxiv.org/pdf/2308.11531v1.pdf","comment":"19th International Conference on Artificial Intelligence and Law -\n ICAIL 2023, Doctoral Consortium. arXiv admin note: substantial text overlap\n with arXiv:2305.15533"},{"id":"http://arxiv.org/abs/2308.09765v2","updated":"2023-08-22T15:53:18Z","published":"2023-08-18T18:18:55Z","title":"Taken by Surprise: Contrast effect for Similarity Scores","summary":" Accurately evaluating the similarity of object vector embeddings is of\ncritical importance for natural language processing, information retrieval and\nclassification tasks. Popular similarity scores (e.g cosine similarity) are\nbased on pairs of embedding vectors and disregard the distribution of the\nensemble from which objects are drawn. 
Human perception of object similarity\nsignificantly depends on the context in which the objects appear. In this work\nwe propose the $\\textit{surprise score}$, an ensemble-normalized similarity\nmetric that encapsulates the contrast effect of human perception and\nsignificantly improves the classification performance on zero- and few-shot\ndocument classification tasks. This score quantifies the surprise to find a\ngiven similarity between two elements relative to the pairwise ensemble\nsimilarities. We evaluate this metric on zero/few shot classification and\nclustering tasks and typically find 10-15 % better performance compared to raw\ncosine similarity. Our code is available at\nhttps://github.com/MeetElise/surprise-similarity.\n","authors":["Thomas C. Bachlechner","Mario Martone","Marjorie Schillo"],"pdf_url":"https://arxiv.org/pdf/2308.09765v2.pdf","comment":"9 pages, 2 figures and 4 tables"},{"id":"http://arxiv.org/abs/2308.11507v1","updated":"2023-08-22T15:28:49Z","published":"2023-08-22T15:28:49Z","title":"Unsupervised Prototype Adapter for Vision-Language Models","summary":" Recently, large-scale pre-trained vision-language models (e.g. CLIP and\nALIGN) have demonstrated remarkable effectiveness in acquiring transferable\nvisual representations. To leverage the valuable knowledge encoded within these\nmodels for downstream tasks, several fine-tuning approaches, including prompt\ntuning methods and adapter-based methods, have been developed to adapt\nvision-language models effectively with supervision. However, these methods\nrely on the availability of annotated samples, which can be labor-intensive and\ntime-consuming to acquire, thus limiting scalability. To address this issue, in\nthis work, we design an unsupervised fine-tuning approach for vision-language\nmodels called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for\nthe unannotated target datasets, we leverage the text-image aligning capability\nof CLIP to automatically select the most confident samples for each class.\nUtilizing these selected samples, we generate class prototypes, which serve as\nthe initialization for the learnable prototype model. After fine-tuning, the\nprototype model prediction is combined with the original CLIP's prediction by a\nresidual connection to perform downstream recognition tasks. Our extensive\nexperimental results on image recognition and domain generalization show that\nthe proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter,\nand also the state-of-the-art UPL method by large margins.\n","authors":["Yi Zhang","Ce Zhang","Xueting Hu","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2308.11507v1.pdf","comment":"Accepted by PRCV 2023"},{"id":"http://arxiv.org/abs/2308.11490v1","updated":"2023-08-22T15:10:45Z","published":"2023-08-22T15:10:45Z","title":"Can Authorship Representation Learning Capture Stylistic Features?","summary":" Automatically disentangling an author's style from the content of their\nwriting is a longstanding and possibly insurmountable problem in computational\nlinguistics. 
At the same time, the availability of large text corpora furnished\nwith author labels has recently enabled learning authorship representations in\na purely data-driven manner for authorship attribution, a task that ostensibly\ndepends to a greater extent on encoding writing style than encoding content.\nHowever, success on this surrogate task does not ensure that such\nrepresentations capture writing style since authorship could also be correlated\nwith other latent variables, such as topic. In an effort to better understand\nthe nature of the information these representations convey, and specifically to\nvalidate the hypothesis that they chiefly encode writing style, we\nsystematically probe these representations through a series of targeted\nexperiments. The results of these experiments suggest that representations\nlearned for the surrogate authorship prediction task are indeed sensitive to\nwriting style. As a consequence, authorship representations may be expected to\nbe robust to certain kinds of data shift, such as topic drift over time.\nAdditionally, our findings may open the door to downstream applications that\nrequire stylistic representations, such as style transfer.\n","authors":["Andrew Wang","Cristina Aggazzotti","Rebecca Kotula","Rafael Rivera Soto","Marcus Bishop","Nicholas Andrews"],"pdf_url":"https://arxiv.org/pdf/2308.11490v1.pdf","comment":"appearing at TACL 2023"},{"id":"http://arxiv.org/abs/2308.11483v1","updated":"2023-08-22T14:54:59Z","published":"2023-08-22T14:54:59Z","title":"Large Language Models Sensitivity to The Order of Options in\n Multiple-Choice Questions","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nvarious NLP tasks. However, previous works have shown these models are\nsensitive towards prompt wording, and few-shot demonstrations and their order,\nposing challenges to fair assessment of these models. As these models become\nmore powerful, it becomes imperative to understand and address these\nlimitations. In this paper, we focus on LLMs robustness on the task of\nmultiple-choice questions -- commonly adopted task to study reasoning and\nfact-retrieving capability of LLMs. Investigating the sensitivity of LLMs\ntowards the order of options in multiple-choice questions, we demonstrate a\nconsiderable performance gap of approximately 13% to 75% in LLMs on different\nbenchmarks, when answer options are reordered, even when using demonstrations\nin a few-shot setting. Through a detailed analysis, we conjecture that this\nsensitivity arises when LLMs are uncertain about the prediction between the\ntop-2/3 choices, and specific options placements may favor certain prediction\nbetween those top choices depending on the question caused by positional bias.\nWe also identify patterns in top-2 choices that amplify or mitigate the model's\nbias toward option placement. We found that for amplifying bias, the optimal\nstrategy involves positioning the top two choices as the first and last\noptions. Conversely, to mitigate bias, we recommend placing these choices among\nthe adjacent options. 
To validate our conjecture, we conduct various\nexperiments and adopt two approaches to calibrate LLMs' predictions, leading to\nup to 8 percentage points improvement across different models and benchmarks.\n","authors":["Pouya Pezeshkpour","Estevam Hruschka"],"pdf_url":"https://arxiv.org/pdf/2308.11483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10482v2","updated":"2023-08-22T14:33:43Z","published":"2023-08-21T05:46:40Z","title":"An Effective Method using Phrase Mechanism in Neural Machine Translation","summary":" Machine Translation is one of the essential tasks in Natural Language\nProcessing (NLP), which has massive applications in real life as well as\ncontributing to other tasks in the NLP research community. Recently,\nTransformer -based methods have attracted numerous researchers in this domain\nand achieved state-of-the-art results in most of the pair languages. In this\npaper, we report an effective method using a phrase mechanism,\nPhraseTransformer, to improve the strong baseline model Transformer in\nconstructing a Neural Machine Translation (NMT) system for parallel corpora\nVietnamese-Chinese. Our experiments on the MT dataset of the VLSP 2022\ncompetition achieved the BLEU score of 35.3 on Vietnamese to Chinese and 33.2\nBLEU scores on Chinese to Vietnamese data. Our code is available at\nhttps://github.com/phuongnm94/PhraseTransformer.\n","authors":["Phuong Minh Nguyen","Le Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.10482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11466v1","updated":"2023-08-22T14:25:15Z","published":"2023-08-22T14:25:15Z","title":"Sentence-Level Multimodal and Language-Agnostic Representations","summary":" We introduce SONAR, a new multilingual and multimodal fixed-size sentence\nembedding space. Our single text encoder, covering 200 languages, substantially\noutperforms existing sentence embeddings such as LASER3 and LabSE on the xsim\nand xsim++ multilingual similarity search tasks. Speech segments can be\nembedded in the same SONAR embedding space using language-specific speech\nencoders trained in a teacher-student setting on speech transcription data. Our\nencoders outperform existing speech encoders on similarity search tasks. We\nalso provide a text decoder for 200 languages, which allows us to perform\ntext-to-text and speech-to-text machine translation, including for zero-shot\nlanguage and modality combinations. Our text-to-text results are competitive\ncompared to the state-of-the-art NLLB~1B model, despite the fixed-size\nbottleneck representation. Our zero-shot speech-to-text translation results\ncompare favorably with strong supervised baselines such as Whisper.\n","authors":["Paul-Ambroise Duquenne","Holger Schwenk","Benoît Sagot"],"pdf_url":"https://arxiv.org/pdf/2308.11466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07312v2","updated":"2023-08-22T14:10:19Z","published":"2023-07-14T12:45:03Z","title":"Using Large Language Models for Zero-Shot Natural Language Generation\n from Knowledge Graphs","summary":" In any system that uses structured knowledge graph (KG) data as its\nunderlying knowledge representation, KG-to-text generation is a useful tool for\nturning parts of the graph data into text that can be understood by humans.\nRecent work has shown that models that make use of pretraining on large amounts\nof text data can perform well on the KG-to-text task even with relatively small\nsets of training data on the specific graph-to-text task. 
In this paper, we\nbuild on this concept by using large language models to perform zero-shot\ngeneration based on nothing but the model's understanding of the triple\nstructure from what it can read. We show that ChatGPT achieves near\nstate-of-the-art performance on some measures of the WebNLG 2020 challenge, but\nfalls behind on others. Additionally, we compare factual, counter-factual and\nfictional statements, and show that there is a significant connection between\nwhat the LLM already knows about the data it is parsing and the quality of the\noutput text.\n","authors":["Agnes Axelsson","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2307.07312v2.pdf","comment":"9 pages, 3 pages appendices, 1 figure, 4 tables (incl. appendices)"},{"id":"http://arxiv.org/abs/2308.11447v1","updated":"2023-08-22T13:55:36Z","published":"2023-08-22T13:55:36Z","title":"Aspect-oriented Opinion Alignment Network for Aspect-Based Sentiment\n Classification","summary":" Aspect-based sentiment classification is a crucial problem in fine-grained\nsentiment analysis, which aims to predict the sentiment polarity of the given\naspect according to its context. Previous works have made remarkable progress\nin leveraging attention mechanism to extract opinion words for different\naspects. However, a persistent challenge is the effective management of\nsemantic mismatches, which stem from attention mechanisms that fall short in\nadequately aligning opinions words with their corresponding aspect in\nmulti-aspect sentences. To address this issue, we propose a novel\nAspect-oriented Opinion Alignment Network (AOAN) to capture the contextual\nassociation between opinion words and the corresponding aspect. Specifically,\nwe first introduce a neighboring span enhanced module which highlights various\ncompositions of neighboring words and given aspects. In addition, we design a\nmulti-perspective attention mechanism that align relevant opinion information\nwith respect to the given aspect. Extensive experiments on three benchmark\ndatasets demonstrate that our model achieves state-of-the-art results. The\nsource code is available at https://github.com/AONE-NLP/ABSA-AOAN.\n","authors":["Xueyi Liu","Rui Hou","Yanglei Gan","Da Luo","Changlin Li","Xiaojun Shi","Qiao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11447v1.pdf","comment":"8 pages, 5 figure, ECAI 2023"},{"id":"http://arxiv.org/abs/2308.11432v1","updated":"2023-08-22T13:30:37Z","published":"2023-08-22T13:30:37Z","title":"A Survey on Large Language Model based Autonomous Agents","summary":" Autonomous agents have long been a prominent research topic in the academic\ncommunity. Previous research in this field often focuses on training agents\nwith limited knowledge within isolated environments, which diverges\nsignificantly from the human learning processes, and thus makes the agents hard\nto achieve human-like decisions. Recently, through the acquisition of vast\namounts of web knowledge, large language models (LLMs) have demonstrated\nremarkable potential in achieving human-level intelligence. This has sparked an\nupsurge in studies investigating autonomous agents based on LLMs. To harness\nthe full potential of LLMs, researchers have devised diverse agent\narchitectures tailored to different applications. In this paper, we present a\ncomprehensive survey of these studies, delivering a systematic review of the\nfield of autonomous agents from a holistic perspective. 
More specifically, our\nfocus lies in the construction of LLM-based agents, for which we propose a\nunified framework that encompasses a majority of the previous work.\nAdditionally, we provide a summary of the various applications of LLM-based AI\nagents in the domains of social science, natural science, and engineering.\nLastly, we discuss the commonly employed evaluation strategies for LLM-based AI\nagents. Based on the previous studies, we also present several challenges and\nfuture directions in this field. To keep track of this field and continuously\nupdate our survey, we maintain a repository for the related references at\nhttps://github.com/Paitesanshi/LLM-Agent-Survey.\n","authors":["Lei Wang","Chen Ma","Xueyang Feng","Zeyu Zhang","Hao Yang","Jingsen Zhang","Zhiyuan Chen","Jiakai Tang","Xu Chen","Yankai Lin","Wayne Xin Zhao","Zhewei Wei","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.11432v1.pdf","comment":"32 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.11411v1","updated":"2023-08-22T13:00:13Z","published":"2023-08-22T13:00:13Z","title":"Extracting Relational Triples Based on Graph Recursive Neural Network\n via Dynamic Feedback Forest Algorithm","summary":" Extracting relational triples (subject, predicate, object) from text enables\nthe transformation of unstructured text data into structured knowledge. The\nnamed entity recognition (NER) and the relation extraction (RE) are two\nfoundational subtasks in this knowledge generation pipeline. The integration of\nsubtasks poses a considerable challenge due to their disparate nature. This\npaper presents a novel approach that converts the triple extraction task into a\ngraph labeling problem, capitalizing on the structural information of\ndependency parsing and graph recursive neural networks (GRNNs). To integrate\nsubtasks, this paper proposes a dynamic feedback forest algorithm that connects\nthe representations of subtasks by inference operations during model training.\nExperimental results demonstrate the effectiveness of the proposed method.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.11411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11380v1","updated":"2023-08-22T12:09:30Z","published":"2023-08-22T12:09:30Z","title":"Convoifilter: A case study of doing cocktail party speech recognition","summary":" This paper presents an end-to-end model designed to improve automatic speech\nrecognition (ASR) for a particular speaker in a crowded, noisy environment. The\nmodel utilizes a single-channel speech enhancement module that isolates the\nspeaker's voice from background noise, along with an ASR module. Through this\napproach, the model is able to decrease the word error rate (WER) of ASR from\n80% to 26.4%. Typically, these two components are adjusted independently due to\nvariations in data requirements. However, speech enhancement can create\nanomalies that decrease ASR efficiency. 
By implementing a joint fine-tuning\nstrategy, the model can reduce the WER from 26.4% in separate tuning to 14.5%\nin joint tuning.\n","authors":["Thai-Binh Nguyen","Alexander Waibel"],"pdf_url":"https://arxiv.org/pdf/2308.11380v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2308.11351v1","updated":"2023-08-22T11:00:09Z","published":"2023-08-22T11:00:09Z","title":"M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product\n Summarization in E-commerce","summary":" Given the long textual product information and the product image, Multi-Modal\nProduct Summarization (MMPS) aims to attract customers' interest and increase\ntheir desire to purchase by highlighting product characteristics with a short\ntextual summary. Existing MMPS methods have achieved promising performance.\nNevertheless, there still exist several problems: 1) lack of end-to-end product\nsummarization, 2) lack of multi-grained multi-modal modeling, and 3) lack of\nmulti-modal attribute modeling. To address these issues, we propose an\nend-to-end multi-grained multi-modal attribute-aware product summarization\nmethod (M3PS) for generating high-quality product summaries in e-commerce. M3PS\njointly models product attributes and generates product summaries. Meanwhile,\nwe design several multi-grained multi-modal tasks to better guide the\nmulti-modal learning of M3PS. Furthermore, we model product attributes based on\nboth text and image modalities so that multi-modal product characteristics can\nbe manifested in the generated summaries. Extensive experiments on a real\nlarge-scale Chinese e-commerce dataset demonstrate that our model outperforms\nstate-of-the-art product summarization methods w.r.t. several summarization\nmetrics.\n","authors":["Tao Chen","Ze Lin","Hui Li","Jiayi Ji","Yiyi Zhou","Guanbin Li","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10390v2","updated":"2023-08-22T10:52:41Z","published":"2023-08-20T23:47:23Z","title":"LibriSQA: Advancing Free-form and Open-ended Spoken Question Answering\n with a Novel Dataset and Framework","summary":" While Large Language Models (LLMs) have demonstrated commendable performance\nacross a myriad of domains and tasks, existing LLMs still exhibit a palpable\ndeficit in handling multimodal functionalities, especially for the Spoken\nQuestion Answering (SQA) task which necessitates precise alignment and deep\ninteraction between speech and text features. To address the SQA challenge on\nLLMs, we initially curated the free-form and open-ended LibriSQA dataset from\nLibrispeech, comprising Part I with natural conversational formats and Part II\nencompassing multiple-choice questions followed by answers and analytical\nsegments. Both parts collectively include 107k SQA pairs that cover various\ntopics. Given the evident paucity of existing speech-text LLMs, we propose a\nlightweight, end-to-end framework to execute the SQA task on the LibriSQA,\nwitnessing significant results. By reforming ASR into the SQA format, we\nfurther substantiate our framework's capability in handling ASR tasks. Our\nempirical findings bolster the LLMs' aptitude for aligning and comprehending\nmultimodal information, paving the way for the development of universal\nmultimodal LLMs. 
The dataset and demo can be found at\nhttps://github.com/ZihanZhaoSJTU/LibriSQA.\n","authors":["Zihan Zhao","Yiyang Jiang","Heyang Liu","Yanfeng Wang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10390v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03099v2","updated":"2023-08-22T09:48:20Z","published":"2023-08-06T12:28:24Z","title":"LARCH: Large Language Model-based Automatic Readme Creation with\n Heuristics","summary":" Writing a readme is a crucial aspect of software development as it plays a\nvital role in managing and reusing program code. Though it is a pain point for\nmany developers, automatically creating one remains a challenge even with the\nrecent advancements in large language models (LLMs), because it requires\ngenerating an abstract description from thousands of lines of code. In this\ndemo paper, we show that LLMs are capable of generating a coherent and\nfactually correct readmes if we can identify a code fragment that is\nrepresentative of the repository. Building upon this finding, we developed\nLARCH (LLM-based Automatic Readme Creation with Heuristics) which leverages\nrepresentative code identification with heuristics and weak supervision.\nThrough human and automated evaluations, we illustrate that LARCH can generate\ncoherent and factually correct readmes in the majority of cases, outperforming\na baseline that does not rely on representative code identification. We have\nmade LARCH open-source and provided a cross-platform Visual Studio Code\ninterface and command-line interface, accessible at\nhttps://github.com/hitachi-nlp/larch. A demo video showcasing LARCH's\ncapabilities is available at https://youtu.be/ZUKkh5ED-O4.\n","authors":["Yuta Koreeda","Terufumi Morishita","Osamu Imaichi","Yasuhiro Sogawa"],"pdf_url":"https://arxiv.org/pdf/2308.03099v2.pdf","comment":"This is a pre-print of a paper accepted at CIKM'23 Demo. Refer to the\n DOI URL for the original publication"},{"id":"http://arxiv.org/abs/2308.11284v1","updated":"2023-08-22T08:51:10Z","published":"2023-08-22T08:51:10Z","title":"LEAP: Efficient and Automated Test Method for NLP Software","summary":" The widespread adoption of DNNs in NLP software has highlighted the need for\nrobustness. Researchers proposed various automatic testing techniques for\nadversarial test cases. However, existing methods suffer from two limitations:\nweak error-discovering capabilities, with success rates ranging from 0% to\n24.6% for BERT-based NLP software, and time inefficiency, taking 177.8s to\n205.28s per test case, making them challenging for time-constrained scenarios.\nTo address these issues, this paper proposes LEAP, an automated test method\nthat uses LEvy flight-based Adaptive Particle swarm optimization integrated\nwith textual features to generate adversarial test cases. Specifically, we\nadopt Levy flight for population initialization to increase the diversity of\ngenerated test cases. We also design an inertial weight adaptive update\noperator to improve the efficiency of LEAP's global optimization of\nhigh-dimensional text examples and a mutation operator based on the greedy\nstrategy to reduce the search time. We conducted a series of experiments to\nvalidate LEAP's ability to test NLP software and found that the average success\nrate of LEAP in generating adversarial test cases is 79.1%, which is 6.1%\nhigher than the next best approach (PSOattack). While ensuring high success\nrates, LEAP significantly reduces time overhead by up to 147.6s compared to\nother heuristic-based methods. 
Additionally, the experimental results\ndemonstrate that LEAP can generate more transferable test cases and\nsignificantly enhance the robustness of DNN-based systems.\n","authors":["Mingxuan Xiao","Yan Xiao","Hai Dong","Shunhui Ji","Pengcheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11284v1.pdf","comment":"Accepted at ASE 2023"},{"id":"http://arxiv.org/abs/2308.11276v1","updated":"2023-08-22T08:43:33Z","published":"2023-08-22T08:43:33Z","title":"Music Understanding LLaMA: Advancing Text-to-Music Generation with\n Question Answering and Captioning","summary":" Text-to-music generation (T2M-Gen) faces a major obstacle due to the scarcity\nof large-scale publicly available music datasets with natural language\ncaptions. To address this, we propose the Music Understanding LLaMA (MU-LLaMA),\ncapable of answering music-related questions and generating captions for music\nfiles. Our model utilizes audio representations from a pretrained MERT model to\nextract music features. However, obtaining a suitable dataset for training the\nMU-LLaMA model remains challenging, as existing publicly accessible audio\nquestion answering datasets lack the necessary depth for open-ended music\nquestion answering. To fill this gap, we present a methodology for generating\nquestion-answer pairs from existing audio captioning datasets and introduce the\nMusicQA Dataset designed for answering open-ended music-related questions. The\nexperiments demonstrate that the proposed MU-LLaMA model, trained on our\ndesigned MusicQA dataset, achieves outstanding performance in both music\nquestion answering and music caption generation across various metrics,\noutperforming current state-of-the-art (SOTA) models in both fields and\noffering a promising advancement in the T2M-Gen research field.\n","authors":["Shansong Liu","Atin Sakkeer Hussain","Chenshuo Sun","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2308.11276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11257v1","updated":"2023-08-22T08:00:50Z","published":"2023-08-22T08:00:50Z","title":"HopPG: Self-Iterative Program Generation for Multi-Hop Question\n Answering over Heterogeneous Knowledge","summary":" The semantic parsing-based method is an important research branch for\nknowledge-based question answering. It usually generates executable programs\nbased upon the question and then conducts them to reason answers over a knowledge\nbase. Benefiting from this inherent mechanism, it has advantages in\nperformance and interpretability. However, traditional semantic parsing\nmethods usually generate a complete program before executing it, which\nstruggles with multi-hop question answering over heterogeneous knowledge.\nFirstly, a complete multi-hop program relies on multiple heterogeneous\nsupporting facts, and it is difficult for models to receive these facts\nsimultaneously. Secondly, these methods ignore the interaction information\nbetween the previous-hop execution result and the current-hop program\ngeneration. To alleviate these challenges, we propose a self-iterative\nframework for multi-hop program generation (HopPG) over heterogeneous\nknowledge, which leverages the previous-hop execution results to retrieve\nsupporting facts and generate subsequent programs iteratively. We evaluate our\nmodel on MMQA-T^2. 
The experimental results show that HopPG outperforms\nexisting semantic-parsing-based baselines, especially on the multi-hop\nquestions.\n","authors":["Yingyao Wang","Yongwei Zhou","Chaoqun Duan","Junwei Bao","Tiejun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10971v2","updated":"2023-08-22T07:25:43Z","published":"2023-05-18T13:38:36Z","title":"NollySenti: Leveraging Transfer Learning and Machine Translation for\n Nigerian Movie Sentiment Classification","summary":" Africa has over 2000 indigenous languages but they are under-represented in\nNLP research due to a lack of datasets. In recent years, there has been progress\nin developing labeled corpora for African languages. However, they are often\navailable in a single domain and may not generalize to other domains. In this\npaper, we focus on the task of sentiment classification for cross domain\nadaptation. We create a new dataset, NollySenti - based on the Nollywood movie\nreviews for five languages widely spoken in Nigeria (English, Hausa, Igbo,\nNigerian-Pidgin, and Yoruba). We provide an extensive empirical evaluation using\nclassical machine learning methods and pre-trained language models. Leveraging\ntransfer learning, we compare the performance of cross-domain adaptation from\nthe Twitter domain, and cross-lingual adaptation from the English language. Our\nevaluation shows that transfer from English in the same target domain leads to\nmore than 5% improvement in accuracy compared to transfer from Twitter in the\nsame language. To further mitigate the domain difference, we leverage machine\ntranslation (MT) from English to other Nigerian languages, which leads to a\nfurther improvement of 7% over cross-lingual evaluation. While MT to\nlow-resource languages is often of low quality, through human evaluation, we\nshow that most of the translated sentences preserve the sentiment of the\noriginal English reviews.\n","authors":["Iyanuoluwa Shode","David Ifeoluwa Adelani","Jing Peng","Anna Feldman"],"pdf_url":"https://arxiv.org/pdf/2305.10971v2.pdf","comment":"Accepted to ACL 2023 (main conference)"},{"id":"http://arxiv.org/abs/2308.11224v1","updated":"2023-08-22T06:32:07Z","published":"2023-08-22T06:32:07Z","title":"Evaluating Large Language Models on Graphs: Performance Insights and\n Comparative Analysis","summary":" Large Language Models (LLMs) have garnered considerable interest within both\nacademia and industry. Yet, the application of LLMs to graph data remains\nunder-explored. In this study, we evaluate the capabilities of four LLMs in\naddressing several analytical problems with graph data. We employ four distinct\nevaluation metrics: Comprehension, Correctness, Fidelity, and Rectification.\nOur results show that: 1) LLMs effectively comprehend graph data in natural\nlanguage and reason with graph topology. 2) GPT models can generate logical and\ncoherent results, outperforming alternatives in correctness. 3) All examined\nLLMs face challenges in structural reasoning, with techniques like zero-shot\nchain-of-thought and few-shot prompting showing diminished efficacy. 4) GPT\nmodels often produce erroneous answers in multi-answer tasks, raising concerns\nin fidelity. 5) GPT models exhibit elevated confidence in their outputs,\npotentially hindering their rectification capacities. Notably, GPT-4 has\ndemonstrated the capacity to rectify responses from GPT-3.5-turbo and its own\nprevious iterations. 
The code is available at:\nhttps://github.com/Ayame1006/LLMtoGraph.\n","authors":["Chang Liu","Bo Wu"],"pdf_url":"https://arxiv.org/pdf/2308.11224v1.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.11189v1","updated":"2023-08-22T04:49:23Z","published":"2023-08-22T04:49:23Z","title":"Diversity Measures: Domain-Independent Proxies for Failure in Language\n Model Queries","summary":" Error prediction in large language models often relies on domain-specific\ninformation. In this paper, we present measures for quantification of error in\nthe response of a large language model based on the diversity of responses to a\ngiven prompt - hence independent of the underlying application. We describe how\nthree such measures - based on entropy, Gini impurity, and centroid distance -\ncan be employed. We perform a suite of experiments on multiple datasets and\ntemperature settings to demonstrate that these measures strongly correlate with\nthe probability of failure. Additionally, we present empirical results\ndemonstrating how these measures can be applied to few-shot prompting,\nchain-of-thought reasoning, and error detection.\n","authors":["Noel Ngu","Nathaniel Lee","Paulo Shakarian"],"pdf_url":"https://arxiv.org/pdf/2308.11189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11171v1","updated":"2023-08-22T04:01:01Z","published":"2023-08-22T04:01:01Z","title":"ViCo: Engaging Video Comment Generation with Human Preference Rewards","summary":" Engaging video comments play an important role in video social media, as they\nare the carrier of feelings, thoughts, or humor of the audience. Preliminary\nworks have made initial exploration for video comment generation by adopting\ncaption-style encoder-decoder models. However, comment generation presents some\nunique challenges distinct from caption generation, which makes these methods\nsomewhat less effective at generating engaging comments. In contrast to the\nobjective and descriptive nature of captions, comments tend to be inherently\nsubjective, making it hard to quantify and evaluate the engagement of comments.\nFurthermore, the scarcity of truly engaging comments brings difficulty to\ncollecting enough high-quality training examples. In this paper, we propose\nViCo with three novel designs to tackle the above challenges for generating\nengaging Video Comments. Firstly, to quantify the engagement of comments, we\nutilize the number of \"likes\" each comment receives as a proxy of human\npreference after an appropriate debiasing procedure. Secondly, to automatically\nevaluate the engagement of comments, we train a reward model to align its\njudgment to the above proxy. Our user studies indicate that this reward model\neffectively aligns with human judgments. Lastly, to alleviate the scarcity of\nhigh-quality comments, an initial generator is trained on readily available but\nnoisy data to generate comments. Then the reward model is employed to offer\nfeedback on the generated comments, thus optimizing the initial generator. 
To\nfacilitate the research of video commenting, we collect a large video\ncomment-dataset (ViCo-20k) with rich metadata from a popular video website.\nExperiments on ViCo-20k show that the comments generated by our ViCo model\nexhibit the best performance in terms of both quantitative and qualitative\nresults, particularly when engagement is considered.\n","authors":["Yuchong Sun","Bei Liu","Xu Chen","Ruihua Song","Jianlong Fu"],"pdf_url":"https://arxiv.org/pdf/2308.11171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08742v2","updated":"2023-08-22T03:19:16Z","published":"2023-08-17T02:33:43Z","title":"PMET: Precise Model Editing in a Transformer","summary":" Model editing techniques modify a minor proportion of knowledge in Large\nLanguage Models (LLMs) at a relatively low cost, which have demonstrated\nnotable success. Existing methods assume Transformer Layer (TL) hidden states\nare values of key-value memories of the Feed-Forward Network (FFN). They\nusually optimize the TL hidden states to memorize target knowledge and use it\nto update the weights of the FFN in LLMs. However, the information flow of TL\nhidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN,\nand residual connections. Existing methods neglect the fact that the TL hidden\nstates contains information not specifically required for FFN. Consequently,\nthe performance of model editing decreases. To achieve more precise model\nediting, we analyze hidden states of MHSA and FFN, finding that MHSA encodes\ncertain general knowledge extraction patterns. This implies that MHSA weights\ndo not require updating when new knowledge is introduced. Based on above\nfindings, we introduce PMET, which simultaneously optimizes Transformer\nComponent (TC, namely MHSA and FFN) hidden states, while only using the\noptimized TC hidden states of FFN to precisely update FFN weights. Our\nexperiments demonstrate that PMET exhibits state-of-the-art performance on both\nthe COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the\neffectiveness of our enhancements, further reinforcing the finding that the\nMHSA encodes certain general knowledge extraction patterns and indicating its\nstorage of a small amount of factual knowledge. Our code is available at\nhttps://github.com/xpq-tech/PMET.git.\n","authors":["Xiaopeng Li","Shasha Li","Shezheng Song","Jing Yang","Jun Ma","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08742v2.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2304.01852v4","updated":"2023-08-22T03:18:43Z","published":"2023-04-04T15:01:06Z","title":"Summary of ChatGPT-Related Research and Perspective Towards the Future\n of Large Language Models","summary":" This paper presents a comprehensive survey of ChatGPT-related (GPT-3.5 and\nGPT-4) research, state-of-the-art large language models (LLM) from the GPT\nseries, and their prospective applications across diverse domains. Indeed, key\ninnovations such as large-scale pre-training that captures knowledge across the\nentire world wide web, instruction fine-tuning and Reinforcement Learning from\nHuman Feedback (RLHF) have played significant roles in enhancing LLMs'\nadaptability and performance. We performed an in-depth analysis of 194 relevant\npapers on arXiv, encompassing trend analysis, word cloud representation, and\ndistribution analysis across various application domains. 
The findings reveal a\nsignificant and increasing interest in ChatGPT-related research, predominantly\ncentered on direct natural language processing applications, while also\ndemonstrating considerable potential in areas ranging from education and\nhistory to mathematics, medicine, and physics. This study endeavors to furnish\ninsights into ChatGPT's capabilities, potential implications, ethical concerns,\nand offer direction for future advancements in this field.\n","authors":["Yiheng Liu","Tianle Han","Siyuan Ma","Jiayue Zhang","Yuanyuan Yang","Jiaming Tian","Hao He","Antong Li","Mengshen He","Zhengliang Liu","Zihao Wu","Lin Zhao","Dajiang Zhu","Xiang Li","Ning Qiang","Dingang Shen","Tianming Liu","Bao Ge"],"pdf_url":"https://arxiv.org/pdf/2304.01852v4.pdf","comment":"21 pages, 4 figures, accepted by Meta-Radiology"},{"id":"http://arxiv.org/abs/2308.11148v1","updated":"2023-08-22T03:10:40Z","published":"2023-08-22T03:10:40Z","title":"LLaMA-Reviewer: Advancing Code Review Automation with Large Language\n Models through Parameter-Efficient Fine-Tuning (Practical Experience Report)","summary":" The automation of code review activities, a long-standing pursuit in software\nengineering, has been primarily addressed by numerous domain-specific\npre-trained models. Despite their success, these models frequently demand\nextensive resources for pre-training from scratch. In contrast, Large Language\nModels (LLMs) provide an intriguing alternative, given their remarkable\ncapabilities when supplemented with domain-specific knowledge. However, their\npotential for automating code review tasks remains largely unexplored.\n In response to this research gap, we present LLaMA-Reviewer, an innovative\nframework that leverages the capabilities of LLaMA, a popular LLM, in the realm\nof code review. Mindful of resource constraints, this framework employs\nparameter-efficient fine-tuning (PEFT) methods, delivering high performance\nwhile using less than 1% of trainable parameters.\n An extensive evaluation of LLaMA-Reviewer is conducted on two diverse,\npublicly available datasets. Notably, even with the smallest LLaMA base model\nconsisting of 6.7B parameters and a limited number of tuning epochs,\nLLaMA-Reviewer equals the performance of existing code-review-focused models.\n The ablation experiments provide insights into the influence of various\nfine-tuning process components, including input representation, instruction\ntuning, and different PEFT methods. To foster continuous progress in this\nfield, the code and all PEFT-weight plugins have been made open-source.\n","authors":["Junyi Lu","Lei Yu","Xiaojia Li","Li Yang","Chun Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.11148v1.pdf","comment":"Accepted to the 34th IEEE International Symposium on Software\n Reliability Engineering (ISSRE 2023)"},{"id":"http://arxiv.org/abs/2308.10755v2","updated":"2023-08-22T02:57:45Z","published":"2023-08-21T14:40:48Z","title":"WanJuan: A Comprehensive Multimodal Dataset for Advancing English and\n Chinese Large Models","summary":" The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the\ndevelopment of large models, leading to the creation of numerous impressive\nlarge language models(LLMs) and multimodal large language models (MLLMs). These\ncutting-edge models owe their remarkable performance to high-quality data.\nHowever, the details of the training data used in leading paradigms are often\nkept confidential. 
This lack of transparency, coupled with the scarcity of\nopen-source data, impedes further developments within the community. As a\nresponse, this paper presents \"Wan Juan\", a large-scale multimodal dataset\ncomposed of both Chinese and English data, collected from a wide range of web\nsources. The dataset incorporates text, image-text, and video modalities, with\na total volume exceeding 2TB. It was utilized in the training of InternLM, a\nmodel that demonstrated significant advantages in multi-dimensional evaluations\nwhen compared to models of a similar scale. All data can be accessed at\nhttps://opendatalab.org.cn/WanJuan1.0.\n","authors":["Conghui He","Zhenjiang Jin","Chao Xu","Jiantao Qiu","Bin Wang","Wei Li","Hang Yan","Jiaqi Wang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.10755v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.10195v2","updated":"2023-08-22T02:55:39Z","published":"2023-08-20T07:56:34Z","title":"WMFormer++: Nested Transformer for Visible Watermark Removal via Implict\n Joint Learning","summary":" Watermarking serves as a widely adopted approach to safeguard media\ncopyright. In parallel, the research focus has extended to watermark removal\ntechniques, offering an adversarial means to enhance watermark robustness and\nfoster advancements in the watermarking field. Existing watermark removal\nmethods mainly rely on UNet with task-specific decoder branches--one for\nwatermark localization and the other for background image restoration. However,\nwatermark localization and background restoration are not isolated tasks;\nprecise watermark localization inherently implies regions necessitating\nrestoration, and the background restoration process contributes to more\naccurate watermark localization. To holistically integrate information from\nboth branches, we introduce an implicit joint learning paradigm. This empowers\nthe network to autonomously navigate the flow of information between implicit\nbranches through a gate mechanism. Furthermore, we employ cross-channel\nattention to facilitate local detail restoration and holistic structural\ncomprehension, while harnessing nested structures to integrate multi-scale\ninformation. Extensive experiments are conducted on various challenging\nbenchmarks to validate the effectiveness of our proposed method. The results\ndemonstrate our approach's remarkable superiority, surpassing existing\nstate-of-the-art methods by a large margin.\n","authors":["Dongjian Huo","Zehong Zhang","Hanjing Su","Guanbin Li","Chaowei Fang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11138v1","updated":"2023-08-22T02:39:42Z","published":"2023-08-22T02:39:42Z","title":"NLP-based detection of systematic anomalies among the narratives of\n consumer complaints","summary":" We develop an NLP-based procedure for detecting systematic nonmeritorious\nconsumer complaints, simply called systematic anomalies, among complaint\nnarratives. While classification algorithms are used to detect pronounced\nanomalies, in the case of smaller and frequent systematic anomalies, the\nalgorithms may falter due to a variety of reasons, including technical ones as\nwell as natural limitations of human analysts. Therefore, as the next step\nafter classification, we convert the complaint narratives into quantitative\ndata, which are then analyzed using an algorithm for detecting systematic\nanomalies. 
We illustrate the entire procedure using complaint narratives from\nthe Consumer Complaint Database of the Consumer Financial Protection Bureau.\n","authors":["Peiheng Gao","Ning Sun","Xuefeng Wang","Chen Yang","Ričardas Zitikis"],"pdf_url":"https://arxiv.org/pdf/2308.11138v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02242v2","updated":"2023-08-22T02:34:19Z","published":"2023-03-03T22:19:22Z","title":"TrojText: Test-time Invisible Textual Trojan Insertion","summary":" In Natural Language Processing (NLP), intelligent neuron models can be\nsusceptible to textual Trojan attacks. Such attacks occur when Trojan models\nbehave normally for standard inputs but generate malicious output for inputs\nthat contain a specific trigger. Syntactic-structure triggers, which are\ninvisible, are becoming more popular for Trojan attacks because they are\ndifficult to detect and defend against. However, these types of attacks require\na large corpus of training data to generate poisoned samples with the necessary\nsyntactic structures for Trojan insertion. Obtaining such data can be difficult\nfor attackers, and the process of generating syntactic poisoned triggers and\ninserting Trojans can be time-consuming. This paper proposes a solution called\nTrojText, which aims to determine whether invisible textual Trojan attacks can\nbe performed more efficiently and cost-effectively without training data. The\nproposed approach, called the Representation-Logit Trojan Insertion (RLI)\nalgorithm, uses smaller sampled test data instead of large training data to\nachieve the desired attack. The paper also introduces two additional\ntechniques, namely the accumulated gradient ranking (AGR) and Trojan Weights\nPruning (TWP), to reduce the number of tuned parameters and the attack\noverhead. The TrojText approach was evaluated on three datasets (AG's News,\nSST-2, and OLID) using three NLP models (BERT, XLNet, and DeBERTa). The\nexperiments demonstrated that the TrojText approach achieved a 98.35\\%\nclassification accuracy for test sentences in the target class on the BERT\nmodel for the AG's News dataset. The source code for TrojText is available at\nhttps://github.com/UCF-ML-Research/TrojText.\n","authors":["Qian Lou","Yepeng Liu","Bo Feng"],"pdf_url":"https://arxiv.org/pdf/2303.02242v2.pdf","comment":"In The Eleventh International Conference on Learning Representations.\n 2023 (ICLR 2023)"},{"id":"http://arxiv.org/abs/2308.11103v1","updated":"2023-08-22T00:57:36Z","published":"2023-08-22T00:57:36Z","title":"Anonymity at Risk? Assessing Re-Identification Capabilities of Large\n Language Models","summary":" Anonymity of both natural and legal persons in court rulings is a critical\naspect of privacy protection in the European Union and Switzerland. With the\nadvent of LLMs, concerns about large-scale re-identification of anonymized\npersons are growing. In accordance with the Federal Supreme Court of\nSwitzerland, we explore the potential of LLMs to re-identify individuals in\ncourt rulings by constructing a proof-of-concept using actual legal data from\nthe Swiss federal supreme court. Following the initial experiment, we\nconstructed an anonymized Wikipedia dataset as a more rigorous testing ground\nto further investigate the findings. With the introduction and application of\nthe new task of re-identifying people in texts, we also introduce new metrics\nto measure performance. 
We systematically analyze the factors that influence\nsuccessful re-identifications, identifying model size, input length, and\ninstruction tuning among the most critical determinants. Despite high\nre-identification rates on Wikipedia, even the best LLMs struggled with court\ndecisions. The complexity is attributed to the lack of test datasets, the\nnecessity for substantial training resources, and data sparsity in the\ninformation used for re-identification. In conclusion, this study demonstrates\nthat re-identification using LLMs may not be feasible for now, but as the\nproof-of-concept on Wikipedia showed, it might become possible in the future.\nWe hope that our system can help enhance the confidence in the security of\nanonymized decisions, thus leading to the courts being more confident to\npublish decisions.\n","authors":["Alex Nyffenegger","Matthias Stürmer","Joel Niklaus"],"pdf_url":"https://arxiv.org/pdf/2308.11103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08158v3","updated":"2023-08-22T00:55:21Z","published":"2023-06-13T22:07:54Z","title":"Survey on Sociodemographic Bias in Natural Language Processing","summary":" Deep neural networks often learn unintended bias during training, which might\nhave harmful effects when deployed in real-world settings. This work surveys\n214 papers related to sociodemographic bias in natural language processing\n(NLP). In this study, we aim to provide a more comprehensive understanding of\nthe similarities and differences among approaches to sociodemographic bias in\nNLP. To better understand the distinction between bias and real-world harm, we\nturn to ideas from psychology and behavioral economics to propose a definition\nfor sociodemographic bias. We identify three main categories of NLP bias\nresearch: types of bias, quantifying bias, and debiasing techniques. We\nhighlight the current trends in quantifying bias and debiasing techniques,\noffering insights into their strengths and weaknesses. We conclude that current\napproaches on quantifying bias face reliability issues, that many of the bias\nmetrics do not relate to real-world bias, and that debiasing techniques need to\nfocus more on training methods. Finally, we provide recommendations for future\nwork.\n","authors":["Vipul Gupta","Pranav Narayanan Venkit","Shomir Wilson","Rebecca J. Passonneau"],"pdf_url":"https://arxiv.org/pdf/2306.08158v3.pdf","comment":"23 pages, 1 figure"},{"id":"http://arxiv.org/abs/2301.09767v3","updated":"2023-08-22T00:22:42Z","published":"2023-01-24T00:32:56Z","title":"Truveta Mapper: A Zero-shot Ontology Alignment Framework","summary":" In this paper, a new perspective is suggested for unsupervised Ontology\nMatching (OM) or Ontology Alignment (OA) by treating it as a translation task.\nOntologies are represented as graphs, and the translation is performed from a\nnode in the source ontology graph to a path in the target ontology graph. The\nproposed framework, Truveta Mapper (TM), leverages a multi-task\nsequence-to-sequence transformer model to perform alignment across multiple\nontologies in a zero-shot, unified and end-to-end manner. Multi-tasking enables\nthe model to implicitly learn the relationship between different ontologies via\ntransfer-learning without requiring any explicit cross-ontology manually\nlabeled data. This also enables the formulated framework to outperform existing\nsolutions for both runtime latency and alignment quality. 
The model is\npre-trained and fine-tuned only on a publicly available text corpus and\ninner-ontologies data. The proposed solution outperforms state-of-the-art\napproaches, Edit-Similarity, LogMap, AML, BERTMap, and the recently presented\nnew OM frameworks in Ontology Alignment Evaluation Initiative (OAEI22), offers\nlog-linear complexity, and overall makes the OM task efficient and more\nstraightforward without much post-processing involving mapping extension or\nmapping repair. We are open sourcing our solution.\n","authors":["Mariyam Amir","Murchana Baruah","Mahsa Eslamialishah","Sina Ehsani","Alireza Bahramali","Sadra Naddaf-Sh","Saman Zarandioon"],"pdf_url":"https://arxiv.org/pdf/2301.09767v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19370v2","updated":"2023-08-22T00:19:05Z","published":"2023-05-30T19:25:51Z","title":"Blockwise Parallel Transformer for Long Context Large Models","summary":" Transformers have emerged as the cornerstone of state-of-the-art natural\nlanguage processing models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands posed by the\nself-attention mechanism and the large feedforward network in Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving multiple long sequences or long-term dependencies. We present a\ndistinct approach, Blockwise Parallel Transformer (BPT), that leverages\nblockwise computation of self-attention and feedforward network fusion to\nminimize memory costs. By processing longer input sequences while maintaining\nmemory efficiency, BPT enables training sequences up to 32 times longer than\nvanilla Transformers and 2 to 4 times longer than previous memory-efficient\nmethods. Extensive experiments on language modeling and reinforcement learning\ntasks demonstrate the effectiveness of BPT in reducing memory requirements and\nimproving performance.\n","authors":["Hao Liu","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2305.19370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11827v1","updated":"2023-08-22T23:18:53Z","published":"2023-08-22T23:18:53Z","title":"Exploring the Effectiveness of GPT Models in Test-Taking: A Case Study\n of the Driver's License Knowledge Test","summary":" Large language models such as OpenAI's Generative Pre-trained Transformer\n(GPT) models are proficient at answering questions, but their knowledge is\nconfined to the information present in their training data. This limitation\nrenders them ineffective when confronted with questions about recent\ndevelopments or non-public documents. Our research proposes a method that\nenables GPT models to answer questions by employing context from an information\nsource not previously included in their training data. The methodology includes\npreprocessing of contextual information, the embedding of contexts and queries,\nconstructing prompts through the integration of context embeddings, and\ngenerating answers using GPT models. We applied this method in a controlled\ntest scenario using the California Driver's Handbook as the information source.\nThe GPT-3 model achieved a 96% passing score on a set of 50 sample driving\nknowledge test questions. In contrast, without context, the model's passing\nscore fell to 82%. However, the model still fails to answer some questions\ncorrectly even when provided with the library of context, highlighting room for\nimprovement. 
The research also examined the impact of prompt length and context\nformat on the model's performance. Overall, the study provides insights into\nthe limitations and potential improvements for GPT models in question-answering\ntasks.\n","authors":["Saba Rahimi","Tucker Balch","Manuela Veloso"],"pdf_url":"https://arxiv.org/pdf/2308.11827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.01368v3","updated":"2023-08-22T22:52:54Z","published":"2022-08-02T11:27:36Z","title":"PyABSA: A Modularized Framework for Reproducible Aspect-based Sentiment\n Analysis","summary":" The advancement of aspect-based sentiment analysis (ABSA) has highlighted the lack\nof a user-friendly framework that can largely lower the difficulty of\nreproducing state-of-the-art ABSA performance, especially for beginners. To\nmeet the demand, we present PyABSA, a modularized framework built on PyTorch for\nreproducible ABSA. To facilitate ABSA research, PyABSA supports several ABSA\nsubtasks, including aspect term extraction, aspect sentiment classification,\nand end-to-end aspect-based sentiment analysis. Concretely, PyABSA integrates\n29 models and 26 datasets. With just a few lines of code, the result of a model\non a specific dataset can be reproduced. With a modularized design, PyABSA can\nalso be flexibly extended to considered models, datasets, and other related\ntasks. Besides, PyABSA highlights its data augmentation and annotation\nfeatures, which significantly address data scarcity. All are welcome to have a\ntry at https://github.com/yangheng95/PyABSA.\n","authors":["Heng Yang","Chen Zhang","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2208.01368v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11807v1","updated":"2023-08-22T22:18:38Z","published":"2023-08-22T22:18:38Z","title":"Towards an On-device Agent for Text Rewriting","summary":" Large Language Models (LLMs) have demonstrated impressive capabilities for\ntext rewriting. Nonetheless, the large sizes of these models make them\nimpractical for on-device inference, which would otherwise allow for enhanced\nprivacy and economical inference. Creating a smaller yet potent language model\nfor text rewriting presents a formidable challenge because it requires\nbalancing the need for a small size with the need to retain the emergent\ncapabilities of the LLM, which requires costly data collection. To address the\nabove challenge, we introduce a new instruction tuning approach for building a\nmobile-centric text rewriting model. Our strategies enable the generation of\nhigh-quality training data without any human labeling. In addition, we propose\na heuristic reinforcement learning framework which substantially enhances\nperformance without requiring preference data. To further bridge the\nperformance gap with the larger server-side model, we propose an effective\napproach that combines the mobile rewrite agent with the server model using a\ncascade. To tailor the text rewriting tasks to mobile scenarios, we introduce\nMessageRewriteEval, a benchmark that focuses on text rewriting for messages\nthrough natural language instructions. 
Through empirical experiments, we\ndemonstrate that our on-device model surpasses the current state-of-the-art\nLLMs in text rewriting while maintaining a significantly reduced model size.\nNotably, we show that our proposed cascading approach improves model\nperformance.\n","authors":["Yun Zhu","Yinxiao Liu","Felix Stahlberg","Shankar Kumar","Yu-hui Chen","Liangchen Luo","Lei Shu","Renjie Liu","Jindong Chen","Lei Meng"],"pdf_url":"https://arxiv.org/pdf/2308.11807v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11780v1","updated":"2023-08-22T20:40:21Z","published":"2023-08-22T20:40:21Z","title":"Few-shot Anomaly Detection in Text with Deviation Learning","summary":" Most current methods for detecting anomalies in text concentrate on\nconstructing models solely relying on unlabeled data. These models operate on\nthe presumption that no labeled anomalous examples are available, which\nprevents them from utilizing prior knowledge of anomalies that are typically\npresent in small numbers in many real-world applications. Furthermore, these\nmodels prioritize learning feature embeddings rather than optimizing anomaly\nscores directly, which could lead to suboptimal anomaly scoring and inefficient\nuse of data during the learning process. In this paper, we introduce FATE, a\ndeep few-shot learning-based framework that leverages limited anomaly examples\nand learns anomaly scores explicitly in an end-to-end method using deviation\nlearning. In this approach, the anomaly scores of normal examples are adjusted\nto closely resemble reference scores obtained from a prior distribution.\nConversely, anomaly samples are forced to have anomalous scores that\nconsiderably deviate from the reference score in the upper tail of the prior.\nAdditionally, our model is optimized to learn the distinct behavior of\nanomalies by utilizing a multi-head self-attention layer and multiple instance\nlearning approaches. Comprehensive experiments on several benchmark datasets\ndemonstrate that our proposed approach attains a new level of state-of-the-art\nperformance.\n","authors":["Anindya Sundar Das","Aravind Ajay","Sriparna Saha","Monowar Bhuyan"],"pdf_url":"https://arxiv.org/pdf/2308.11780v1.pdf","comment":"Accepted in ICONIP 2023"},{"id":"http://arxiv.org/abs/2308.11773v1","updated":"2023-08-22T20:30:59Z","published":"2023-08-22T20:30:59Z","title":"Identifying depression-related topics in smartphone-collected\n free-response speech recordings using an automatic speech recognition system\n and a deep learning topic model","summary":" Language use has been shown to correlate with depression, but large-scale\nvalidation is needed. Traditional methods like clinic studies are expensive.\nSo, natural language processing has been employed on social media to predict\ndepression, but limitations remain-lack of validated labels, biased user\nsamples, and no context. Our study identified 29 topics in 3919\nsmartphone-collected speech recordings from 265 participants using the Whisper\ntool and BERTopic model. Six topics with a median PHQ-8 greater than or equal\nto 10 were regarded as risk topics for depression: No Expectations, Sleep,\nMental Therapy, Haircut, Studying, and Coursework. To elucidate the topic\nemergence and associations with depression, we compared behavioral (from\nwearables) and linguistic characteristics across identified topics. 
The\ncorrelation between topic shifts and changes in depression severity over time\nwas also investigated, indicating the importance of longitudinally monitoring\nlanguage use. We also tested the BERTopic model on a similar smaller dataset\n(356 speech recordings from 57 participants), obtaining some consistent\nresults. In summary, our findings demonstrate specific speech topics may\nindicate depression severity. The presented data-driven workflow provides a\npractical approach to collecting and analyzing large-scale speech data from\nreal-world settings for digital health research.\n","authors":["Yuezhou Zhang","Amos A Folarin","Judith Dineley","Pauline Conde","Valeria de Angel","Shaoxiong Sun","Yatharth Ranjan","Zulqarnain Rashid","Callum Stewart","Petroula Laiou","Heet Sankesara","Linglong Qian","Faith Matcham","Katie M White","Carolin Oetzmann","Femke Lamers","Sara Siddi","Sara Simblett","Björn W. Schuller","Srinivasan Vairavan","Til Wykes","Josep Maria Haro","Brenda WJH Penninx","Vaibhav A Narayan","Matthew Hotopf","Richard JB Dobson","Nicholas Cummins","RADAR-CNS consortium"],"pdf_url":"https://arxiv.org/pdf/2308.11773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11764v1","updated":"2023-08-22T20:12:49Z","published":"2023-08-22T20:12:49Z","title":"Halo: Estimation and Reduction of Hallucinations in Open-Source Weak\n Large Language Models","summary":" Large Language Models (LLMs) have revolutionized Natural Language Processing\n(NLP). Although convenient for research and practical applications, open-source\nLLMs with fewer parameters often suffer from severe hallucinations compared to\ntheir larger counterparts. This paper focuses on measuring and reducing\nhallucinations in BLOOM 7B, a representative of such weaker open-source LLMs\nthat are publicly available for research and commercial applications. We\nintroduce HaloCheck, a lightweight BlackBox knowledge-free framework designed\nto quantify the severity of hallucinations in LLMs. Additionally, we explore\ntechniques like knowledge injection and teacher-student approaches to alleviate\nhallucinations in low-parameter LLMs. Our experiments effectively demonstrate\nthe reduction of hallucinations in challenging domains for these LLMs.\n","authors":["Mohamed Elaraby","Mengyin Lu","Jacob Dunn","Xueying Zhang","Yu Wang","Shizhu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11730v1","updated":"2023-08-22T18:41:31Z","published":"2023-08-22T18:41:31Z","title":"Knowledge Graph Prompting for Multi-Document Question Answering","summary":" The 'pre-train, prompt, predict' paradigm of large language models (LLMs) has\nachieved remarkable success in open-domain question answering (OD-QA). However,\nfew works explore this paradigm in the scenario of multi-document question\nanswering (MD-QA), a task demanding a thorough understanding of the logical\nassociations among the contents and structures of different documents. To fill\nthis crucial gap, we propose a Knowledge Graph Prompting (KGP) method to\nformulate the right context in prompting LLMs for MD-QA, which consists of a\ngraph construction module and a graph traversal module. For graph construction,\nwe create a knowledge graph (KG) over multiple documents with nodes symbolizing\npassages or document structures (e.g., pages/tables), and edges denoting the\nsemantic/lexical similarity between passages or intra-document structural\nrelations. 
For graph traversal, we design an LM-guided graph traverser that\nnavigates across nodes and gathers supporting passages assisting LLMs in MD-QA.\nThe constructed graph serves as the global ruler that regulates the\ntransitional space among passages and reduces retrieval latency. Concurrently,\nthe LM-guided traverser acts as a local navigator that gathers pertinent\ncontext to progressively approach the question and guarantee retrieval quality.\nExtensive experiments underscore the efficacy of KGP for MD-QA, signifying the\npotential of leveraging graphs in enhancing the prompt design for LLMs. Our\ncode is at https://github.com/YuWVandy/KG-LLM-MDQA.\n","authors":["Yu Wang","Nedim Lipka","Ryan A. Rossi","Alexa Siu","Ruiyi Zhang","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.11730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09662v2","updated":"2023-08-22T18:18:38Z","published":"2023-08-18T16:27:04Z","title":"Red-Teaming Large Language Models using Chain of Utterances for\n Safety-Alignment","summary":" Larger language models (LLMs) have taken the world by storm with their\nmassive multi-tasking capabilities simply by optimizing over a next-word\nprediction objective. With the emergence of their properties and encoded\nknowledge, the risk of LLMs producing harmful outputs increases, making them\nunfit for scalable deployment for the public. In this work, we propose a new\nsafety evaluation benchmark RED-EVAL that carries out red-teaming. We show that\neven widely deployed models are susceptible to the Chain of Utterances-based\n(CoU) prompting, jailbreaking closed-source LLM-based systems such as GPT-4 and\nChatGPT to unethically respond to more than 65% and 73% of harmful queries. We\nalso demonstrate the consistency of the RED-EVAL across 8 open-source LLMs in\ngenerating harmful responses in more than 86% of the red-teaming attempts.\nNext, we propose RED-INSTRUCT, an approach for the safety alignment of LLMs. It\nconstitutes two phases: 1) HARMFULQA data collection: Leveraging CoU prompting,\nwe collect a dataset that consists of 1.9K harmful questions covering a wide\nrange of topics, 9.5K safe and 7.3K harmful conversations from ChatGPT; 2)\nSAFE-ALIGN: We demonstrate how the conversational dataset can be used for the\nsafety alignment of LLMs by minimizing the negative log-likelihood over helpful\nresponses and penalizing over harmful responses by gradient ascent over sample\nloss. Our model STARLING, a fine-tuned Vicuna-7B, is observed to be more safely\naligned when evaluated on RED-EVAL and HHH benchmarks while preserving the\nutility of the baseline models (TruthfulQA, MMLU, and BBH).\n","authors":["Rishabh Bhardwaj","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2308.09662v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11696v1","updated":"2023-08-22T17:59:30Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking (of Language Models)","summary":" The increasing versatility of language models (LMs) has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs reaching\nthousands of GPU hours per model. However, the efficiency aspect of these\nevaluation efforts has raised little discussion in the literature. In this work,\nwe present the problem of Efficient Benchmarking, namely intelligently reducing\nthe computation costs of LM evaluation without compromising reliability. 
Using\nthe HELM benchmark as a test case, we investigate how different benchmark design\nchoices affect the computation-reliability tradeoff. We propose to evaluate the\nreliability of such decisions by using a new measure, Decision Impact on\nReliability (DIoR for short). We find, for example, that the current leader on HELM\nmay change by merely removing a low-ranked model from the benchmark, and observe\nthat a handful of examples suffice to obtain the correct benchmark ranking.\nConversely, a slightly different choice of HELM scenarios varies the ranking widely.\nBased on our findings, we outline a set of concrete recommendations for more\nefficient benchmark design and utilization practices, leading to dramatic cost\nsavings with minimal loss of benchmark reliability, often reducing computation\nby 100x or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.11342v5","updated":"2023-08-22T17:02:42Z","published":"2021-06-21T18:19:46Z","title":"Dive into Deep Learning","summary":" This open-source book represents our attempt to make deep learning\napproachable, teaching readers the concepts, the context, and the code. The\nentire book is drafted in Jupyter notebooks, seamlessly integrating exposition\nfigures, math, and interactive examples with self-contained code. Our goal is\nto offer a resource that could (i) be freely available for everyone; (ii) offer\nsufficient technical depth to provide a starting point on the path to actually\nbecoming an applied machine learning scientist; (iii) include runnable code,\nshowing readers how to solve problems in practice; (iv) allow for rapid\nupdates, both by us and also by the community at large; (v) be complemented by\na forum for interactive discussion of technical details and to answer\nquestions.\n","authors":["Aston Zhang","Zachary C. Lipton","Mu Li","Alexander J. Smola"],"pdf_url":"https://arxiv.org/pdf/2106.11342v5.pdf","comment":"(HTML) https://D2L.ai (GitHub) https://github.com/d2l-ai/d2l-en/"},{"id":"http://arxiv.org/abs/2308.11683v1","updated":"2023-08-22T15:09:55Z","published":"2023-08-22T15:09:55Z","title":"Learning to generate and corr- uh I mean repair language in real-time","summary":" In conversation, speakers produce language incrementally, word by word, while\ncontinuously monitoring the appropriateness of their own contribution in the\ndynamically unfolding context of the conversation; and this often leads them to\nrepair their own utterance on the fly. This real-time language processing\ncapacity is furthermore crucial to the development of fluent and natural\nconversational AI. In this paper, we use a previously learned Dynamic Syntax\ngrammar and the CHILDES corpus to develop, train and evaluate a probabilistic\nmodel for incremental generation where input to the model is a purely semantic\ngeneration goal concept in Type Theory with Records (TTR). We show that the\nmodel's output exactly matches the gold candidate in 78% of cases with a\nROUGE-l score of 0.86. We further do a zero-shot evaluation of the ability of\nthe same model to generate self-repairs when the generation goal changes\nmid-utterance. Automatic evaluation shows that the model can generate\nself-repairs correctly in 85% of cases. A small human evaluation confirms the\nnaturalness and grammaticality of the generated self-repairs. 
Overall, these\nresults further highlight the generalisation power of grammar-based models and\nlay the foundations for more controllable, and naturally interactive\nconversational AI systems.\n","authors":["Arash Eshghi","Arash Ashrafzadeh"],"pdf_url":"https://arxiv.org/pdf/2308.11683v1.pdf","comment":"Proceedings of the workshop on the Semantics and Pragmatics of\n Dialogue, SemDial, Maribor, Slovenia (2023)"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2308.11617v1","updated":"2023-08-22T17:59:51Z","published":"2023-08-22T17:59:51Z","title":"GRIP: Generating Interaction Poses Using Latent Consistency and Spatial\n Cues","summary":" Hands are dexterous and highly versatile manipulators that are central to how\nhumans interact with objects and their environment. Consequently, modeling\nrealistic hand-object interactions, including the subtle motion of individual\nfingers, is critical for applications in computer graphics, computer vision,\nand mixed reality. Prior work on capturing and modeling humans interacting with\nobjects in 3D focuses on the body and object motion, often ignoring hand pose.\nIn contrast, we introduce GRIP, a learning-based method that takes, as input,\nthe 3D motion of the body and the object, and synthesizes realistic motion for\nboth hands before, during, and after object interaction. As a preliminary step\nbefore synthesizing the hand motion, we first use a network, ANet, to denoise\nthe arm motion. Then, we leverage the spatio-temporal relationship between the\nbody and the object to extract two types of novel temporal interaction cues,\nand use them in a two-stage inference pipeline to generate the hand motion. In\nthe first stage, we introduce a new approach to enforce motion temporal\nconsistency in the latent space (LTC), and generate consistent interaction\nmotions. In the second stage, GRIP generates refined hand poses to avoid\nhand-object penetrations. Given sequences of noisy body and object motion, GRIP\nupgrades them to include hand-object interaction. Quantitative experiments and\nperceptual studies demonstrate that GRIP outperforms baseline methods and\ngeneralizes to unseen objects and motions from different motion-capture\ndatasets.\n","authors":["Omid Taheri","Yi Zhou","Dimitrios Tzionas","Yang Zhou","Duygu Ceylan","Soren Pirk","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2308.11617v1.pdf","comment":"The project has been started during Omid Taheri's internship at Adobe\n and as a collaboration with the Max Planck Institute for Intelligent Systems"},{"id":"http://arxiv.org/abs/2308.11607v1","updated":"2023-08-22T17:53:58Z","published":"2023-08-22T17:53:58Z","title":"Delving into Motion-Aware Matching for Monocular 3D Object Tracking","summary":" Recent advances of monocular 3D object detection facilitate the 3D\nmulti-object tracking task based on low-cost camera sensors. In this paper, we\nfind that the motion cue of objects along different time frames is critical in\n3D multi-object tracking, which is less explored in existing monocular-based\napproaches. In this paper, we propose a motion-aware framework for monocular 3D\nMOT. To this end, we propose MoMA-M3T, a framework that mainly consists of\nthree motion-aware components. First, we represent the possible movement of an\nobject related to all object tracklets in the feature space as its motion\nfeatures. Then, we further model the historical object tracklet along the time\nframe in a spatial-temporal perspective via a motion transformer. 
Finally, we\npropose a motion-aware matching module to associate historical object tracklets\nand current observations as final tracking results. We conduct extensive\nexperiments on the nuScenes and KITTI datasets to demonstrate that our MoMA-M3T\nachieves competitive performance against state-of-the-art methods. Moreover,\nthe proposed tracker is flexible and can be easily plugged into existing\nimage-based 3D object detectors without re-training. Code and models are\navailable at https://github.com/kuanchihhuang/MoMA-M3T.\n","authors":["Kuan-Chih Huang","Ming-Hsuan Yang","Yi-Hsuan Tsai"],"pdf_url":"https://arxiv.org/pdf/2308.11607v1.pdf","comment":"Accepted by ICCV 2023. Code is available at\n https://github.com/kuanchihhuang/MoMA-M3T"},{"id":"http://arxiv.org/abs/2308.11606v1","updated":"2023-08-22T17:53:55Z","published":"2023-08-22T17:53:55Z","title":"StoryBench: A Multifaceted Benchmark for Continuous Story Visualization","summary":" Generating video stories from text prompts is a complex task. In addition to\nhaving high visual quality, videos need to realistically adhere to a sequence\nof text prompts whilst being consistent throughout the frames. Creating a\nbenchmark for video generation requires data annotated over time, which\ncontrasts with the single caption used often in video datasets. To fill this\ngap, we collect comprehensive human annotations on three existing datasets, and\nintroduce StoryBench: a new, challenging multi-task benchmark to reliably\nevaluate forthcoming text-to-video models. Our benchmark includes three video\ngeneration tasks of increasing difficulty: action execution, where the next\naction must be generated starting from a conditioning video; story\ncontinuation, where a sequence of actions must be executed starting from a\nconditioning video; and story generation, where a video must be generated from\nonly text prompts. We evaluate small yet strong text-to-video baselines, and\nshow the benefits of training on story-like data algorithmically generated from\nexisting video captions. Finally, we establish guidelines for human evaluation\nof video stories, and reaffirm the need of better automatic metrics for video\ngeneration. StoryBench aims at encouraging future research efforts in this\nexciting new area.\n","authors":["Emanuele Bugliarello","Hernan Moraldo","Ruben Villegas","Mohammad Babaeizadeh","Mohammad Taghi Saffar","Han Zhang","Dumitru Erhan","Vittorio Ferrari","Pieter-Jan Kindermans","Paul Voigtlaender"],"pdf_url":"https://arxiv.org/pdf/2308.11606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11605v1","updated":"2023-08-22T17:53:26Z","published":"2023-08-22T17:53:26Z","title":"GOPro: Generate and Optimize Prompts in CLIP using Self-Supervised\n Learning","summary":" Large-scale foundation models, such as CLIP, have demonstrated remarkable\nsuccess in visual recognition tasks by embedding images in a semantically rich\nspace. Self-supervised learning (SSL) has also shown promise in improving\nvisual recognition by learning invariant features. However, the combination of\nCLIP with SSL is found to face challenges due to the multi-task framework that\nblends CLIP's contrastive loss and SSL's loss, including difficulties with loss\nweighting and inconsistency among different views of images in CLIP's output\nspace. 
To overcome these challenges, we propose a prompt learning-based model\ncalled GOPro, which is a unified framework that ensures similarity between\nvarious augmented views of input images in a shared image-text embedding space,\nusing a pair of learnable image and text projectors atop CLIP, to promote\ninvariance and generalizability. To automatically learn such prompts, we\nleverage the visual content and style primitives extracted from pre-trained\nCLIP and adapt them to the target task. In addition to CLIP's cross-domain\ncontrastive loss, we introduce a visual contrastive loss and a novel prompt\nconsistency loss, considering the different views of the images. GOPro is\ntrained end-to-end on all three loss objectives, combining the strengths of\nCLIP and SSL in a principled manner. Empirical evaluations demonstrate that\nGOPro outperforms the state-of-the-art prompting techniques on three\nchallenging domain generalization tasks across multiple benchmarks by a\nsignificant margin. Our code is available at\nhttps://github.com/mainaksingha01/GOPro.\n","authors":["Mainak Singha","Ankit Jha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2308.11605v1.pdf","comment":"Accepted at BMVC 2023"},{"id":"http://arxiv.org/abs/2201.12944v5","updated":"2023-08-22T17:50:41Z","published":"2022-01-31T00:39:37Z","title":"Deep Learning Approaches on Image Captioning: A Review","summary":" Image captioning is a research area of immense importance, aiming to generate\nnatural language descriptions for visual content in the form of still images.\nThe advent of deep learning and more recently vision-language pre-training\ntechniques has revolutionized the field, leading to more sophisticated methods\nand improved performance. In this survey paper, we provide a structured review\nof deep learning methods in image captioning by presenting a comprehensive\ntaxonomy and discussing each method category in detail. Additionally, we\nexamine the datasets commonly employed in image captioning research, as well as\nthe evaluation metrics used to assess the performance of different captioning\nmodels. We address the challenges faced in this field by emphasizing issues\nsuch as object hallucination, missing context, illumination conditions,\ncontextual understanding, and referring expressions. We rank different deep\nlearning methods' performance according to widely used evaluation metrics,\ngiving insight into the current state of the art. 
Furthermore, we identify\nseveral potential future directions for research in this area, which include\ntackling the information misalignment problem between image and text\nmodalities, mitigating dataset bias, incorporating vision-language pre-training\nmethods to enhance caption generation, and developing improved evaluation tools\nto accurately measure the quality of image captions.\n","authors":["Taraneh Ghandi","Hamidreza Pourreza","Hamidreza Mahyar"],"pdf_url":"https://arxiv.org/pdf/2201.12944v5.pdf","comment":"41 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.14436v3","updated":"2023-08-22T17:31:49Z","published":"2023-07-26T18:13:16Z","title":"Phenotype-preserving metric design for high-content image reconstruction\n by generative inpainting","summary":" In the past decades, automated high-content microscopy demonstrated its\nability to deliver large quantities of image-based data powering the\nversatility of phenotypic drug screening and systems biology applications.\nHowever, as the sizes of image-based datasets grew, it became infeasible for\nhumans to control, avoid and overcome the presence of imaging and sample\npreparation artefacts in the images. While novel techniques like machine\nlearning and deep learning may address these shortcomings through generative\nimage inpainting, when applied to sensitive research data this may come at the\ncost of undesired image manipulation. Undesired manipulation may be caused by\nphenomena such as neural hallucinations, to which some artificial neural\nnetworks are prone. To address this, here we evaluate the state-of-the-art\ninpainting methods for image restoration in a high-content fluorescence\nmicroscopy dataset of cultured cells with labelled nuclei. We show that\narchitectures like DeepFill V2 and Edge Connect can faithfully restore\nmicroscopy images upon fine-tuning with relatively little data. Our results\ndemonstrate that the area of the region to be restored is of higher importance\nthan shape. Furthermore, to control for the quality of restoration, we propose\na novel phenotype-preserving metric design strategy. In this strategy, the size\nand count of the restored biological phenotypes like cell nuclei are quantified\nto penalise undesirable manipulation. We argue that the design principles of\nour approach may also generalise to other applications.\n","authors":["Vaibhav Sharma","Artur Yakimovich"],"pdf_url":"https://arxiv.org/pdf/2307.14436v3.pdf","comment":"8 pages, 3 figures, conference proceedings"},{"id":"http://arxiv.org/abs/2308.11573v1","updated":"2023-08-22T17:23:00Z","published":"2023-08-22T17:23:00Z","title":"G3Reg: Pyramid Graph-based Global Registration using Gaussian Ellipsoid\n Model","summary":" This study introduces a novel framework, G3Reg, for fast and robust global\nregistration of LiDAR point clouds. In contrast to conventional complex\nkeypoints and descriptors, we extract fundamental geometric primitives\nincluding planes, clusters, and lines (PCL) from the raw point cloud to obtain\nlow-level semantic segments. Each segment is formulated as a unified Gaussian\nEllipsoid Model (GEM) by employing a probability ellipsoid to ensure the ground\ntruth centers are encompassed with a certain degree of probability. Utilizing\nthese GEMs, we then present a distrust-and-verify scheme based on a Pyramid\nCompatibility Graph for Global Registration (PAGOR). 
Specifically, we establish\nan upper bound, which can be traversed based on the confidence level for\ncompatibility testing to construct the pyramid graph. Gradually, we solve\nmultiple maximum cliques (MAC) for each level of the graph, generating numerous\ntransformation candidates. In the verification phase, we adopt a precise and\nefficient metric for point cloud alignment quality, founded on geometric\nprimitives, to identify the optimal candidate. The performance of the algorithm\nis extensively validated on three publicly available datasets and a\nself-collected multi-session dataset, without changing any parameter settings\nin the experimental evaluation. The results exhibit superior robustness and\nreal-time performance of the G3Reg framework compared to state-of-the-art\nmethods. Furthermore, we demonstrate the potential for integrating individual\nGEM and PAGOR components into other algorithmic frameworks to enhance their\nefficacy. To advance further research and promote community understanding, we\nhave publicly shared the source code.\n","authors":["Zhijian Qiao","Zehuan Yu","Binqian Jiang","Huan Yin","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2308.11573v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2307.09642v2","updated":"2023-08-22T17:19:34Z","published":"2023-07-18T21:10:59Z","title":"Skin Lesion Correspondence Localization in Total Body Photography","summary":" Longitudinal tracking of skin lesions - finding correspondence, changes in\nmorphology, and texture - is beneficial to the early detection of melanoma.\nHowever, it has not been well investigated in the context of full-body imaging.\nWe propose a novel framework combining geometric and texture information to\nlocalize skin lesion correspondence from a source scan to a target scan in\ntotal body photography (TBP). Body landmarks or sparse correspondence are first\ncreated on the source and target 3D textured meshes. Every vertex on each of\nthe meshes is then mapped to a feature vector characterizing the geodesic\ndistances to the landmarks on that mesh. Then, for each lesion of interest\n(LOI) on the source, its corresponding location on the target is first coarsely\nestimated using the geometric information encoded in the feature vectors and\nthen refined using the texture information. We evaluated the framework\nquantitatively on both a public and a private dataset, for which our success\nrates (at 10 mm criterion) are comparable to the only reported longitudinal\nstudy. As full-body 3D capture becomes more prevalent and has higher quality,\nwe expect the proposed method to constitute a valuable step in the longitudinal\ntracking of skin lesions.\n","authors":["Wei-Lun Huang","Davood Tashayyod","Jun Kang","Amir Gandjbakhche","Michael Kazhdan","Mehran Armand"],"pdf_url":"https://arxiv.org/pdf/2307.09642v2.pdf","comment":"MICCAI-2023"},{"id":"http://arxiv.org/abs/2308.11568v1","updated":"2023-08-22T17:14:19Z","published":"2023-08-22T17:14:19Z","title":"SPANet: Frequency-balancing Token Mixer using Spectral Pooling\n Aggregation Modulation","summary":" Recent studies show that self-attentions behave like low-pass filters (as\nopposed to convolutions) and enhancing their high-pass filtering capability\nimproves model performance. Contrary to this idea, we investigate existing\nconvolution-based models with spectral analysis and observe that improving the\nlow-pass filtering in convolution operations also leads to performance\nimprovement. 
To account for this observation, we hypothesize that utilizing\noptimal token mixers that capture balanced representations of both high- and\nlow-frequency components can enhance the performance of models. We verify this\nby decomposing visual features into the frequency domain and combining them in\na balanced manner. To handle this, we replace the balancing problem with a mask\nfiltering problem in the frequency domain. Then, we introduce a novel\ntoken-mixer named SPAM and leverage it to derive a MetaFormer model termed as\nSPANet. Experimental results show that the proposed method provides a way to\nachieve this balance, and the balanced representations of both high- and\nlow-frequency components can improve the performance of models on multiple\ncomputer vision tasks. Our code is available at\n$\\href{https://doranlyong.github.io/projects/spanet/}{\\text{https://doranlyong.github.io/projects/spanet/}}$.\n","authors":["Guhnoo Yun","Juhan Yoo","Kijung Kim","Jeongho Lee","Dong Hwan Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11568v1.pdf","comment":"Accepted paper at ICCV 2023"},{"id":"http://arxiv.org/abs/2305.13495v2","updated":"2023-08-22T16:49:32Z","published":"2023-05-22T21:25:27Z","title":"Type-to-Track: Retrieve Any Object via Prompt-based Tracking","summary":" One of the recent trends in vision problems is to use natural language\ncaptions to describe the objects of interest. This approach can overcome some\nlimitations of traditional methods that rely on bounding boxes or category\nannotations. This paper introduces a novel paradigm for Multiple Object\nTracking called Type-to-Track, which allows users to track objects in videos by\ntyping natural language descriptions. We present a new dataset for that\nGrounded Multiple Object Tracking task, called GroOT, that contains videos with\nvarious types of objects and their corresponding textual captions describing\ntheir appearance and action in detail. Additionally, we introduce two new\nevaluation protocols and formulate evaluation metrics specifically for this\ntask. We develop a new efficient method that models a transformer-based\neMbed-ENcoDE-extRact framework (MENDER) using the third-order tensor\ndecomposition. The experiments in five scenarios show that our MENDER approach\noutperforms another two-stage design in terms of accuracy and efficiency, up to\n14.7% accuracy and 4$\\times$ speed faster.\n","authors":["Pha Nguyen","Kha Gia Quach","Kris Kitani","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2305.13495v2.pdf","comment":"23 pages, 9 tables, 8 figures"},{"id":"http://arxiv.org/abs/2307.10123v3","updated":"2023-08-22T16:48:58Z","published":"2023-07-19T16:42:52Z","title":"Two Approaches to Supervised Image Segmentation","summary":" Though performed almost effortlessly by humans, segmenting 2D gray-scale or\ncolor images into respective regions of interest (e.g.~background, objects, or\nportions of objects) constitutes one of the greatest challenges in science and\ntechnology as a consequence of several effects including dimensionality\nreduction(3D to 2D), noise, reflections, shades, and occlusions, among many\nother possibilities. While a large number of interesting related approaches\nhave been suggested along the last decades, it was mainly thanks to the recent\ndevelopment of deep learning that more effective and general solutions have\nbeen obtained, currently constituting the basic comparison reference for this\ntype of operation. 
Also developed recently, a multiset-based methodology has\nbeen described that is capable of encouraging image segmentation performance\ncombining spatial accuracy, stability, and robustness while requiring little\ncomputational resources (hardware and/or training and recognition time). The\ninteresting features of the multiset neurons methodology mostly follow from the\nenhanced selectivity and sensitivity, as well as good robustness to data\nperturbations and outliers, allowed by the coincidence similarity index on\nwhich the multiset approach to supervised image segmentation is founded. After\ndescribing the deep learning and multiset neurons approaches, the present work\ndevelops comparison experiments between them which are primarily aimed at\nillustrating their respective main interesting features when applied to the\nadopted specific type of data and parameter configurations. While the deep\nlearning approach confirmed its potential for performing image segmentation,\nthe alternative multiset methodology allowed for enhanced accuracy while\nrequiring little computational resources.\n","authors":["Alexandre Benatti","Luciano da F. Costa"],"pdf_url":"https://arxiv.org/pdf/2307.10123v3.pdf","comment":"38 pages, 19 figures"},{"id":"http://arxiv.org/abs/2308.11562v1","updated":"2023-08-22T16:45:58Z","published":"2023-08-22T16:45:58Z","title":"EndoNet: model for automatic calculation of H-score on histological\n slides","summary":" H-score is a semi-quantitative method used to assess the presence and\ndistribution of proteins in tissue samples by combining the intensity of\nstaining and percentage of stained nuclei. It is widely used but time-consuming\nand can be limited in accuracy and precision. Computer-aided methods may help\novercome these limitations and improve the efficiency of pathologists'\nworkflows. In this work, we developed a model EndoNet for automatic calculation\nof H-score on histological slides. Our proposed method uses neural networks and\nconsists of two main parts. The first is a detection model which predicts\nkeypoints of centers of nuclei. The second is a H-score module which calculates\nthe value of the H-score using mean pixel values of predicted keypoints. Our\nmodel was trained and validated on 1780 annotated tiles with a shape of 100x100\n$\\mu m$ and performed 0.77 mAP on a test dataset. Moreover, the model can be\nadjusted to a specific specialist or whole laboratory to reproduce the manner\nof calculating the H-score. Thus, EndoNet is effective and robust in the\nanalysis of histology slides, which can improve and significantly accelerate\nthe work of pathologists.\n","authors":["Egor Ushakov","Anton Naumov","Vladislav Fomberg","Polina Vishnyakova","Aleksandra Asaturova","Alina Badlaeva","Anna Tregubova","Evgeny Karpulevich","Gennady Sukhikh","Timur Fatkhudinov"],"pdf_url":"https://arxiv.org/pdf/2308.11562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11561v1","updated":"2023-08-22T16:45:35Z","published":"2023-08-22T16:45:35Z","title":"Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog\n Navigation","summary":" This report details the method of the winning entry of the AVDN Challenge in\nICCV 2023. The competition addresses the Aerial Navigation from Dialog History\n(ANDH) task, which requires a drone agent to associate dialog history with\naerial observations to reach the destination. For better cross-modal grounding\nabilities of the drone agent, we propose a Target-Grounded Graph-Aware\nTransformer (TG-GAT) framework. 
Concretely, TG-GAT first leverages a\ngraph-aware transformer to capture spatiotemporal dependency, which is\nbeneficial for navigation state tracking and robust action planning. In addition, an auxiliary visual\ngrounding task is devised to boost the agent's awareness of referred landmarks.\nMoreover, a hybrid augmentation strategy based on large language models is\nutilized to mitigate data scarcity limitations. Our TG-GAT framework won the\nAVDN Challenge 2023, with 2.2% and 3.0% absolute improvements over the baseline\non SPL and SR metrics, respectively. The code is available at\nhttps://github.com/yifeisu/avdn-challenge.\n","authors":["Yifei Su","Dong An","Yuan Xu","Kehan Chen","Yan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11557v1","updated":"2023-08-22T16:37:51Z","published":"2023-08-22T16:37:51Z","title":"Open Set Synthetic Image Source Attribution","summary":" AI-generated images have become increasingly realistic and have garnered\nsignificant public attention. While synthetic images are intriguing due to\ntheir realism, they also pose an important misinformation threat. To address\nthis new threat, researchers have developed multiple algorithms to detect\nsynthetic images and identify their source generators. However, most existing\nsource attribution techniques are designed to operate in a closed-set scenario,\ni.e. they can only be used to discriminate between known image generators. By\ncontrast, new image-generation techniques are rapidly emerging. To contend with\nthis, there is a great need for open-set source attribution techniques that can\nidentify when synthetic images have originated from new, unseen generators. To\naddress this problem, we propose a new metric learning-based approach. Our\ntechnique works by learning transferrable embeddings capable of discriminating\nbetween generators, even when they are not seen during training. An image is\nfirst assigned to a candidate generator, then is accepted or rejected based on\nits distance in the embedding space from known generators' learned reference\npoints. Importantly, we identify that initializing our source attribution\nembedding network by pretraining it on image camera identification can improve\nour embeddings' transferability. Through a series of experiments, we\ndemonstrate our approach's ability to attribute the source of synthetic images\nin open-set scenarios.\n","authors":["Shengbang Fang","Tai D. Nguyen","Matthew C. Stamm"],"pdf_url":"https://arxiv.org/pdf/2308.11557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11551v1","updated":"2023-08-22T16:32:46Z","published":"2023-08-22T16:32:46Z","title":"Multi-event Video-Text Retrieval","summary":" Video-Text Retrieval (VTR) is a crucial multi-modal task in an era of massive\nvideo-text data on the Internet. A plethora of work characterized by using a\ntwo-stream Vision-Language model architecture that learns a joint\nrepresentation of video-text pairs has become a prominent approach for the VTR\ntask. However, these models operate under the assumption of bijective\nvideo-text correspondences and neglect a more practical scenario where video\ncontent usually encompasses multiple events, while texts like user queries or\nwebpage metadata tend to be specific and correspond to single events. 
This\nestablishes a gap between the previous training objective and real-world\napplications, leading to the potential performance degradation of earlier\nmodels during inference. In this study, we introduce the Multi-event Video-Text\nRetrieval (MeVTR) task, addressing scenarios in which each video contains\nmultiple different events, as a niche scenario of the conventional Video-Text\nRetrieval Task. We present a simple model, Me-Retriever, which incorporates key\nevent video representation and a new MeVTR loss for the MeVTR task.\nComprehensive experiments show that this straightforward framework outperforms\nother models in the Video-to-Text and Text-to-Video tasks, effectively\nestablishing a robust baseline for the MeVTR task. We believe this work serves\nas a strong foundation for future studies. Code is available at\nhttps://github.com/gengyuanmax/MeVTR.\n","authors":["Gengyuan Zhang","Jisen Ren","Jindong Gu","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2308.11551v1.pdf","comment":"accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2210.11549v3","updated":"2023-08-22T16:15:26Z","published":"2022-10-20T19:31:23Z","title":"H4VDM: H.264 Video Device Matching","summary":" Methods that can determine if two given video sequences are captured by the\nsame device (e.g., mobile telephone or digital camera) can be used in many\nforensics tasks. In this paper we refer to this as \"video device matching\". In\nopen-set video forensics scenarios it is easier to determine if two video\nsequences were captured with the same device than identifying the specific\ndevice. In this paper, we propose a technique for open-set video device\nmatching. Given two H.264 compressed video sequences, our method can determine\nif they are captured by the same device, even if our method has never\nencountered the device in training. We denote our proposed technique as H.264\nVideo Device Matching (H4VDM). H4VDM uses H.264 compression information\nextracted from video sequences to make decisions. It is more robust against\nartifacts that alter camera sensor fingerprints, and it can be used to analyze\nrelatively small fragments of the H.264 sequence. We trained and tested our\nmethod on a publicly available video forensics dataset consisting of 35\ndevices, where our proposed method demonstrated good performance.\n","authors":["Ziyue Xiang","Paolo Bestagini","Stefano Tubaro","Edward J. Delp"],"pdf_url":"https://arxiv.org/pdf/2210.11549v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07597v3","updated":"2023-08-22T16:14:53Z","published":"2023-04-15T17:05:24Z","title":"An Instance Segmentation Dataset of Yeast Cells in Microstructures","summary":" Extracting single-cell information from microscopy data requires accurate\ninstance-wise segmentations. Obtaining pixel-wise segmentations from microscopy\nimagery remains a challenging task, especially with the added complexity of\nmicrostructured environments. This paper presents a novel dataset for\nsegmenting yeast cells in microstructures. We offer pixel-wise instance\nsegmentation labels for both cells and trap microstructures. In total, we\nrelease 493 densely annotated microscopy images. To facilitate a unified\ncomparison between novel segmentation algorithms, we propose a standardized\nevaluation strategy for our dataset. 
The aim of the dataset and evaluation\nstrategy is to facilitate the development of new cell segmentation approaches.\nThe dataset is publicly available at\nhttps://christophreich1996.github.io/yeast_in_microstructures_dataset/ .\n","authors":["Christoph Reich","Tim Prangemeier","André O. Françani","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2304.07597v3.pdf","comment":"IEEE EMBC 2023 (in press), Christoph Reich and Tim Prangemeier - both\n authors contributed equally"},{"id":"http://arxiv.org/abs/2305.03881v2","updated":"2023-08-22T16:09:59Z","published":"2023-05-06T00:24:44Z","title":"Fairness in Image Search: A Study of Occupational Stereotyping in Image\n Retrieval and its Debiasing","summary":" Multi-modal search engines have experienced significant growth and widespread\nuse in recent years, making them the second most common internet use. While\nsearch engine systems offer a range of services, the image search field has\nrecently become a focal point in the information retrieval community, as the\nadage goes, \"a picture is worth a thousand words\". Although popular search\nengines like Google excel at image search accuracy and agility, there is an\nongoing debate over whether their search results can be biased in terms of\ngender, language, demographics, socio-cultural aspects, and stereotypes. This\npotential for bias can have a significant impact on individuals' perceptions\nand influence their perspectives.\n In this paper, we present our study on bias and fairness in web search, with\na focus on keyword-based image search. We first discuss several kinds of biases\nthat exist in search systems and why it is important to mitigate them. We\nnarrow down our study to assessing and mitigating occupational stereotypes in\nimage search, which is a prevalent fairness issue in image retrieval. For the\nassessment of stereotypes, we take gender as an indicator. We explore various\nopen-source and proprietary APIs for gender identification from images. With\nthese, we examine the extent of gender bias in top-ranked image search results\nobtained for several occupational keywords. To mitigate the bias, we then\npropose a fairness-aware re-ranking algorithm that optimizes (a) relevance of\nthe search result with the keyword and (b) fairness w.r.t. genders identified.\nWe experiment on 100 top-ranked images obtained for 10 occupational keywords\nand consider random re-ranking and re-ranking based on relevance as baselines.\nOur experimental results show that the fairness-aware re-ranking algorithm\nproduces rankings with better fairness scores and competitive relevance scores\nthan the baselines.\n","authors":["Swagatika Dash"],"pdf_url":"https://arxiv.org/pdf/2305.03881v2.pdf","comment":"20 Pages, Work uses Proprietary Search Systems from the year 2021"},{"id":"http://arxiv.org/abs/2308.10236v2","updated":"2023-08-22T16:09:09Z","published":"2023-08-20T11:49:12Z","title":"FedSIS: Federated Split Learning with Intermediate Representation\n Sampling for Privacy-preserving Generalized Face Presentation Attack\n Detection","summary":" Lack of generalization to unseen domains/attacks is the Achilles heel of most\nface presentation attack detection (FacePAD) algorithms. Existing attempts to\nenhance the generalizability of FacePAD solutions assume that data from\nmultiple source domains are available with a single entity to enable\ncentralized training. 
In practice, data from different source domains may be\ncollected by diverse entities, who are often unable to share their data due to\nlegal and privacy constraints. While collaborative learning paradigms such as\nfederated learning (FL) can overcome this problem, standard FL methods are\nill-suited for domain generalization because they struggle to surmount the twin\nchallenges of handling non-iid client data distributions during training and\ngeneralizing to unseen domains during inference. In this work, a novel\nframework called Federated Split learning with Intermediate representation\nSampling (FedSIS) is introduced for privacy-preserving domain generalization.\nIn FedSIS, a hybrid Vision Transformer (ViT) architecture is learned using a\ncombination of FL and split learning to achieve robustness against statistical\nheterogeneity in the client data distributions without any sharing of raw data\n(thereby preserving privacy). To further improve generalization to unseen\ndomains, a novel feature augmentation strategy called intermediate\nrepresentation sampling is employed, and discriminative information from\nintermediate blocks of a ViT is distilled using a shared adapter network. The\nFedSIS approach has been evaluated on two well-known benchmarks for\ncross-domain FacePAD to demonstrate that it is possible to achieve\nstate-of-the-art generalization performance without data sharing. Code:\nhttps://github.com/Naiftt/FedSIS\n","authors":["Naif Alkhunaizi","Koushik Srivatsan","Faris Almalik","Ibrahim Almakky","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2308.10236v2.pdf","comment":"Accepted to the IEEE International Joint Conference on Biometrics\n (IJCB), 2023"},{"id":"http://arxiv.org/abs/2207.09280v5","updated":"2023-08-22T15:46:12Z","published":"2022-07-19T13:49:30Z","title":"Exploiting Inter-Sample Affinity for Knowability-Aware Universal Domain\n Adaptation","summary":" Universal domain adaptation (UniDA) aims to transfer the knowledge of common\nclasses from the source domain to the target domain without any prior knowledge\non the label set, which requires distinguishing in the target domain the\nunknown samples from the known ones. Recent methods usually focused on\ncategorizing a target sample into one of the source classes rather than\ndistinguishing known and unknown samples, which ignores the inter-sample\naffinity between known and unknown samples and may lead to suboptimal\nperformance. Aiming at this issue, we propose a novel UDA framework where such\ninter-sample affinity is exploited. Specifically, we introduce a\nknowability-based labeling scheme which can be divided into two steps: 1)\nKnowability-guided detection of known and unknown samples based on the\nintrinsic structure of the neighborhoods of samples, where we leverage the\nfirst singular vectors of the affinity matrices to obtain the knowability of\nevery target sample. 2) Label refinement based on neighborhood consistency to\nrelabel the target samples, where we refine the labels of each target sample\nbased on its neighborhood consistency of predictions. Then, auxiliary losses\nbased on the two steps are used to reduce the inter-sample affinity between the\nunknown and the known target samples. Finally, experiments on four public\ndatasets demonstrate that our method significantly outperforms existing\nstate-of-the-art methods.\n","authors":["Yifan Wang","Lin Zhang","Ran Song","Hongliang Li","Paul L. 
Rosin","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2207.09280v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11513v1","updated":"2023-08-22T15:40:03Z","published":"2023-08-22T15:40:03Z","title":"TrackFlow: Multi-Object Tracking with Normalizing Flows","summary":" The field of multi-object tracking has recently seen a renewed interest in\nthe good old schema of tracking-by-detection, as its simplicity and strong\npriors spare it from the complex design and painful babysitting of\ntracking-by-attention approaches. In view of this, we aim at extending\ntracking-by-detection to multi-modal settings, where a comprehensive cost has\nto be computed from heterogeneous information e.g., 2D motion cues, visual\nappearance, and pose estimates. More precisely, we follow a case study where a\nrough estimate of 3D information is also available and must be merged with\nother traditional metrics (e.g., the IoU). To achieve that, recent approaches\nresort to either simple rules or complex heuristics to balance the contribution\nof each cost. However, i) they require careful tuning of tailored\nhyperparameters on a hold-out set, and ii) they imply these costs to be\nindependent, which does not hold in reality. We address these issues by\nbuilding upon an elegant probabilistic formulation, which considers the cost of\na candidate association as the negative log-likelihood yielded by a deep\ndensity estimator, trained to model the conditional joint probability\ndistribution of correct associations. Our experiments, conducted on both\nsimulated and real benchmarks, show that our approach consistently enhances the\nperformance of several tracking-by-detection algorithms.\n","authors":["Gianluca Mancusi","Aniello Panariello","Angelo Porrello","Matteo Fabbri","Simone Calderara","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2308.11513v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11509v1","updated":"2023-08-22T15:38:39Z","published":"2023-08-22T15:38:39Z","title":"SwinFace: A Multi-task Transformer for Face Recognition, Expression\n Recognition, Age Estimation and Attribute Estimation","summary":" In recent years, vision transformers have been introduced into face\nrecognition and analysis and have achieved performance breakthroughs. However,\nmost previous methods generally train a single model or an ensemble of models\nto perform the desired task, which ignores the synergy among different tasks\nand fails to achieve improved prediction accuracy, increased data efficiency,\nand reduced training time. This paper presents a multi-purpose algorithm for\nsimultaneous face recognition, facial expression recognition, age estimation,\nand face attribute estimation (40 attributes including gender) based on a\nsingle Swin Transformer. Our design, the SwinFace, consists of a single shared\nbackbone together with a subnet for each set of related tasks. To address the\nconflicts among multiple tasks and meet the different demands of tasks, a\nMulti-Level Channel Attention (MLCA) module is integrated into each\ntask-specific analysis subnet, which can adaptively select the features from\noptimal levels and channels to perform the desired tasks. Extensive experiments\nshow that the proposed model has a better understanding of the face and\nachieves excellent performance for all tasks. Especially, it achieves 90.97%\naccuracy on RAF-DB and 0.22 $\\epsilon$-error on CLAP2015, which are\nstate-of-the-art results on facial expression recognition and age estimation\nrespectively. 
The code and models will be made publicly available at\nhttps://github.com/lxq1000/SwinFace.\n","authors":["Lixiong Qin","Mei Wang","Chao Deng","Ke Wang","Xi Chen","Jiani Hu","Weihong Deng"],"pdf_url":"https://arxiv.org/pdf/2308.11509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11507v1","updated":"2023-08-22T15:28:49Z","published":"2023-08-22T15:28:49Z","title":"Unsupervised Prototype Adapter for Vision-Language Models","summary":" Recently, large-scale pre-trained vision-language models (e.g. CLIP and\nALIGN) have demonstrated remarkable effectiveness in acquiring transferable\nvisual representations. To leverage the valuable knowledge encoded within these\nmodels for downstream tasks, several fine-tuning approaches, including prompt\ntuning methods and adapter-based methods, have been developed to adapt\nvision-language models effectively with supervision. However, these methods\nrely on the availability of annotated samples, which can be labor-intensive and\ntime-consuming to acquire, thus limiting scalability. To address this issue, in\nthis work, we design an unsupervised fine-tuning approach for vision-language\nmodels called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for\nthe unannotated target datasets, we leverage the text-image aligning capability\nof CLIP to automatically select the most confident samples for each class.\nUtilizing these selected samples, we generate class prototypes, which serve as\nthe initialization for the learnable prototype model. After fine-tuning, the\nprototype model prediction is combined with the original CLIP's prediction by a\nresidual connection to perform downstream recognition tasks. Our extensive\nexperimental results on image recognition and domain generalization show that\nthe proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter,\nand also the state-of-the-art UPL method by large margins.\n","authors":["Yi Zhang","Ce Zhang","Xueting Hu","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2308.11507v1.pdf","comment":"Accepted by PRCV 2023"},{"id":"http://arxiv.org/abs/2308.11506v1","updated":"2023-08-22T15:27:52Z","published":"2023-08-22T15:27:52Z","title":"LCCo: Lending CLIP to Co-Segmentation","summary":" This paper studies co-segmenting the common semantic object in a set of\nimages. Existing works either rely on carefully engineered networks to mine the\nimplicit semantic information in visual features or require extra data (i.e.,\nclassification labels) for training. In this paper, we leverage the contrastive\nlanguage-image pre-training framework (CLIP) for the task. With a backbone\nsegmentation network that independently processes each image from the set, we\nintroduce semantics from CLIP into the backbone features, refining them in a\ncoarse-to-fine manner with three key modules: i) an image set feature\ncorrespondence module, encoding global consistent semantic information of the\nimage set; ii) a CLIP interaction module, using CLIP-mined common semantics of\nthe image set to refine the backbone feature; iii) a CLIP regularization\nmodule, drawing CLIP towards this co-segmentation task, identifying the best\nCLIP semantic and using it to regularize the backbone feature. 
Experiments on\nfour standard co-segmentation benchmark datasets show that our method\noutperforms state-of-the-art methods.\n","authors":["Xin Duan","Yan Yang","Liyuan Pan","Xiabi Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11506v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03193v2","updated":"2023-08-22T15:23:07Z","published":"2023-04-06T16:17:28Z","title":"Improving automatic endoscopic stone recognition using a multi-view\n fusion approach enhanced with two-step transfer learning","summary":" This contribution presents a deep-learning method for extracting and fusing\nimage information acquired from different viewpoints, with the aim of producing\nmore discriminant object features for the identification of the type of kidney\nstones seen in endoscopic images. The model was further improved with a\ntwo-step transfer learning approach and by attention blocks to refine the\nlearned feature maps. Deep feature fusion strategies improved the results of\nsingle view extraction backbone models by more than 6% in terms of accuracy of\nthe kidney stones classification.\n","authors":["Francisco Lopez-Tiro","Elias Villalvazo-Avila","Juan Pablo Betancur-Rengifo","Ivan Reyes-Amezcua","Jacques Hubert","Gilberto Ochoa-Ruiz","Christian Daul"],"pdf_url":"https://arxiv.org/pdf/2304.03193v2.pdf","comment":"This paper has been accepted at the LatinX in Computer Vision (LXCV)\n Research workshop at ICCV 2023 (Paris, France)"},{"id":"http://arxiv.org/abs/2308.11489v1","updated":"2023-08-22T15:10:42Z","published":"2023-08-22T15:10:42Z","title":"Learning from Semantic Alignment between Unpaired Multiviews for\n Egocentric Video Recognition","summary":" We are concerned with a challenging scenario in unpaired multiview video\nlearning. In this case, the model aims to learn comprehensive multiview\nrepresentations while the cross-view semantic information exhibits variations.\nWe propose Semantics-based Unpaired Multiview Learning (SUM-L) to tackle this\nunpaired multiview learning problem. The key idea is to build cross-view\npseudo-pairs and do view-invariant alignment by leveraging the semantic\ninformation of videos. To facilitate the data efficiency of multiview learning,\nwe further perform video-text alignment for first-person and third-person\nvideos, to fully leverage the semantic knowledge to improve video\nrepresentations. Extensive experiments on multiple benchmark datasets verify\nthe effectiveness of our framework. Our method also outperforms multiple\nexisting view-alignment methods, under a more challenging scenario than\ntypical paired or unpaired multimodal or multiview learning. Our code is\navailable at https://github.com/wqtwjt1996/SUM-L.\n","authors":["Qitong Wang","Long Zhao","Liangzhe Yuan","Ting Liu","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2308.11489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11488v1","updated":"2023-08-22T15:08:02Z","published":"2023-08-22T15:08:02Z","title":"Opening the Vocabulary of Egocentric Actions","summary":" Human actions in egocentric videos are often hand-object interactions\ncomposed from a verb (performed by the hand) applied to an object. Despite\ntheir extensive scaling up, egocentric datasets still face two limitations -\nsparsity of action compositions and a closed set of interacting objects. This\npaper proposes a novel open vocabulary action recognition task. 
Given a set of\nverbs and objects observed during training, the goal is to generalize the verbs\nto an open vocabulary of actions with seen and novel objects. To this end, we\ndecouple the verb and object predictions via an object-agnostic verb encoder\nand a prompt-based object encoder. The prompting leverages CLIP representations\nto predict an open vocabulary of interacting objects. We create open vocabulary\nbenchmarks on the EPIC-KITCHENS-100 and Assembly101 datasets; whereas\nclosed-action methods fail to generalize, our proposed method is effective. In\naddition, our object encoder significantly outperforms existing open-vocabulary\nvisual recognition methods in recognizing novel interacting objects.\n","authors":["Dibyadip Chatterjee","Fadime Sener","Shugao Ma","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2308.11488v1.pdf","comment":"20 pages, 7 figures; https://dibschat.github.io/openvocab-egoAR/"},{"id":"http://arxiv.org/abs/2308.11487v1","updated":"2023-08-22T15:06:14Z","published":"2023-08-22T15:06:14Z","title":"Free Lunch for Gait Recognition: A Novel Relation Descriptor","summary":" Gait recognition is to seek correct matches for query individuals by their\nunique walking patterns at a long distance. However, current methods focus\nsolely on individual gait features, disregarding inter-personal relationships.\nIn this paper, we reconsider gait representation, asserting that gait is not\njust an aggregation of individual features, but also the relationships among\ndifferent subjects' gait features once reference gaits are established. From\nthis perspective, we redefine classifier weights as reference-anchored gaits,\nallowing each person's gait to be described by their relationship with these\nreferences. In our work, we call this novel descriptor Relationship Descriptor\n(RD). This Relationship Descriptor offers two benefits: emphasizing meaningful\nfeatures and enhancing robustness. To be specific, The normalized dot product\nbetween gait features and classifier weights signifies a similarity relation,\nwhere each dimension indicates the similarity between the test sample and each\ntraining ID's gait prototype, respectively. Despite its potential, the direct\nuse of relationship descriptors poses dimensionality challenges since the\ndimension of RD depends on the training set's identity count. To address this,\nwe propose a Farthest Anchored gaits Selection algorithm and a dimension\nreduction method to boost gait recognition performance. Our method can be built\non top of off-the-shelf pre-trained classification-based models without extra\nparameters. We show that RD achieves higher recognition performance than\ndirectly using extracted features. We evaluate the effectiveness of our method\non the popular GREW, Gait3D, CASIA-B, and OU-MVLP, showing that our method\nconsistently outperforms the baselines and achieves state-of-the-art\nperformances.\n","authors":["Jilong Wang","Saihui Hou","Yan Huang","Chunshui Cao","Xu Liu","Yongzhen Huang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11485v1","updated":"2023-08-22T15:03:16Z","published":"2023-08-22T15:03:16Z","title":"Composed Image Retrieval using Contrastive Learning and Task-oriented\n CLIP-based Features","summary":" Given a query composed of a reference image and a relative caption, the\nComposed Image Retrieval goal is to retrieve images visually similar to the\nreference one that integrates the modifications expressed by the caption. 
Given\nthat recent research has demonstrated the efficacy of large-scale vision and\nlanguage pre-trained (VLP) models in various tasks, we rely on features from\nthe OpenAI CLIP model to tackle the considered task. We initially perform a\ntask-oriented fine-tuning of both CLIP encoders using the element-wise sum of\nvisual and textual features. Then, in the second stage, we train a Combiner\nnetwork that learns to combine the image-text features integrating the bimodal\ninformation and providing combined features used to perform the retrieval. We\nuse contrastive learning in both stages of training. Starting from the bare\nCLIP features as a baseline, experimental results show that the task-oriented\nfine-tuning and the carefully crafted Combiner network are highly effective and\noutperform more complex state-of-the-art approaches on FashionIQ and CIRR, two\npopular and challenging datasets for composed image retrieval. Code and\npre-trained models are available at https://github.com/ABaldrati/CLIP4Cir\n","authors":["Alberto Baldrati","Marco Bertini","Tiberio Uricchio","Alberto del Bimbo"],"pdf_url":"https://arxiv.org/pdf/2308.11485v1.pdf","comment":"Accepted in ACM Transactions on Multimedia Computing Communications\n and Applications (TOMM)"},{"id":"http://arxiv.org/abs/2308.11484v1","updated":"2023-08-22T14:59:17Z","published":"2023-08-22T14:59:17Z","title":"Pose2Gait: Extracting Gait Features from Monocular Video of Individuals\n with Dementia","summary":" Video-based ambient monitoring of gait for older adults with dementia has the\npotential to detect negative changes in health and allow clinicians and\ncaregivers to intervene early to prevent falls or hospitalizations. Computer\nvision-based pose tracking models can process video data automatically and\nextract joint locations; however, publicly available models are not optimized\nfor gait analysis on older adults or clinical populations. In this work we\ntrain a deep neural network to map from a two dimensional pose sequence,\nextracted from a video of an individual walking down a hallway toward a\nwall-mounted camera, to a set of three-dimensional spatiotemporal gait features\naveraged over the walking sequence. The data of individuals with dementia used\nin this work was captured at two sites using a wall-mounted system to collect\nthe video and depth information used to train and evaluate our model. Our\nPose2Gait model is able to extract velocity and step length values from the\nvideo that are correlated with the features from the depth camera, with\nSpearman's correlation coefficients of .83 and .60 respectively, showing that\nthree dimensional spatiotemporal features can be predicted from monocular\nvideo. Future work remains to improve the accuracy of other features, such as\nstep time and step width, and test the utility of the predicted values for\ndetecting meaningful changes in gait during longitudinal ambient monitoring.\n","authors":["Caroline Malin-Mayor","Vida Adeli","Andrea Sabo","Sergey Noritsyn","Carolina Gorodetsky","Alfonso Fasano","Andrea Iaboni","Babak Taati"],"pdf_url":"https://arxiv.org/pdf/2308.11484v1.pdf","comment":"14 pages, 3 figures. Code is available at\n https://github.com/TaatiTeam/pose2gait_public . 
To be published at the\n Ambient Intelligence for Health Care Workshop at MICCAI 2023"},{"id":"http://arxiv.org/abs/2305.16376v2","updated":"2023-08-22T14:55:55Z","published":"2023-05-25T14:42:04Z","title":"Constrained Probabilistic Mask Learning for Task-specific Undersampled\n MRI Reconstruction","summary":" Undersampling is a common method in Magnetic Resonance Imaging (MRI) to\nsubsample the number of data points in k-space, reducing acquisition times at\nthe cost of decreased image quality. A popular approach is to employ\nundersampling patterns following various strategies, e.g., variable density\nsampling or radial trajectories. In this work, we propose a method that\ndirectly learns the undersampling masks from data points, thereby also\nproviding task- and domain-specific patterns. To solve the resulting discrete\noptimization problem, we propose a general optimization routine called ProM: A\nfully probabilistic, differentiable, versatile, and model-free framework for\nmask optimization that enforces acceleration factors through a convex\nconstraint. Analyzing knee, brain, and cardiac MRI datasets with our method, we\ndiscover that different anatomic regions reveal distinct optimal undersampling\nmasks, demonstrating the benefits of using custom masks, tailored for a\ndownstream task. For example, ProM can create undersampling masks that maximize\nperformance in downstream tasks like segmentation with networks trained on\nfully-sampled MRIs. Even with extreme acceleration factors, ProM yields\nreasonable performance while being more versatile than existing methods, paving\nthe way for data-driven all-purpose mask generation.\n","authors":["Tobias Weber","Michael Ingrisch","Bernd Bischl","David Rügamer"],"pdf_url":"https://arxiv.org/pdf/2305.16376v2.pdf","comment":"accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2308.11480v1","updated":"2023-08-22T14:52:44Z","published":"2023-08-22T14:52:44Z","title":"Expecting The Unexpected: Towards Broad Out-Of-Distribution Detection","summary":" Improving the reliability of deployed machine learning systems often involves\ndeveloping methods to detect out-of-distribution (OOD) inputs. However,\nexisting research often narrowly focuses on samples from classes that are\nabsent from the training set, neglecting other types of plausible distribution\nshifts. This limitation reduces the applicability of these methods in\nreal-world scenarios, where systems encounter a wide variety of anomalous\ninputs. In this study, we categorize five distinct types of distribution shifts\nand critically evaluate the performance of recent OOD detection methods on each\nof them. We publicly release our benchmark under the name BROAD (Benchmarking\nResilience Over Anomaly Diversity). Our findings reveal that while these\nmethods excel in detecting unknown classes, their performance is inconsistent\nwhen encountering other types of distribution shifts. In other words, they only\nreliably detect unexpected inputs that they have been specifically designed to\nexpect. As a first step toward broad OOD detection, we learn a generative model\nof existing detection scores with a Gaussian mixture. By doing so, we present\nan ensemble approach that offers a more consistent and comprehensive solution\nfor broad OOD detection, demonstrating superior performance compared to\nexisting methods. 
Our code to download BROAD and reproduce our experiments is\npublicly available.\n","authors":["Charles Guille-Escuret","Pierre-André Noël","Ioannis Mitliagkas","David Vazquez","Joao Monteiro"],"pdf_url":"https://arxiv.org/pdf/2308.11480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11473v1","updated":"2023-08-22T14:39:17Z","published":"2023-08-22T14:39:17Z","title":"IT3D: Improved Text-to-3D Generation with Explicit View Synthesis","summary":" Recent strides in Text-to-3D techniques have been propelled by distilling\nknowledge from powerful large text-to-image diffusion models (LDMs).\nNonetheless, existing Text-to-3D approaches often grapple with challenges such\nas over-saturation, inadequate detailing, and unrealistic outputs. This study\npresents a novel strategy that leverages explicitly synthesized multi-view\nimages to address these issues. Our approach involves the utilization of\nimage-to-image pipelines, empowered by LDMs, to generate posed high-quality\nimages based on the renderings of coarse 3D models. Although the generated\nimages mostly alleviate the aforementioned issues, challenges such as view\ninconsistency and significant content variance persist due to the inherent\ngenerative nature of large diffusion models, posing extensive difficulties in\nleveraging these images effectively. To overcome this hurdle, we advocate\nintegrating a discriminator alongside a novel Diffusion-GAN dual training\nstrategy to guide the training of 3D models. For the incorporated\ndiscriminator, the synthesized multi-view images are considered real data,\nwhile the renderings of the optimized 3D models function as fake data. We\nconduct a comprehensive set of experiments that demonstrate the effectiveness\nof our method over baseline approaches.\n","authors":["Yiwen Chen","Chi Zhang","Xiaofeng Yang","Zhongang Cai","Gang Yu","Lei Yang","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2308.11473v1.pdf","comment":"Project Page: https://github.com/buaacyw/IT3D-text-to-3D"},{"id":"http://arxiv.org/abs/2308.11471v1","updated":"2023-08-22T14:36:59Z","published":"2023-08-22T14:36:59Z","title":"Dynamic Open Vocabulary Enhanced Safe-landing with Intelligence\n (DOVESEI)","summary":" This work targets what we consider to be the foundational step for urban\nairborne robots, a safe landing. Our attention is directed toward what we deem\nthe most crucial aspect of the safe landing perception stack: segmentation. We\npresent a streamlined reactive UAV system that employs visual servoing by\nharnessing the capabilities of open vocabulary image segmentation. This\napproach can adapt to various scenarios with minimal adjustments, bypassing the\nnecessity for extensive data accumulation for refining internal models, thanks\nto its open vocabulary methodology. Given the limitations imposed by local\nauthorities, our primary focus centers on operations originating from altitudes\nof 100 meters. This choice is deliberate, as numerous preceding works have\ndealt with altitudes up to 30 meters, aligning with the capabilities of small\nstereo cameras. Consequently, we leave the remaining 20m to be navigated using\nconventional 3D path planning methods. Utilizing monocular cameras and image\nsegmentation, our findings demonstrate the system's capability to successfully\nexecute landing maneuvers at altitudes as low as 20 meters. However, this\napproach is vulnerable to intermittent and occasionally abrupt fluctuations in\nthe segmentation between frames in a video stream. 
To address this challenge,\nwe enhance the image segmentation output by introducing what we call a dynamic\nfocus: a masking mechanism that self adjusts according to the current landing\nstage. This dynamic focus guides the control system to avoid regions beyond the\ndrone's safety radius projected onto the ground, thus mitigating the problems\nwith fluctuations. Through the implementation of this supplementary layer, our\nexperiments have reached improvements in the landing success rate of almost\ntenfold when compared to global segmentation. All the source code is open\nsource and available online (github.com/MISTLab/DOVESEI).\n","authors":["Haechan Mark Bon","Rongge Zhang","Ricardo de Azambuja","Giovanni Beltrame"],"pdf_url":"https://arxiv.org/pdf/2308.11471v1.pdf","comment":"Submitted to IROS 2013 The Last-Mile Robotics Workshop"},{"id":"http://arxiv.org/abs/2308.11468v1","updated":"2023-08-22T14:29:19Z","published":"2023-08-22T14:29:19Z","title":"Multitemporal analysis in Google Earth Engine for detecting urban\n changes using optical data and machine learning algorithms","summary":" The aim of this work is to perform a multitemporal analysis using the Google\nEarth Engine (GEE) platform for the detection of changes in urban areas using\noptical data and specific machine learning (ML) algorithms. As a case study,\nCairo City has been identified, in Egypt country, as one of the five most\npopulous megacities of the last decade in the world. Classification and change\ndetection analysis of the region of interest (ROI) have been carried out from\nJuly 2013 to July 2021. Results demonstrate the validity of the proposed method\nin identifying changed and unchanged urban areas over the selected period.\nFurthermore, this work aims to evidence the growing significance of GEE as an\nefficient cloud-based solution for managing large quantities of satellite data.\n","authors":["Mariapia Rita Iandolo","Francesca Razzano","Chiara Zarro","G. S. Yogesh","Silvia Liberata Ullo"],"pdf_url":"https://arxiv.org/pdf/2308.11468v1.pdf","comment":"4 pages, 6 figures, 2023 InGARSS Conference"},{"id":"http://arxiv.org/abs/2305.03807v3","updated":"2023-08-22T14:26:18Z","published":"2023-05-05T19:20:29Z","title":"Evading Watermark based Detection of AI-Generated Content","summary":" A generative AI model can generate extremely realistic-looking content,\nposing growing challenges to the authenticity of information. To address the\nchallenges, watermark has been leveraged to detect AI-generated content.\nSpecifically, a watermark is embedded into an AI-generated content before it is\nreleased. A content is detected as AI-generated if a similar watermark can be\ndecoded from it. In this work, we perform a systematic study on the robustness\nof such watermark-based AI-generated content detection. We focus on\nAI-generated images. Our work shows that an attacker can post-process a\nwatermarked image via adding a small, human-imperceptible perturbation to it,\nsuch that the post-processed image evades detection while maintaining its\nvisual quality. We show the effectiveness of our attack both theoretically and\nempirically. Moreover, to evade detection, our adversarial post-processing\nmethod adds much smaller perturbations to AI-generated images and thus better\nmaintain their visual quality than existing popular post-processing methods\nsuch as JPEG compression, Gaussian blur, and Brightness/Contrast. 
Our work\nshows the insufficiency of existing watermark-based detection of AI-generated\ncontent, highlighting the urgent needs of new methods. Our code is publicly\navailable: https://github.com/zhengyuan-jiang/WEvade.\n","authors":["Zhengyuan Jiang","Jinghuai Zhang","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2305.03807v3.pdf","comment":"To appear in ACM Conference on Computer and Communications Security\n (CCS), 2023"},{"id":"http://arxiv.org/abs/2307.08908v2","updated":"2023-08-22T14:10:06Z","published":"2023-07-18T00:48:56Z","title":"What Can Simple Arithmetic Operations Do for Temporal Modeling?","summary":" Temporal modeling plays a crucial role in understanding video content. To\ntackle this problem, previous studies built complicated temporal relations\nthrough time sequence thanks to the development of computationally powerful\ndevices. In this work, we explore the potential of four simple arithmetic\noperations for temporal modeling. Specifically, we first capture auxiliary\ntemporal cues by computing addition, subtraction, multiplication, and division\nbetween pairs of extracted frame features. Then, we extract corresponding\nfeatures from these cues to benefit the original temporal-irrespective domain.\nWe term such a simple pipeline as an Arithmetic Temporal Module (ATM), which\noperates on the stem of a visual backbone with a plug-and-play style. We\nconduct comprehensive ablation studies on the instantiation of ATMs and\ndemonstrate that this module provides powerful temporal modeling capability at\na low computational cost. Moreover, the ATM is compatible with both CNNs- and\nViTs-based architectures. Our results show that ATM achieves superior\nperformance over several popular video benchmarks. Specifically, on\nSomething-Something V1, V2 and Kinetics-400, we reach top-1 accuracy of 65.6%,\n74.6%, and 89.4% respectively. The code is available at\nhttps://github.com/whwu95/ATM.\n","authors":["Wenhao Wu","Yuxin Song","Zhun Sun","Jingdong Wang","Chang Xu","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2307.08908v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00773v2","updated":"2023-08-22T14:09:43Z","published":"2023-08-01T18:26:55Z","title":"High-Fidelity Eye Animatable Neural Radiance Fields for Human Face","summary":" Face rendering using neural radiance fields (NeRF) is a rapidly developing\nresearch area in computer vision. While recent methods primarily focus on\ncontrolling facial attributes such as identity and expression, they often\noverlook the crucial aspect of modeling eyeball rotation, which holds\nimportance for various downstream tasks. In this paper, we aim to learn a face\nNeRF model that is sensitive to eye movements from multi-view images. We\naddress two key challenges in eye-aware face NeRF learning: how to effectively\ncapture eyeball rotation for training and how to construct a manifold for\nrepresenting eyeball rotation. To accomplish this, we first fit FLAME, a\nwell-established parametric face model, to the multi-view images considering\nmulti-view consistency. Subsequently, we introduce a new Dynamic Eye-aware NeRF\n(DeNeRF). DeNeRF transforms 3D points from different views into a canonical\nspace to learn a unified face NeRF model. We design an eye deformation field\nfor the transformation, including rigid transformation, e.g., eyeball rotation,\nand non-rigid transformation. 
Through experiments conducted on the ETH-XGaze\ndataset, we demonstrate that our model is capable of generating high-fidelity\nimages with accurate eyeball rotation and non-rigid periocular deformation,\neven under novel viewing angles. Furthermore, we show that utilizing the\nrendered images can effectively enhance gaze estimation performance.\n","authors":["Hengfei Wang","Zhongqun Zhang","Yihua Cheng","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.00773v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2308.10511v2","updated":"2023-08-22T14:08:20Z","published":"2023-08-21T06:51:58Z","title":"Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout\n Analysis","summary":" Understanding digital documents is like solving a puzzle, especially\nhistorical ones. Document Layout Analysis (DLA) helps with this puzzle by\ndividing documents into sections like paragraphs, images, and tables. This is\ncrucial for machines to read and understand these documents. In the DL Sprint\n2.0 competition, we worked on understanding Bangla documents. We used a dataset\ncalled BaDLAD with lots of examples. We trained a special model called Mask\nR-CNN to help with this understanding. We made this model better by\nstep-by-step hyperparameter tuning, and we achieved a good dice score of 0.889.\nHowever, not everything went perfectly. We tried using a model trained for\nEnglish documents, but it didn't fit well with Bangla. This showed us that each\nlanguage has its own challenges. Our solution for the DL Sprint 2.0 is publicly\navailable at https://www.kaggle.com/competitions/dlsprint2/discussion/432201\nalong with notebooks, weights, and an inference notebook.\n","authors":["Shrestha Datta","Md Adith Mollah","Raisa Fairooz","Tariful Islam Fahim"],"pdf_url":"https://arxiv.org/pdf/2308.10511v2.pdf","comment":"Contest paper, Contest: DL sprint 2.0 (Link:\n https://www.kaggle.com/competitions/dlsprint2), Solution link:\n https://www.kaggle.com/competitions/dlsprint2/discussion/432201"},{"id":"http://arxiv.org/abs/2308.11452v1","updated":"2023-08-22T13:59:47Z","published":"2023-08-22T13:59:47Z","title":"Food Image Classification and Segmentation with Attention-based Multiple\n Instance Learning","summary":" The demand for accurate food quantification has increased in recent\nyears, driven by the needs of applications in dietary monitoring. At the same\ntime, computer vision approaches have exhibited great potential in automating\ntasks within the food domain. Traditionally, the development of machine\nlearning models for these problems relies on training data sets with\npixel-level class annotations. However, this approach introduces challenges\narising from data collection and ground truth generation that quickly become\ncostly and error-prone since they must be performed in multiple settings and\nfor thousands of classes. To overcome these challenges, the paper presents a\nweakly supervised methodology for training food image classification and\nsemantic segmentation models without relying on pixel-level annotations. The\nproposed methodology is based on a multiple instance learning approach in\ncombination with an attention-based mechanism. At test time, the models are\nused for classification and, concurrently, the attention mechanism generates\nsemantic heat maps which are used for food class segmentation. 
In the paper, we\nconduct experiments on two meta-classes within the FoodSeg103 data set to\nverify the feasibility of the proposed approach and we explore the functioning\nproperties of the attention mechanism.\n","authors":["Valasia Vlachopoulou","Ioannis Sarafis","Alexandros Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2308.11452v1.pdf","comment":"Accepted for presentation at 18th International Workshop on Semantic\n and Social Media Adaptation & Personalization (SMAP 2023)"},{"id":"http://arxiv.org/abs/2308.11450v1","updated":"2023-08-22T13:58:45Z","published":"2023-08-22T13:58:45Z","title":"Towards Discriminative Representations with Contrastive Instances for\n Real-Time UAV Tracking","summary":" Maintaining high efficiency and high precision are two fundamental challenges\nin UAV tracking due to the constraints of computing resources, battery\ncapacity, and UAV maximum load. Discriminative correlation filters (DCF)-based\ntrackers can yield high efficiency on a single CPU but with inferior precision.\nLightweight Deep learning (DL)-based trackers can achieve a good balance\nbetween efficiency and precision but performance gains are limited by the\ncompression rate. High compression rate often leads to poor discriminative\nrepresentations. To this end, this paper aims to enhance the discriminative\npower of feature representations from a new feature-learning perspective.\nSpecifically, we attempt to learn more discriminative representations with\ncontrastive instances for UAV tracking in a simple yet effective manner, which\nnot only requires no manual annotations but also allows for developing and\ndeploying a lightweight model. We are the first to explore contrastive learning\nfor UAV tracking. Extensive experiments on four UAV benchmarks, including\nUAV123@10fps, DTB70, UAVDT and VisDrone2018, show that the proposed DRCI\ntracker significantly outperforms state-of-the-art UAV tracking methods.\n","authors":["Dan Zeng","Mingliang Zou","Xucheng Wang","Shuiwang Li"],"pdf_url":"https://arxiv.org/pdf/2308.11450v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2308.10262"},{"id":"http://arxiv.org/abs/2308.11448v1","updated":"2023-08-22T13:55:57Z","published":"2023-08-22T13:55:57Z","title":"Masked Momentum Contrastive Learning for Zero-shot Semantic\n Understanding","summary":" Self-supervised pretraining (SSP) has emerged as a popular technique in\nmachine learning, enabling the extraction of meaningful feature representations\nwithout labelled data. In the realm of computer vision, pretrained vision\ntransformers (ViTs) have played a pivotal role in advancing transfer learning.\nNonetheless, the escalating cost of finetuning these large models has posed a\nchallenge due to the explosion of model size. This study endeavours to evaluate\nthe effectiveness of pure self-supervised learning (SSL) techniques in computer\nvision tasks, obviating the need for finetuning, with the intention of\nemulating human-like capabilities in generalisation and recognition of unseen\nobjects. To this end, we propose an evaluation protocol for zero-shot\nsegmentation based on a prompting patch. Given a point on the target object as\na prompt, the algorithm calculates the similarity map between the selected\npatch and other patches; a simple thresholding is then applied to segment\nthe target. Another evaluation is intra-object and inter-object similarity to\ngauge the discriminatory ability of SSP ViTs. 
Insights from zero-shot segmentation\nfrom prompting and the discriminatory abilities of SSP led to the design of a\nsimple SSP approach, termed MMC. This approach combines Masked image\nmodelling for encouraging similarity of local features, Momentum based\nself-distillation for transferring semantics from global to local features, and\nglobal Contrast for promoting semantics of global features, to enhance\ndiscriminative representations of SSP ViTs. Consequently, our proposed method\nsignificantly reduces the overlap of intra-object and inter-object\nsimilarities, thereby facilitating effective object segmentation within an\nimage. Our experiments reveal that MMC delivers top-tier results in zero-shot\nsemantic segmentation across various datasets.\n","authors":["Jiantao Wu","Shentong Mo","Muhammad Awais","Sara Atito","Zhenhua Feng","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2308.11448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11443v1","updated":"2023-08-22T13:50:49Z","published":"2023-08-22T13:50:49Z","title":"Revisiting and Exploring Efficient Fast Adversarial Training via LAW:\n Lipschitz Regularization and Auto Weight Averaging","summary":" Fast Adversarial Training (FAT) not only improves the model robustness but\nalso reduces the training cost of standard adversarial training. However, fast\nadversarial training often suffers from Catastrophic Overfitting (CO), which\nresults in poor robustness performance. Catastrophic Overfitting describes the\nphenomenon of a sudden and significant decrease in robust accuracy during\nfast adversarial training. Many effective techniques have been\ndeveloped to prevent Catastrophic Overfitting and improve the model robustness\nfrom different perspectives. However, these techniques adopt inconsistent\ntraining settings and require different training costs, i.e., training time and\nmemory costs, leading to unfair comparisons. In this paper, we conduct a\ncomprehensive study of over 10 fast adversarial training methods in terms of\nadversarial robustness and training costs. We revisit the effectiveness and\nefficiency of fast adversarial training techniques in preventing Catastrophic\nOverfitting from the perspective of model local nonlinearity and propose an\neffective Lipschitz regularization method for fast adversarial training.\nFurthermore, we explore the effect of data augmentation and weight averaging in\nfast adversarial training and propose a simple yet effective auto weight\naveraging method to improve robustness further. By assembling these techniques,\nwe propose an FGSM-based fast adversarial training method equipped with\nLipschitz regularization and Auto Weight averaging, abbreviated as FGSM-LAW.\nExperimental evaluations on four benchmark databases demonstrate the\nsuperiority of the proposed method over state-of-the-art fast adversarial\ntraining methods and the advanced standard adversarial training methods.\n","authors":["Xiaojun Jia","Yuefeng Chen","Xiaofeng Mao","Ranjie Duan","Jindong Gu","Rong Zhang","Hui Xue","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2308.11443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11442v1","updated":"2023-08-22T13:46:12Z","published":"2023-08-22T13:46:12Z","title":"SDeMorph: Towards Better Facial De-morphing from Single Morph","summary":" Face Recognition Systems (FRS) are vulnerable to morph attacks. A face morph\nis created by combining multiple identities with the intention to fool FRS by\nmaking it match the morph with multiple identities. 
Current Morph Attack\nDetection (MAD) methods can detect the morph but are unable to recover the identities\nused to create the morph with satisfactory outcomes. Existing work in\nde-morphing is mostly reference-based, i.e., it requires the availability of\none identity to recover the other. Sudipta et al. \\cite{ref9} proposed a\nreference-free de-morphing technique but the visual realism of the produced outputs\nwas weak. In this work, we propose SDeMorph (Stably Diffused De-morpher), a\nnovel de-morphing method that is reference-free and recovers the identities of\nbona fides. Our method produces feature-rich outputs that are of significantly\nhigh quality in terms of definition and facial fidelity. Our method utilizes\nDenoising Diffusion Probabilistic Models (DDPM) by destroying the input morphed\nsignal and then reconstructing it using a branched-UNet. Experiments on\nASML, FRLL-FaceMorph, FRLL-MorDIFF, and SMDD datasets support the effectiveness\nof the proposed method.\n","authors":["Nitish Shukla"],"pdf_url":"https://arxiv.org/pdf/2308.11442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11441v1","updated":"2023-08-22T13:45:35Z","published":"2023-08-22T13:45:35Z","title":"Learning a More Continuous Zero Level Set in Unsigned Distance Fields\n through Level Set Projection","summary":" The latest methods represent shapes with open surfaces using unsigned distance\nfunctions (UDFs). They train neural networks to learn UDFs and reconstruct\nsurfaces with the gradients around the zero level set of the UDF. However, the\ndifferential networks struggle to learn the zero level set where the UDF\nis not differentiable, which leads to large errors on unsigned distances and\ngradients around the zero level set, resulting in highly fragmented and\ndiscontinuous surfaces. To resolve this problem, we propose to learn a more\ncontinuous zero level set in UDFs with level set projections. Our insight is to\nguide the learning of the zero level set using the remaining non-zero level sets via a\nprojection procedure. Our idea is inspired by the observation that the\nnon-zero level sets are much smoother and more continuous than the zero level\nset. We pull the non-zero level sets onto the zero level set with gradient\nconstraints which align gradients over different level sets and correct\nunsigned distance errors on the zero level set, leading to a smoother and more\ncontinuous unsigned distance field. We conduct comprehensive experiments in\nsurface reconstruction for point clouds, real scans or depth maps, and further\nexplore the performance in unsupervised point cloud upsampling and unsupervised\npoint normal estimation with the learned UDF, which demonstrate our non-trivial\nimprovements over the state-of-the-art methods. Code is available at\nhttps://github.com/junshengzhou/LevelSetUDF .\n","authors":["Junsheng Zhou","Baorui Ma","Shujuan Li","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2308.11441v1.pdf","comment":"To appear at ICCV2023. Code is available at\n https://github.com/junshengzhou/LevelSetUDF"},{"id":"http://arxiv.org/abs/2308.11440v1","updated":"2023-08-22T13:42:15Z","published":"2023-08-22T13:42:15Z","title":"PoseGraphNet++: Enriching 3D Human Pose with Orientation Estimation","summary":" Existing kinematic skeleton-based 3D human pose estimation methods only\npredict joint positions. Although this is sufficient to compute the yaw and\npitch of the bone rotations, the roll around the axis of the bones remains\nunresolved by these methods. 
In this paper, we propose a novel 2D-to-3D lifting\nGraph Convolution Network named PoseGraphNet++ to predict the complete human\npose including the joint positions and the bone orientations. We employ node\nand edge convolutions to utilize the joint and bone features. Our model is\nevaluated on multiple benchmark datasets, and its performance is either on par\nwith or better than the state-of-the-art in terms of both position and rotation\nmetrics. Through extensive ablation studies, we show that PoseGraphNet++\nbenefits from exploiting the mutual relationship between the joints and the\nbones.\n","authors":["Soubarna Banik","Edvard Avagyan","Alejandro Mendoza Gracia","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2308.11440v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.11421v1","updated":"2023-08-22T13:08:29Z","published":"2023-08-22T13:08:29Z","title":"TurboViT: Generating Fast Vision Transformers via Generative\n Architecture Search","summary":" Vision transformers have shown unprecedented levels of performance in\ntackling various visual perception tasks in recent years. However, the\narchitectural and computational complexity of such network architectures has\nmade them challenging to deploy in real-world applications with\nhigh-throughput, low-memory requirements. As such, there has been significant\nresearch recently on the design of efficient vision transformer architectures.\nIn this study, we explore the generation of fast vision transformer\narchitecture designs via generative architecture search (GAS) to achieve a\nstrong balance between accuracy and architectural and computational efficiency.\nThrough this generative architecture search process, we create TurboViT, a\nhighly efficient hierarchical vision transformer architecture design that is\ngenerated around mask unit attention and Q-pooling design patterns. The\nresulting TurboViT architecture design achieves significantly lower\narchitectural complexity (>2.47$\\times$ smaller than FasterViT-0\nwhile achieving the same accuracy) and computational complexity (>3.4$\\times$ fewer\nFLOPs and 0.9% higher accuracy than MobileViT2-2.0) when compared to 10 other\nstate-of-the-art efficient vision transformer network architecture designs\nwithin a similar range of accuracy on the ImageNet-1K dataset. Furthermore,\nTurboViT demonstrates strong inference latency and throughput in both\nlow-latency and batch processing scenarios (>3.21$\\times$ lower latency and\n>3.18$\\times$ higher throughput compared to FasterViT-0 for the low-latency\nscenario). These promising results demonstrate the efficacy of leveraging\ngenerative architecture search for generating efficient transformer\narchitecture designs for high-throughput scenarios.\n","authors":["Alexander Wong","Saad Abbasi","Saeejith Nair"],"pdf_url":"https://arxiv.org/pdf/2308.11421v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.11417v1","updated":"2023-08-22T13:02:23Z","published":"2023-08-22T13:02:23Z","title":"ScanNet++: A High-Fidelity Dataset of 3D Indoor Scenes","summary":" We present ScanNet++, a large-scale dataset that couples together capture of\nhigh-quality and commodity-level geometry and color of indoor scenes. Each\nscene is captured with a high-end laser scanner at sub-millimeter resolution,\nalong with registered 33-megapixel images from a DSLR camera, and RGB-D streams\nfrom an iPhone. 
Scene reconstructions are further annotated with an open\nvocabulary of semantics, with label-ambiguous scenarios explicitly annotated\nfor comprehensive semantic understanding. ScanNet++ enables a new real-world\nbenchmark for novel view synthesis, both from high-quality RGB capture, and\nimportantly also from commodity-level images, in addition to a new benchmark\nfor 3D semantic scene understanding that comprehensively encapsulates diverse\nand ambiguous semantic labeling scenarios. Currently, ScanNet++ contains 460\nscenes, 280,000 captured DSLR images, and over 3.7M iPhone RGBD frames.\n","authors":["Chandan Yeshwanth","Yueh-Cheng Liu","Matthias Nießner","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2308.11417v1.pdf","comment":"ICCV 2023. Video: https://youtu.be/E6P9e2r6M8I , Project page:\n https://cy94.github.io/scannetpp/"},{"id":"http://arxiv.org/abs/2308.11408v1","updated":"2023-08-22T12:54:48Z","published":"2023-08-22T12:54:48Z","title":"MatFuse: Controllable Material Generation with Diffusion Models","summary":" Creating high quality and realistic materials in computer graphics is a\nchallenging and time-consuming task, which requires great expertise. In this\npaper, we present MatFuse, a novel unified approach that harnesses the\ngenerative power of diffusion models (DM) to simplify the creation of SVBRDF\nmaps. Our DM-based pipeline integrates multiple sources of conditioning, such\nas color palettes, sketches, and pictures, enabling fine-grained control and\nflexibility in material synthesis. This design allows for the combination of\ndiverse information sources (e.g., sketch + image embedding), enhancing\ncreative possibilities in line with the principle of compositionality. We\ndemonstrate the generative capabilities of the proposed method under various\nconditioning settings; on the SVBRDF estimation task, we show that our method\nyields performance comparable to state-of-the-art approaches, both\nqualitatively and quantitatively.\n","authors":["Giuseppe Vecchio","Renato Sortino","Simone Palazzo","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2308.11408v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06668v2","updated":"2023-08-22T12:53:56Z","published":"2023-04-13T16:57:02Z","title":"DynaMITe: Dynamic Query Bootstrapping for Multi-object Interactive\n Segmentation Transformer","summary":" Most state-of-the-art instance segmentation methods rely on large amounts of\npixel-precise ground-truth annotations for training, which are expensive to\ncreate. Interactive segmentation networks help generate such annotations based\non an image and the corresponding user interactions such as clicks. Existing\nmethods for this task can only process a single instance at a time and each\nuser interaction requires a full forward pass through the entire deep network.\nWe introduce a more efficient approach, called DynaMITe, in which we represent\nuser interactions as spatio-temporal queries to a Transformer decoder with a\npotential to segment multiple object instances in a single iteration. Our\narchitecture also alleviates any need to re-compute image features during\nrefinement, and requires fewer interactions for segmenting multiple instances\nin a single image when compared to other methods. 
DynaMITe achieves\nstate-of-the-art results on multiple existing interactive segmentation\nbenchmarks, and also on the new multi-instance benchmark that we propose in\nthis paper.\n","authors":["Amit Kumar Rana","Sabarinath Mahadevan","Alexander Hermans","Bastian Leibe"],"pdf_url":"https://arxiv.org/pdf/2304.06668v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11389v1","updated":"2023-08-22T12:28:09Z","published":"2023-08-22T12:28:09Z","title":"Non-Redundant Combination of Hand-Crafted and Deep Learning Radiomics:\n Application to the Early Detection of Pancreatic Cancer","summary":" We address the problem of learning Deep Learning Radiomics (DLR) that are not\nredundant with Hand-Crafted Radiomics (HCR). To do so, we extract DLR features\nusing a VAE while enforcing their independence with HCR features by minimizing\ntheir mutual information. The resulting DLR features can be combined with\nhand-crafted ones and leveraged by a classifier to predict early markers of\ncancer. We illustrate our method on four early markers of pancreatic cancer and\nvalidate it on a large independent test set. Our results highlight the value of\ncombining non-redundant DLR and HCR features, as evidenced by an improvement in\nthe Area Under the Curve compared to baseline methods that do not address\nredundancy or solely rely on HCR features.\n","authors":["Rebeca Vétil","Clément Abi-Nader","Alexandre Bône","Marie-Pierre Vullierme","Marc-Michel Rohé","Pietro Gori","Isabelle Bloch"],"pdf_url":"https://arxiv.org/pdf/2308.11389v1.pdf","comment":"CaPTion workshop MICCAI 2023"},{"id":"http://arxiv.org/abs/2303.14420v2","updated":"2023-08-22T12:26:07Z","published":"2023-03-25T10:09:03Z","title":"Human Preference Score: Better Aligning Text-to-Image Models with Human\n Preference","summary":" Recent years have witnessed a rapid growth of deep generative models, with\ntext-to-image models gaining significant attention from the public. However,\nexisting models often generate images that do not align well with human\npreferences, such as awkward combinations of limbs and facial expressions. To\naddress this issue, we collect a dataset of human choices on generated images\nfrom the Stable Foundation Discord channel. Our experiments demonstrate that\ncurrent evaluation metrics for generative models do not correlate well with\nhuman choices. Thus, we train a human preference classifier with the collected\ndataset and derive a Human Preference Score (HPS) based on the classifier.\nUsing HPS, we propose a simple yet effective method to adapt Stable Diffusion\nto better align with human preferences. Our experiments show that HPS\noutperforms CLIP in predicting human choices and has good generalization\ncapability toward images generated from other models. By tuning Stable\nDiffusion with the guidance of HPS, the adapted model is able to generate\nimages that are more preferred by human users. The project page is available\nhere: https://tgxs002.github.io/align_sd_web/ .\n","authors":["Xiaoshi Wu","Keqiang Sun","Feng Zhu","Rui Zhao","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2303.14420v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11386v1","updated":"2023-08-22T12:25:49Z","published":"2023-08-22T12:25:49Z","title":"Targeted Data Augmentation for bias mitigation","summary":" The development of fair and ethical AI systems requires careful consideration\nof bias mitigation, an area often overlooked or ignored. 
In this study, we\nintroduce a novel and efficient approach for addressing biases called Targeted\nData Augmentation (TDA), which leverages classical data augmentation techniques\nto tackle the pressing issue of bias in data and models. Unlike the laborious\ntask of removing biases, our method proposes to insert biases instead,\nresulting in improved performance. To identify biases, we annotated two diverse\ndatasets: a dataset of clinical skin lesions and a dataset of male and female\nfaces. These bias annotations are published for the first time in this study,\nproviding a valuable resource for future research. Through Counterfactual Bias\nInsertion, we discovered that biases associated with the frame, ruler, and\nglasses had a significant impact on models. By randomly introducing biases\nduring training, we mitigated these biases and achieved a substantial decrease\nin bias measures, ranging from two-fold to more than 50-fold, while maintaining\na negligible increase in the error rate.\n","authors":["Agnieszka Mikołajczyk-Bareła","Maria Ferlin","Michał Grochowski"],"pdf_url":"https://arxiv.org/pdf/2308.11386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09139v2","updated":"2023-08-22T12:17:15Z","published":"2023-08-17T18:12:05Z","title":"The Unreasonable Effectiveness of Large Language-Vision Models for\n Source-free Video Domain Adaptation","summary":" The Source-Free Video Unsupervised Domain Adaptation (SFVUDA) task consists of\nadapting an action recognition model, trained on a labelled source dataset, to\nan unlabelled target dataset, without accessing the actual source data. Previous approaches have attempted to address SFVUDA by leveraging\nself-supervision (e.g., enforcing temporal consistency) derived from the target\ndata itself. In this work, we take an orthogonal approach by exploiting\n\"web-supervision\" from Large Language-Vision Models (LLVMs), driven by the\nrationale that LLVMs contain a rich world prior surprisingly robust to\ndomain-shift. We showcase the unreasonable effectiveness of integrating LLVMs\nfor SFVUDA by devising an intuitive and parameter-efficient method, which we\nname Domain Adaptation with Large Language-Vision models (DALL-V), that\ndistills the world prior and complementary source model information into a\nstudent network tailored for the target. Despite its simplicity, DALL-V\nachieves significant improvements over state-of-the-art SFVUDA methods.\n","authors":["Giacomo Zara","Alessandro Conti","Subhankar Roy","Stéphane Lathuilière","Paolo Rota","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2308.09139v2.pdf","comment":"Accepted at ICCV2023, 14 pages, 7 figures, code is available at\n https://github.com/giaczara/dallv"},{"id":"http://arxiv.org/abs/2308.11381v1","updated":"2023-08-22T12:12:59Z","published":"2023-08-22T12:12:59Z","title":"DALNet: A Rail Detection Network Based on Dynamic Anchor Line","summary":" Rail detection is one of the key factors for intelligent trains. In this paper,\nmotivated by anchor line-based lane detection methods, we propose a rail\ndetection network called DALNet based on a dynamic anchor line. Aiming to solve\nthe problem that the predefined anchor line is image agnostic, we design a\nnovel dynamic anchor line mechanism. It utilizes a dynamic anchor line\ngenerator to dynamically generate an appropriate anchor line for each rail\ninstance based on the position and shape of the rails in the input image. 
These\ndynamically generated anchor lines can be considered as better position\nreferences to accurately localize the rails than the predefined anchor lines.\nIn addition, we present a challenging urban rail detection dataset DL-Rail with\nhigh-quality annotations and scenario diversity. DL-Rail contains 7000 pairs of\nimages and annotations along with scene tags, and it is expected to encourage\nthe development of rail detection. We extensively compare DALNet with many\ncompetitive lane methods. The results show that our DALNet achieves\nstate-of-the-art performance on our DL-Rail rail detection dataset and the\npopular Tusimple and LLAMAS lane detection benchmarks. The code will be\nreleased at \\url{https://github.com/Yzichen/mmLaneDet}.\n","authors":["Zichen Yu","Quanli Liu","Wei Wang","Liyong Zhang","Xiaoguang Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12577v2","updated":"2023-08-22T12:02:31Z","published":"2023-05-21T21:54:31Z","title":"GMD: Controllable Human Motion Synthesis via Guided Diffusion Models","summary":" Denoising diffusion models have shown great promise in human motion synthesis\nconditioned on natural language descriptions. However, integrating spatial\nconstraints, such as pre-defined motion trajectories and obstacles, remains a\nchallenge despite being essential for bridging the gap between isolated human\nmotion and its surrounding environment. To address this issue, we propose\nGuided Motion Diffusion (GMD), a method that incorporates spatial constraints\ninto the motion generation process. Specifically, we propose an effective\nfeature projection scheme that manipulates motion representation to enhance the\ncoherency between spatial information and local poses. Together with a new\nimputation formulation, the generated motion can reliably conform to spatial\nconstraints such as global motion trajectories. Furthermore, given sparse\nspatial constraints (e.g. sparse keyframes), we introduce a new dense guidance\napproach to turn a sparse signal, which is susceptible to being ignored during\nthe reverse steps, into denser signals to guide the generated motion to the\ngiven constraints. Our extensive experiments justify the development of GMD,\nwhich achieves a significant improvement over state-of-the-art methods in\ntext-based motion generation while allowing control of the synthesized motions\nwith spatial constraints.\n","authors":["Korrawe Karunratanakul","Konpat Preechakul","Supasorn Suwajanakorn","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2305.12577v2.pdf","comment":"ICCV23. Project page: https://korrawe.github.io/gmd-project/"},{"id":"http://arxiv.org/abs/2308.11376v1","updated":"2023-08-22T12:02:05Z","published":"2023-08-22T12:02:05Z","title":"Boundary-RL: Reinforcement Learning for Weakly-Supervised Prostate\n Segmentation in TRUS Images","summary":" We propose Boundary-RL, a novel weakly supervised segmentation method that\nutilises only patch-level labels for training. We envision the segmentation as\na boundary detection problem, rather than a pixel-level classification as in\nprevious works. This outlook on segmentation may allow for boundary delineation\nunder challenging scenarios such as where noise artefacts may be present within\nthe region-of-interest (ROI) boundaries, where traditional pixel-level\nclassification-based weakly supervised methods may not be able to effectively\nsegment the ROI. 
Ultrasound images, where intensity\nvalues represent acoustic impedance differences between boundaries, are of particular interest and may also\nbenefit from the boundary delineation approach. Our method uses reinforcement\nlearning to train a controller function to localise boundaries of ROIs using a\nreward derived from a pre-trained boundary-presence classifier. The classifier\nindicates when an object boundary is encountered within a patch, as the\ncontroller modifies the patch location in a sequential Markov decision process.\nThe classifier itself is trained using only binary patch-level labels of object\npresence, which are the only labels used during training of the entire boundary\ndelineation framework, and serves as a weak signal to inform the boundary\ndelineation. The use of a controller function ensures that a sliding window\nover the entire image is not necessary. It also prevents possible\nfalse-positive or -negative cases by minimising the number of patches passed to the\nboundary-presence classifier. We evaluate our proposed approach for the\nclinically relevant task of prostate gland segmentation on trans-rectal\nultrasound images. We show improved performance compared to other tested weakly\nsupervised methods using the same labels, e.g., multiple instance learning.\n","authors":["Weixi Yi","Vasilis Stavrinides","Zachary M. C. Baum","Qianye Yang","Dean C. Barratt","Matthew J. Clarkson","Yipeng Hu","Shaheer U. Saeed"],"pdf_url":"https://arxiv.org/pdf/2308.11376v1.pdf","comment":"Accepted to MICCAI Workshop MLMI 2023 (14th International Conference\n on Machine Learning in Medical Imaging)"},{"id":"http://arxiv.org/abs/2303.17895v3","updated":"2023-08-22T11:56:19Z","published":"2023-03-31T08:56:29Z","title":"EA-LSS: Edge-aware Lift-splat-shot Framework for 3D BEV Object Detection","summary":" In recent years, great progress has been made in Lift-Splat-Shot-based\n(LSS-based) 3D object detection methods. However, inaccurate depth estimation\nremains an important constraint on the accuracy of camera-only and multi-modal\n3D object detection models, especially in regions where the depth changes\nsignificantly (i.e., the ``depth jump'' problem). In this paper, we propose a\nnovel Edge-aware Lift-splat-shot (EA-LSS) framework. Specifically, an edge-aware\ndepth fusion (EADF) module is proposed to alleviate the ``depth jump'' problem\nand a fine-grained depth (FGD) module to further enforce refined supervision on\ndepth. Our EA-LSS framework is compatible with any LSS-based 3D object detection\nmodel, and effectively boosts its performance with a negligible increase in\ninference time. Experiments on nuScenes benchmarks demonstrate that EA-LSS is\neffective for both camera-only and multi-modal models. It is worth mentioning\nthat EA-LSS achieved state-of-the-art performance on the nuScenes test\nbenchmark with mAP and NDS of 76.5% and 77.6%, respectively.\n","authors":["Haotian Hu","Fanyi Wang","Jingwen Su","Yaonong Wang","Laifeng Hu","Weiye Fang","Jingwei Xu","Zhiwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.17895v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11369v1","updated":"2023-08-22T11:48:43Z","published":"2023-08-22T11:48:43Z","title":"Enhancing Interpretable Object Abstraction via Clustering-based Slot\n Initialization","summary":" Object-centric representations using slots have shown advances towards\nefficient, flexible and interpretable abstraction from low-level perceptual\nfeatures in a compositional scene. 
Current approaches randomize the initial\nstate of slots followed by an iterative refinement. As we show in this paper,\nthe random slot initialization significantly affects the accuracy of the final\nslot prediction. Moreover, current approaches require a predetermined number of\nslots from prior knowledge of the data, which limits the applicability in the\nreal world. In our work, we initialize the slot representations with clustering\nalgorithms conditioned on the perceptual input features. This requires an\nadditional layer in the architecture to initialize the slots given the\nidentified clusters. We design permutation invariant and permutation\nequivariant versions of this layer to enable the exchangeable slot\nrepresentations after clustering. Additionally, we employ mean-shift clustering\nto automatically identify the number of slots for a given scene. We evaluate\nour method on object discovery and novel view synthesis tasks with various\ndatasets. The results show that our method outperforms prior works\nconsistently, especially for complex scenes.\n","authors":["Ning Gao","Bernard Hohmann","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2308.11369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11365v1","updated":"2023-08-22T11:41:08Z","published":"2023-08-22T11:41:08Z","title":"Towards Clip-Free Quantized Super-Resolution Networks: How to Tame\n Representative Images","summary":" Super-resolution (SR) networks have been investigated for a while, with their\nmobile and lightweight versions gaining noticeable popularity recently.\nQuantization, the procedure of decreasing the precision of network parameters\n(mostly FP32 to INT8), is also utilized in SR networks for establishing mobile\ncompatibility. This study focuses on a very important but mostly overlooked\npost-training quantization (PTQ) step: representative dataset (RD), which\nadjusts the quantization range for PTQ. We propose a novel pipeline (clip-free\nquantization pipeline, CFQP) backed up with extensive experimental\njustifications to cleverly augment RD images by only using outputs of the FP32\nmodel. Using the proposed pipeline for RD, we can successfully eliminate\nunwanted clipped activation layers, which nearly all mobile SR methods utilize\nto make the model more robust to PTQ in return for a large overhead in runtime.\nRemoving clipped activations with our method significantly benefits overall\nincreased stability, decreased inference runtime up to 54% on some SR models,\nbetter visual quality results compared to INT8 clipped models - and outperforms\neven some FP32 non-quantized models, both in runtime and visual quality,\nwithout the need for retraining with clipped activation.\n","authors":["Alperen Kalay","Bahri Batuhan Bilecen","Mustafa Ayazoglu"],"pdf_url":"https://arxiv.org/pdf/2308.11365v1.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2306.06960v2","updated":"2023-08-22T11:31:21Z","published":"2023-06-12T08:46:02Z","title":"Semantic Parsing of Colonoscopy Videos with Multi-Label Temporal\n Networks","summary":" Following the successful debut of polyp detection and characterization, more\nadvanced automation tools are being developed for colonoscopy. The new\nautomation tasks, such as quality metrics or report generation, require\nunderstanding of the procedure flow that includes activities, events,\nanatomical landmarks, etc. In this work we present a method for automatic\nsemantic parsing of colonoscopy videos. 
The method uses a novel DL multi-label\ntemporal segmentation model trained in supervised and unsupervised regimes. We\nevaluate the accuracy of the method on a test set of over 300 annotated\ncolonoscopy videos, and use ablation to explore the relative importance of\nvarious method's components.\n","authors":["Ori Kelner","Or Weinstein","Ehud Rivlin","Roman Goldenberg"],"pdf_url":"https://arxiv.org/pdf/2306.06960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07713v2","updated":"2023-08-22T11:20:49Z","published":"2023-06-13T12:00:49Z","title":"Robustness of SAM: Segment Anything Under Corruptions and Beyond","summary":" Segment anything model (SAM), as the name suggests, is claimed to be capable\nof cutting out any object and demonstrates impressive zero-shot transfer\nperformance with the guidance of a prompt. However, there is currently a lack\nof comprehensive evaluation regarding its robustness under various corruptions.\nUnderstanding SAM's robustness across different corruption scenarios is crucial\nfor its real-world deployment. Prior works show that SAM is biased towards\ntexture (style) rather than shape, motivated by which we start by investigating\nSAM's robustness against style transfer, which is synthetic corruption.\nFollowing the interpretation of the corruption's effect as style change, we\nproceed to conduct a comprehensive evaluation of the SAM for its robustness\nagainst 15 types of common corruption. These corruptions mainly fall into\ncategories such as digital, noise, weather, and blur. Within each of these\ncorruption categories, we explore 5 severity levels to simulate real-world\ncorruption scenarios. Beyond the corruptions, we further assess its robustness\nregarding local occlusion and local adversarial patch attacks in images. To the\nbest of our knowledge, our work is the first of its kind to evaluate the\nrobustness of SAM under style change, local occlusion, and local adversarial\npatch attacks. Considering that patch attacks visible to human eyes are easily\ndetectable, we also assess SAM's robustness against adversarial perturbations\nthat are imperceptible to human eyes. Overall, this work provides a\ncomprehensive empirical study on SAM's robustness, evaluating its performance\nunder various corruptions and extending the assessment to critical aspects like\nlocal occlusion, local patch attacks, and imperceptible adversarial\nperturbations, which yields valuable insights into SAM's practical\napplicability and effectiveness in addressing real-world challenges.\n","authors":["Yu Qiao","Chaoning Zhang","Taegoo Kang","Donghun Kim","Shehbaz Tariq","Chenshuang Zhang","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2306.07713v2.pdf","comment":"The first work evaluates the robustness of SAM under various\n corruptions such as style transfer, local occlusion, and adversarial patch\n attack"},{"id":"http://arxiv.org/abs/2308.11358v1","updated":"2023-08-22T11:20:40Z","published":"2023-08-22T11:20:40Z","title":"How Much Temporal Long-Term Context is Needed for Action Segmentation?","summary":" Modeling long-term context in videos is crucial for many fine-grained tasks\nincluding temporal action segmentation. An interesting question that is still\nopen is how much long-term temporal context is needed for optimal performance.\nWhile transformers can model the long-term context of a video, this becomes\ncomputationally prohibitive for long videos. 
Recent works on temporal action\nsegmentation thus combine temporal convolutional networks with self-attentions\nthat are computed only for a local temporal window. While these approaches show\ngood results, their performance is limited by their inability to capture the\nfull context of a video. In this work, we try to answer how much long-term\ntemporal context is required for temporal action segmentation by introducing a\ntransformer-based model that leverages sparse attention to capture the full\ncontext of a video. We compare our model with the current state of the art on\nthree datasets for temporal action segmentation, namely 50Salads, Breakfast,\nand Assembly101. Our experiments show that modeling the full context of a video\nis necessary to obtain the best performance for temporal action segmentation.\n","authors":["Emad Bahrami","Gianpiero Francesca","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2308.11358v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11357v1","updated":"2023-08-22T11:17:51Z","published":"2023-08-22T11:17:51Z","title":"Exemplar-Free Continual Transformer with Convolutions","summary":" Continual Learning (CL) involves training a machine learning model in a\nsequential manner to learn new information while retaining previously learned\ntasks without the presence of previous training data. Although there has been\nsignificant interest in CL, most recent CL approaches in computer vision have\nfocused on convolutional architectures only. However, with the recent success\nof vision transformers, there is a need to explore their potential for CL.\nAlthough there have been some recent CL approaches for vision transformers,\nthey either store training instances of previous tasks or require a task\nidentifier during test time, which can be limiting. This paper proposes a new\nexemplar-free approach for class/task incremental learning called ConTraCon,\nwhich does not require task-id to be explicitly present during inference and\navoids the need for storing previous training instances. The proposed approach\nleverages the transformer architecture and involves re-weighting the key,\nquery, and value weights of the multi-head self-attention layers of a\ntransformer trained on a similar task. The re-weighting is done using\nconvolution, which enables the approach to maintain low parameter requirements\nper task. Additionally, an image augmentation-based entropic task\nidentification approach is used to predict tasks without requiring task-ids\nduring inference. Experiments on four benchmark datasets demonstrate that the\nproposed approach outperforms several competitive approaches while requiring\nfewer parameters.\n","authors":["Anurag Roy","Vinay Kumar Verma","Sravan Voonna","Kripabandhu Ghosh","Saptarshi Ghosh","Abir Das"],"pdf_url":"https://arxiv.org/pdf/2308.11357v1.pdf","comment":"Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11356v1","updated":"2023-08-22T11:16:24Z","published":"2023-08-22T11:16:24Z","title":"Semantic RGB-D Image Synthesis","summary":" Collecting diverse sets of training images for RGB-D semantic image\nsegmentation is not always possible. In particular, when robots need to operate\nin privacy-sensitive areas like homes, the collection is often limited to a\nsmall set of locations. As a consequence, the annotated images lack diversity\nin appearance and approaches for RGB-D semantic image segmentation tend to\noverfit the training data. 
In this paper, we thus introduce semantic RGB-D\nimage synthesis to address this problem. It requires synthesising a\nrealistic-looking RGB-D image for a given semantic label map. Current\napproaches, however, are uni-modal and cannot cope with multi-modal data.\nIndeed, we show that extending uni-modal approaches to multi-modal data does\nnot perform well. In this paper, we therefore propose a generator for\nmulti-modal data that separates modal-independent information of the semantic\nlayout from the modal-dependent information that is needed to generate an RGB\nand a depth image, respectively. Furthermore, we propose a discriminator that\nensures semantic consistency between the label maps and the generated images\nand perceptual similarity between the real and generated images. Our\ncomprehensive experiments demonstrate that the proposed method outperforms\nprevious uni-modal methods by a large margin and that the accuracy of an\napproach for RGB-D semantic segmentation can be significantly improved by\nmixing real and generated images during training.\n","authors":["Shijie Li","Rong Li","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2308.11356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11722v3","updated":"2023-08-22T11:01:57Z","published":"2023-03-21T10:24:29Z","title":"Implicit Neural Representation for Cooperative Low-light Image\n Enhancement","summary":" The following three factors restrict the application of existing low-light\nimage enhancement methods: unpredictable brightness degradation and noise,\ninherent gap between metric-favorable and visual-friendly versions, and the\nlimited paired training data. To address these limitations, we propose an\nimplicit Neural Representation method for Cooperative low-light image\nenhancement, dubbed NeRCo. It robustly recovers perceptual-friendly results in\nan unsupervised manner. Concretely, NeRCo unifies the diverse degradation\nfactors of real-world scenes with a controllable fitting function, leading to\nbetter robustness. In addition, for the output results, we introduce\nsemantic-orientated supervision with priors from the pre-trained\nvision-language model. Instead of merely following reference images, it\nencourages results to meet subjective expectations, finding more\nvisual-friendly solutions. Further, to ease the reliance on paired data and\nreduce solution space, we develop a dual-closed-loop constrained enhancement\nmodule. It is trained cooperatively with other affiliated modules in a\nself-supervised manner. Finally, extensive experiments demonstrate the\nrobustness and superior effectiveness of our proposed NeRCo. Our code is\navailable at https://github.com/Ysz2022/NeRCo.\n","authors":["Shuzhou Yang","Moxuan Ding","Yanmin Wu","Zihan Li","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.11722v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11340v1","updated":"2023-08-22T10:44:30Z","published":"2023-08-22T10:44:30Z","title":"Integration of Sentinel-1 and Sentinel-2 data for Earth surface\n classification using Machine Learning algorithms implemented on Google Earth\n Engine","summary":" In this study, Synthetic Aperture Radar (SAR) and optical data are both\nconsidered for Earth surface classification. Specifically, the integration of\nSentinel-1 (S-1) and Sentinel-2 (S-2) data is carried out through supervised\nMachine Learning (ML) algorithms implemented on the Google Earth Engine (GEE)\nplatform for the classification of a particular region of interest. 
The achieved\nresults demonstrate how, in this case, radar and optical remote detection provide\ncomplementary information, benefiting surface cover classification and\ngenerally leading to increased mapping accuracy. In addition, this paper helps demonstrate the emerging role of GEE as an effective\ncloud-based tool for handling large amounts of satellite data.\n","authors":["Francesca Razzano","Mariapia Rita Iandolo","Chiara Zarro","G. S. Yogesh","Silvia Liberata Ullo"],"pdf_url":"https://arxiv.org/pdf/2308.11340v1.pdf","comment":"4 pages, 7 figures, IEEE InGARSS conference"},{"id":"http://arxiv.org/abs/2308.11331v1","updated":"2023-08-22T10:07:49Z","published":"2023-08-22T10:07:49Z","title":"GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive\n Language-Image Pre-training","summary":" Cross-modal pre-training has shown impressive performance on a wide range of\ndownstream tasks, benefiting from massive image-text pairs collected from the\nInternet. In practice, online data are growing constantly, highlighting the\nimportance of the ability of pre-trained models to learn from continuously\ngrowing data. Existing works on cross-modal pre-training mainly focus\non training a network with a fixed architecture. However, it is impractical to\nlimit the model capacity when considering the continuously growing nature of\npre-training data in real-world applications. On the other hand, it is\nimportant to utilize the knowledge in the current model to obtain efficient\ntraining and better performance. To address the above issues, in this paper, we\npropose GrowCLIP, a data-driven automatic model growing algorithm for\ncontrastive language-image pre-training with continuous image-text pairs as\ninput. Specifically, we adopt a dynamic growth space and seek out the optimal\narchitecture at each growth step to adapt to online learning scenarios. A\nshared encoder is proposed in our growth space to enhance the degree of\ncross-modal fusion. Besides, we explore the effect of growth in different\ndimensions, which could provide future references for the design of cross-modal\nmodel architectures. Finally, we employ parameter inheriting with momentum (PIM)\nto maintain the previous knowledge and address the local minimum\ndilemma. Compared with existing methods, GrowCLIP improves average\ntop-1 accuracy by 2.3% on zero-shot image classification across 9 downstream tasks. As for\nzero-shot image retrieval, GrowCLIP improves top-1 image-to-text\nrecall by 1.2% on the Flickr30K dataset.\n","authors":["Xinchi Deng","Han Shi","Runhui Huang","Changlin Li","Hang Xu","Jianhua Han","James Kwok","Shen Zhao","Wei Zhang","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2308.11331v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.11327v1","updated":"2023-08-22T09:59:35Z","published":"2023-08-22T09:59:35Z","title":"Object Detection Difficulty: Suppressing Over-aggregation for Faster and\n Better Video Object Detection","summary":" Current video object detection (VOD) models often encounter issues with\nover-aggregation due to redundant aggregation strategies, which perform feature\naggregation on every frame. This results in suboptimal performance and\nincreased computational complexity. In this work, we propose an image-level\nObject Detection Difficulty (ODD) metric to quantify the difficulty of\ndetecting objects in a given image. The derived ODD scores can be used in the\nVOD process to mitigate over-aggregation. 
Specifically, we train an ODD\npredictor as an auxiliary head of a still-image object detector to compute the\nODD score for each image based on the discrepancies between detection results\nand ground-truth bounding boxes. The ODD score enhances the VOD system in two\nways: 1) it enables the VOD system to select superior global reference frames,\nthereby improving overall accuracy; and 2) it serves as an indicator in the\nnewly designed ODD Scheduler to eliminate the aggregation of frames that are\neasy to detect, thus accelerating the VOD process. Comprehensive experiments\ndemonstrate that, when utilized for selecting global reference frames, ODD-VOD\nconsistently enhances the accuracy of Global-frame-based VOD models. When\nemployed for acceleration, ODD-VOD consistently improves the frames per second\n(FPS) by an average of 73.3% across 8 different VOD models without sacrificing\naccuracy. When combined, ODD-VOD attains state-of-the-art performance when\ncompeting with many VOD methods in both accuracy and speed. Our work represents\na significant advancement towards making VOD more practical for real-world\napplications.\n","authors":["Bingqing Zhang","Sen Wang","Yifan Liu","Brano Kusy","Xue Li","Jiajun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11327v1.pdf","comment":"11 pages, 6 figures, accepted by ACM MM2023"},{"id":"http://arxiv.org/abs/2308.11322v1","updated":"2023-08-22T09:53:12Z","published":"2023-08-22T09:53:12Z","title":"CiteTracker: Correlating Image and Text for Visual Tracking","summary":" Existing visual tracking methods typically take an image patch as the\nreference of the target to perform tracking. However, a single image patch\ncannot provide a complete and precise concept of the target object as images\nare limited in their ability to abstract and can be ambiguous, which makes it\ndifficult to track targets with drastic variations. In this paper, we propose\nthe CiteTracker to enhance target modeling and inference in visual tracking by\nconnecting images and text. Specifically, we develop a text generation module\nto convert the target image patch into a descriptive text containing its class\nand attribute information, providing a comprehensive reference point for the\ntarget. In addition, a dynamic description module is designed to adapt to\ntarget variations for more effective target representation. We then associate\nthe target description and the search image using an attention-based\ncorrelation module to generate the correlated features for target state\nreference. Extensive experiments on five diverse datasets are conducted to\nevaluate the proposed algorithm and the favorable performance against the\nstate-of-the-art methods demonstrates the effectiveness of the proposed\ntracking method.\n","authors":["Xin Li","Yuqing Huang","Zhenyu He","Yaowei Wang","Huchuan Lu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11322v1.pdf","comment":"accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11316v1","updated":"2023-08-22T09:49:26Z","published":"2023-08-22T09:49:26Z","title":"Using and Abusing Equivariance","summary":" In this paper we show how Group Equivariant Convolutional Neural Networks use\nsubsampling to learn to break equivariance to their symmetries. We focus on 2D\nrotations and reflections and investigate the impact of broken equivariance on\nnetwork performance. 
We show that a change in the input dimension of a network\nas small as a single pixel can be enough for commonly used architectures to\nbecome approximately equivariant, rather than exactly. We investigate the\nimpact of networks not being exactly equivariant and find that approximately\nequivariant networks generalise significantly worse to unseen symmetries\ncompared to their exactly equivariant counterparts. However, when the\nsymmetries in the training data are not identical to the symmetries of the\nnetwork, we find that approximately equivariant networks are able to relax\ntheir own equivariant constraints, causing them to match or outperform exactly\nequivariant networks on common benchmark datasets.\n","authors":["Tom Edixhoven","Attila Lengyel","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2308.11316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09944v4","updated":"2023-08-22T09:31:35Z","published":"2022-07-20T14:41:09Z","title":"Probable Domain Generalization via Quantile Risk Minimization","summary":" Domain generalization (DG) seeks predictors which perform well on unseen test\ndistributions by leveraging data drawn from multiple related training\ndistributions or domains. To achieve this, DG is commonly formulated as an\naverage- or worst-case problem over the set of possible domains. However,\npredictors that perform well on average lack robustness while predictors that\nperform well in the worst case tend to be overly-conservative. To address this,\nwe propose a new probabilistic framework for DG where the goal is to learn\npredictors that perform well with high probability. Our key idea is that\ndistribution shifts seen during training should inform us of probable shifts at\ntest time, which we realize by explicitly relating training and test domains as\ndraws from the same underlying meta-distribution. To achieve probable DG, we\npropose a new optimization problem called Quantile Risk Minimization (QRM). By\nminimizing the $\\alpha$-quantile of predictor's risk distribution over domains,\nQRM seeks predictors that perform well with probability $\\alpha$. To solve QRM\nin practice, we propose the Empirical QRM (EQRM) algorithm and provide: (i) a\ngeneralization bound for EQRM; and (ii) the conditions under which EQRM\nrecovers the causal predictor as $\\alpha \\to 1$. In our experiments, we\nintroduce a more holistic quantile-focused evaluation protocol for DG and\ndemonstrate that EQRM outperforms state-of-the-art baselines on datasets from\nWILDS and DomainBed.\n","authors":["Cian Eastwood","Alexander Robey","Shashank Singh","Julius von Kügelgen","Hamed Hassani","George J. Pappas","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2207.09944v4.pdf","comment":"NeurIPS 2022 camera-ready (+ minor corrections)"},{"id":"http://arxiv.org/abs/2308.11300v1","updated":"2023-08-22T09:29:05Z","published":"2023-08-22T09:29:05Z","title":"Approaching human 3D shape perception with neurally mappable models","summary":" Humans effortlessly infer the 3D shape of objects. What computations underlie\nthis ability? Although various computational models have been proposed, none of\nthem capture the human ability to match object shape across viewpoints. Here,\nwe ask whether and how this gap might be closed. We begin with a relatively\nnovel class of computational models, 3D neural fields, which encapsulate the\nbasic principles of classic analysis-by-synthesis in a deep neural network\n(DNN). 
First, we find that a 3D Light Field Network (3D-LFN) supports 3D\nmatching judgments well aligned to humans for within-category comparisons,\nadversarially-defined comparisons that accentuate the 3D failure cases of\nstandard DNN models, and adversarially-defined comparisons for algorithmically\ngenerated shapes with no category structure. We then investigate the source of\nthe 3D-LFN's ability to achieve human-aligned performance through a series of\ncomputational experiments. Exposure to multiple viewpoints of objects during\ntraining and a multi-view learning objective are the primary factors behind\nmodel-human alignment; even conventional DNN architectures come much closer to\nhuman behavior when trained with multi-view objectives. Finally, we find that\nwhile the models trained with multi-view learning objectives are able to\npartially generalize to new object categories, they fall short of human\nalignment. This work provides a foundation for understanding human shape\ninferences within neurally mappable computational architectures and highlights\nimportant questions for future work.\n","authors":["Thomas P. O'Connell","Tyler Bonnen","Yoni Friedman","Ayush Tewari","Josh B. Tenenbaum","Vincent Sitzmann","Nancy Kanwisher"],"pdf_url":"https://arxiv.org/pdf/2308.11300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04869v2","updated":"2023-08-22T09:21:46Z","published":"2023-03-08T20:22:08Z","title":"CROSSFIRE: Camera Relocalization On Self-Supervised Features from an\n Implicit Representation","summary":" Beyond novel view synthesis, Neural Radiance Fields are useful for\napplications that interact with the real world. In this paper, we use them as\nan implicit map of a given scene and propose a camera relocalization algorithm\ntailored for this representation. The proposed method enables computing, in\nreal time, the precise position of a device using a single RGB camera during\nits navigation. In contrast with previous work, we do not rely on pose\nregression or photometric alignment but rather use dense local features\nobtained through volumetric rendering which are specialized to the scene with a\nself-supervised objective. As a result, our algorithm is more accurate than\ncompetitors, is able to operate in dynamic outdoor environments with changing\nlighting conditions, and can be readily integrated into any volumetric neural\nrenderer.\n","authors":["Arthur Moreau","Nathan Piasco","Moussab Bennehar","Dzmitry Tsishkou","Bogdan Stanciulescu","Arnaud de La Fortelle"],"pdf_url":"https://arxiv.org/pdf/2303.04869v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11298v1","updated":"2023-08-22T09:20:55Z","published":"2023-08-22T09:20:55Z","title":"BHSD: A 3D Multi-Class Brain Hemorrhage Segmentation Dataset","summary":" Intracranial hemorrhage (ICH) is a pathological condition characterized by\nbleeding inside the skull or brain, which can be attributed to various factors.\nIdentifying, localizing and quantifying ICH has important clinical\nimplications, in a bleed-dependent manner. While deep learning techniques are\nwidely used in medical image segmentation and have been applied to the ICH\nsegmentation task, existing public ICH datasets do not support the multi-class\nsegmentation problem. To address this, we develop the Brain Hemorrhage\nSegmentation Dataset (BHSD), which provides a 3D multi-class ICH dataset\ncontaining 192 volumes with pixel-level annotations and 2200 volumes with\nslice-level annotations across five categories of ICH. 
To demonstrate the\nutility of the dataset, we formulate a series of supervised and semi-supervised\nICH segmentation tasks. We provide experimental results with state-of-the-art\nmodels as reference benchmarks for further model developments and evaluations\non this dataset.\n","authors":["Biao Wu","Yutong Xie","Zeyu Zhang","Jinchao Ge","Kaspar Yaxley","Suzan Bahadir","Qi Wu","Yifan Liu","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2308.11298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11291v1","updated":"2023-08-22T09:12:11Z","published":"2023-08-22T09:12:11Z","title":"Improving Knot Prediction in Wood Logs with Longitudinal Feature\n Propagation","summary":" The quality of a wood log in the wood industry depends heavily on the\npresence of both outer and inner defects, including inner knots that are a\nresult of the growth of tree branches. Today, locating the inner knots require\nthe use of expensive equipment such as X-ray scanners. In this paper, we\naddress the task of predicting the location of inner defects from the outer\nshape of the logs. The dataset is built by extracting both the contours and the\nknots with X-ray measurements. We propose to solve this binary segmentation\ntask by leveraging convolutional recurrent neural networks. Once the neural\nnetwork is trained, inference can be performed from the outer shape measured\nwith cheap devices such as laser profilers. We demonstrate the effectiveness of\nour approach on fir and spruce tree species and perform ablation on the\nrecurrence to demonstrate its importance.\n","authors":["Salim Khazem","Jeremy Fix","Cédric Pradalier"],"pdf_url":"https://arxiv.org/pdf/2308.11291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14729v2","updated":"2023-08-22T09:06:01Z","published":"2023-07-27T09:35:56Z","title":"Understanding Silent Failures in Medical Image Classification","summary":" To ensure the reliable use of classification systems in medical applications,\nit is crucial to prevent silent failures. This can be achieved by either\ndesigning classifiers that are robust enough to avoid failures in the first\nplace, or by detecting remaining failures using confidence scoring functions\n(CSFs). A predominant source of failures in image classification is\ndistribution shifts between training data and deployment data. To understand\nthe current state of silent failure prevention in medical imaging, we conduct\nthe first comprehensive analysis comparing various CSFs in four biomedical\ntasks and a diverse range of distribution shifts. Based on the result that none\nof the benchmarked CSFs can reliably prevent silent failures, we conclude that\na deeper understanding of the root causes of failures in the data is required.\nTo facilitate this, we introduce SF-Visuals, an interactive analysis tool that\nuses latent space clustering to visualize shifts and failures. On the basis of\nvarious examples, we demonstrate how this tool can help researchers gain\ninsight into the requirements for safe application of classification systems in\nthe medical domain. The open-source benchmark and tool are at:\nhttps://github.com/IML-DKFZ/sf-visuals.\n","authors":["Till J. Bungert","Levin Kobelke","Paul F. 
Jaeger"],"pdf_url":"https://arxiv.org/pdf/2307.14729v2.pdf","comment":"Accepted at MICCAI 23"},{"id":"http://arxiv.org/abs/2308.11281v1","updated":"2023-08-22T08:50:38Z","published":"2023-08-22T08:50:38Z","title":"PCMC-T1: Free-breathing myocardial T1 mapping with\n Physically-Constrained Motion Correction","summary":" T1 mapping is a quantitative magnetic resonance imaging (qMRI) technique that\nhas emerged as a valuable tool in the diagnosis of diffuse myocardial diseases.\nHowever, prevailing approaches have relied heavily on breath-hold sequences to\neliminate respiratory motion artifacts. This limitation hinders accessibility\nand effectiveness for patients who cannot tolerate breath-holding. Image\nregistration can be used to enable free-breathing T1 mapping. Yet, inherent\nintensity differences between the different time points make the registration\ntask challenging. We introduce PCMC-T1, a physically-constrained deep-learning\nmodel for motion correction in free-breathing T1 mapping. We incorporate the\nsignal decay model into the network architecture to encourage\nphysically-plausible deformations along the longitudinal relaxation axis. We\ncompared PCMC-T1 to baseline deep-learning-based image registration approaches\nusing a 5-fold experimental setup on a publicly available dataset of 210\npatients. PCMC-T1 demonstrated superior model fitting quality (R2: 0.955) and\nachieved the highest clinical impact (clinical score: 3.93) compared to\nbaseline methods (0.941, 0.946 and 3.34, 3.62 respectively). Anatomical\nalignment results were comparable (Dice score: 0.9835 vs. 0.984, 0.988). Our\ncode and trained models are available at https://github.com/eyalhana/PCMC-T1.\n","authors":["Eyal Hanania","Ilya Volovik","Lilach Barkat","Israel Cohen","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2308.11281v1.pdf","comment":"Accepted to MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.11277v1","updated":"2023-08-22T08:46:30Z","published":"2023-08-22T08:46:30Z","title":"CNN based Cuneiform Sign Detection Learned from Annotated 3D Renderings\n and Mapped Photographs with Illumination Augmentation","summary":" Motivated by the challenges of the Digital Ancient Near Eastern Studies\n(DANES) community, we develop digital tools for processing cuneiform script\nbeing a 3D script imprinted into clay tablets used for more than three\nmillennia and at least eight major languages. It consists of thousands of\ncharacters that have changed over time and space. Photographs are the most\ncommon representations usable for machine learning, while ink drawings are\nprone to interpretation. Best suited 3D datasets that are becoming available.\nWe created and used the HeiCuBeDa and MaiCuBeDa datasets, which consist of\naround 500 annotated tablets. For our novel OCR-like approach to mixed image\ndata, we provide an additional mapping tool for transferring annotations\nbetween 3D renderings and photographs. Our sign localization uses a RepPoints\ndetector to predict the locations of characters as bounding boxes. We use image\ndata from GigaMesh's MSII (curvature, see https://gigamesh.eu) based rendering,\nPhong-shaded 3D models, and photographs as well as illumination augmentation.\nThe results show that using rendered 3D images for sign detection performs\nbetter than other work on photographs. In addition, our approach gives\nreasonably good results for photographs only, while it is best used for mixed\ndatasets. 
More importantly, the Phong renderings, and especially the MSII\nrenderings, improve the results on photographs, which constitute the largest dataset on\na global scale.\n","authors":["Ernst Stötzner","Timo Homburg","Hubert Mara"],"pdf_url":"https://arxiv.org/pdf/2308.11277v1.pdf","comment":"This paper was accepted to ICCV23 and includes the DOI for an Open\n Access Dataset with annotated cuneiform script"},{"id":"http://arxiv.org/abs/2303.08923v2","updated":"2023-08-22T08:26:40Z","published":"2023-03-15T20:41:24Z","title":"Panoptic Mapping with Fruit Completion and Pose Estimation for\n Horticultural Robots","summary":" Monitoring plants and fruits at high resolution plays a key role in the future\nof agriculture. Accurate 3D information can pave the way to a diverse range of\nrobotic applications in agriculture ranging from autonomous harvesting to\nprecise yield estimation. Obtaining such 3D information is non-trivial as\nagricultural environments are often repetitive and cluttered, and one has to\naccount for the partial observability of fruit and plants. In this paper, we\naddress the problem of jointly estimating complete 3D shapes of fruit and their\npose in a 3D multi-resolution map built by a mobile robot. To this end, we\npropose an online multi-resolution panoptic mapping system where regions of\ninterest are represented with a higher resolution. We exploit data to learn a\ngeneral fruit shape representation that we use at inference time together with\nan occlusion-aware differentiable rendering pipeline to complete partial fruit\nobservations and estimate the 7 DoF pose of each fruit in the map. The\nexperiments presented in this paper, evaluated both in a controlled\nenvironment and in a commercial greenhouse, show that our novel algorithm\nyields higher completion and pose estimation accuracy than existing methods,\nwith an improvement of 41% in completion accuracy and 52% in pose estimation\naccuracy, while keeping a low inference time of 0.6s on average. Code is\navailable at: https://github.com/PRBonn/HortiMapping.\n","authors":["Yue Pan","Federico Magistri","Thomas Läbe","Elias Marks","Claus Smitt","Chris McCool","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2303.08923v2.pdf","comment":"8 pages, IROS 2023"},{"id":"http://arxiv.org/abs/2308.11261v1","updated":"2023-08-22T08:07:12Z","published":"2023-08-22T08:07:12Z","title":"HMD-NeMo: Online 3D Avatar Motion Generation From Sparse Observations","summary":" Generating both plausible and accurate full body avatar motion is the key to\nthe quality of immersive experiences in mixed reality scenarios. Head-Mounted\nDevices (HMDs) typically only provide a few input signals, such as head and\nhands 6-DoF. Recently, different approaches achieved impressive performance in\ngenerating full body motion given only head and hand signals. However, to the\nbest of our knowledge, all existing approaches rely on full hand visibility.\nWhile this is the case when, e.g., using motion controllers, a considerable\nproportion of mixed reality experiences do not involve motion controllers and\ninstead rely on egocentric hand tracking. This introduces the challenge of\npartial hand visibility owing to the restricted field of view of the HMD. In\nthis paper, we propose the first unified approach, HMD-NeMo, that addresses\nplausible and accurate full body motion generation even when the hands may be\nonly partially visible. 
HMD-NeMo is a lightweight neural network that predicts\nthe full body motion in an online and real-time fashion. At the heart of\nHMD-NeMo is the spatio-temporal encoder with novel temporally adaptable mask\ntokens that encourage plausible motion in the absence of hand observations. We\nperform extensive analysis of the impact of different components in HMD-NeMo\nand introduce a new state-of-the-art on AMASS dataset through our evaluation.\n","authors":["Sadegh Aliakbarian","Fatemeh Saleh","David Collier","Pashmina Cameron","Darren Cosker"],"pdf_url":"https://arxiv.org/pdf/2308.11261v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.08476v2","updated":"2023-08-22T07:52:58Z","published":"2023-08-16T16:31:36Z","title":"Classification Committee for Active Deep Object Detection","summary":" In object detection, the cost of labeling is much high because it needs not\nonly to confirm the categories of multiple objects in an image but also to\naccurately determine the bounding boxes of each object. Thus, integrating\nactive learning into object detection will raise pretty positive significance.\nIn this paper, we propose a classification committee for active deep object\ndetection method by introducing a discrepancy mechanism of multiple classifiers\nfor samples' selection when training object detectors. The model contains a\nmain detector and a classification committee. The main detector denotes the\ntarget object detector trained from a labeled pool composed of the selected\ninformative images. The role of the classification committee is to select the\nmost informative images according to their uncertainty values from the view of\nclassification, which is expected to focus more on the discrepancy and\nrepresentative of instances. Specifically, they compute the uncertainty for a\nspecified instance within the image by measuring its discrepancy output by the\ncommittee pre-trained via the proposed Maximum Classifiers Discrepancy Group\nLoss (MCDGL). The most informative images are finally determined by selecting\nthe ones with many high-uncertainty instances. Besides, to mitigate the impact\nof interference instances, we design a Focus on Positive Instances Loss (FPIL)\nto make the committee the ability to automatically focus on the representative\ninstances as well as precisely encode their discrepancies for the same\ninstance. Experiments are conducted on Pascal VOC and COCO datasets versus some\npopular object detectors. And results show that our method outperforms the\nstate-of-the-art active learning methods, which verifies the effectiveness of\nthe proposed method.\n","authors":["Lei Zhao","Bo Li","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2308.08476v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00409v2","updated":"2023-08-22T07:45:09Z","published":"2023-06-01T07:19:28Z","title":"Adapting Pre-trained Language Models to Vision-Language Tasks via\n Dynamic Visual Prompting","summary":" Pre-trained language models (PLMs) have played an increasing role in\nmultimedia research. In terms of vision-language (VL) tasks, they often serve\nas a language encoder and still require an additional fusion network for VL\nreasoning, resulting in excessive memory overhead. In this paper, we focus on\nexploring PLMs as a stand-alone model for VL reasoning tasks. 
Inspired by the\nrecently popular prompt tuning, we first prove that the processed visual\nfeatures can be also projected onto the semantic space of PLMs and act as\nprompt tokens to bridge the gap between single- and multi-modal learning.\nHowever, this solution exhibits obvious redundancy in visual information and\nmodel inference, and the placement of prompt tokens also greatly affects the\nfinal performance. Based on these observations, we further propose a novel\ntransfer learning approach for PLMs, termed Dynamic Visual Prompting (DVP).\nConcretely, DVP first deploys a cross-attention module to obtain text-related\nand compact visual prompt tokens, thereby greatly reducing the input length of\nPLMs. To obtain the optimal placement, we also equip DVP with a\nreinforcement-learning based search algorithm, which can automatically merge\nDVP with PLMs for different VL tasks via a very short search process. In\naddition, we also experiment DVP with the recently popular adapter approach to\nkeep the most parameters of PLMs intact when adapting to VL tasks, helping PLMs\nachieve a quick shift between single- and multi-modal tasks. We apply DVP to\ntwo representative PLMs, namely BERT and T5, and conduct extensive experiments\non a set of VL reasoning benchmarks including VQA2.0, GQA and SNLIVE. The\nexperimental results not only show the advantage of DVP on efficiency and\nperformance, but also confirm its superiority in adapting pre-trained language\nmodels to VL tasks.\n","authors":["Shubin Huang","Qiong Wu","Yiyi Zhou","Weijie Chen","Rongsheng Zhang","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2306.00409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11249v1","updated":"2023-08-22T07:44:59Z","published":"2023-08-22T07:44:59Z","title":"Video BagNet: short temporal receptive fields increase robustness in\n long-term action recognition","summary":" Previous work on long-term video action recognition relies on deep\n3D-convolutional models that have a large temporal receptive field (RF). We\nargue that these models are not always the best choice for temporal modeling in\nvideos. A large temporal receptive field allows the model to encode the exact\nsub-action order of a video, which causes a performance decrease when testing\nvideos have a different sub-action order. In this work, we investigate whether\nwe can improve the model robustness to the sub-action order by shrinking the\ntemporal receptive field of action recognition models. For this, we design\nVideo BagNet, a variant of the 3D ResNet-50 model with the temporal receptive\nfield size limited to 1, 9, 17 or 33 frames. We analyze Video BagNet on\nsynthetic and real-world video datasets and experimentally compare models with\nvarying temporal receptive fields. We find that short receptive fields are\nrobust to sub-action order changes, while larger temporal receptive fields are\nsensitive to the sub-action order.\n","authors":["Ombretta Strafforello","Xin Liu","Klamer Schutte","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2308.11249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11244v1","updated":"2023-08-22T07:39:31Z","published":"2023-08-22T07:39:31Z","title":"Are current long-term video understanding datasets long-term?","summary":" Many real-world applications, from sport analysis to surveillance, benefit\nfrom automatic long-term action recognition. 
In the current deep learning\nparadigm for automatic action recognition, it is imperative that models are\ntrained and tested on datasets and tasks that evaluate if such models actually\nlearn and reason over long-term information. In this work, we propose a method\nto evaluate how suitable a video dataset is to evaluate models for long-term\naction recognition. To this end, we define a long-term action as excluding all\nthe videos that can be correctly recognized using solely short-term\ninformation. We test this definition on existing long-term classification tasks\non three popular real-world datasets, namely Breakfast, CrossTask and LVU, to\ndetermine if these datasets are truly evaluating long-term recognition. Our\nstudy reveals that these datasets can be effectively solved using shortcuts\nbased on short-term information. Following this finding, we encourage long-term\naction recognition researchers to make use of datasets that need long-term\ninformation to be solved.\n","authors":["Ombretta Strafforello","Klamer Schutte","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2308.11244v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11239v1","updated":"2023-08-22T07:27:09Z","published":"2023-08-22T07:27:09Z","title":"LOCATE: Self-supervised Object Discovery via Flow-guided Graph-cut and\n Bootstrapped Self-training","summary":" Learning object segmentation in image and video datasets without human\nsupervision is a challenging problem. Humans easily identify moving salient\nobjects in videos using the gestalt principle of common fate, which suggests\nthat what moves together belongs together. Building upon this idea, we propose\na self-supervised object discovery approach that leverages motion and\nappearance information to produce high-quality object segmentation masks.\nSpecifically, we redesign the traditional graph cut on images to include motion\ninformation in a linear combination with appearance information to produce edge\nweights. Remarkably, this step produces object segmentation masks comparable to\nthe current state-of-the-art on multiple benchmarks. To further improve\nperformance, we bootstrap a segmentation network trained on these preliminary\nmasks as pseudo-ground truths to learn from its own outputs via self-training.\nWe demonstrate the effectiveness of our approach, named LOCATE, on multiple\nstandard video object segmentation, image saliency detection, and object\nsegmentation benchmarks, achieving results on par with and, in many cases\nsurpassing state-of-the-art methods. We also demonstrate the transferability of\nour approach to novel domains through a qualitative study on in-the-wild\nimages. Additionally, we present extensive ablation analysis to support our\ndesign choices and highlight the contribution of each component of our proposed\nmethod.\n","authors":["Silky Singh","Shripad Deshmukh","Mausoom Sarkar","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2308.11239v1.pdf","comment":"Accepted to the British Machine Vision Conference (BMVC) 2023"},{"id":"http://arxiv.org/abs/2308.10315v2","updated":"2023-08-22T07:20:59Z","published":"2023-08-20T16:27:17Z","title":"Improving Adversarial Robustness of Masked Autoencoders via Test-time\n Frequency-domain Prompting","summary":" In this paper, we investigate the adversarial robustness of vision\ntransformers that are equipped with BERT pretraining (e.g., BEiT, MAE). A\nsurprising observation is that MAE has significantly worse adversarial\nrobustness than other BERT pretraining methods. 
This observation drives us to\nrethink the basic differences between these BERT pretraining methods and how\nthese differences affect the robustness against adversarial perturbations. Our\nempirical analysis reveals that the adversarial robustness of BERT pretraining\nis highly related to the reconstruction target, i.e., predicting the raw pixels\nof masked image patches will degrade more adversarial robustness of the model\nthan predicting the semantic context, since it guides the model to concentrate\nmore on medium-/high-frequency components of images. Based on our analysis, we\nprovide a simple yet effective way to boost the adversarial robustness of MAE.\nThe basic idea is using the dataset-extracted domain knowledge to occupy the\nmedium-/high-frequency of images, thus narrowing the optimization space of\nadversarial perturbations. Specifically, we group the distribution of\npretraining data and optimize a set of cluster-specific visual prompts on\nfrequency domain. These prompts are incorporated with input images through\nprototype-based prompt selection during test period. Extensive evaluation shows\nthat our method clearly boost MAE's adversarial robustness while maintaining\nits clean performance on ImageNet-1k classification. Our code is available at:\nhttps://github.com/shikiw/RobustMAE.\n","authors":["Qidong Huang","Xiaoyi Dong","Dongdong Chen","Yinpeng Chen","Lu Yuan","Gang Hua","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2308.10315v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.10685v2","updated":"2023-08-22T07:15:30Z","published":"2023-07-20T08:25:38Z","title":"Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged\n Object Detection","summary":" Camouflaged object detection (COD), aiming to segment camouflaged objects\nwhich exhibit similar patterns with the background, is a challenging task. Most\nexisting works are dedicated to establishing specialized modules to identify\ncamouflaged objects with complete and fine details, while the boundary can not\nbe well located for the lack of object-related semantics. In this paper, we\npropose a novel ``pre-train, adapt and detect\" paradigm to detect camouflaged\nobjects. By introducing a large pre-trained model, abundant knowledge learned\nfrom massive multi-modal data can be directly transferred to COD. A lightweight\nparallel adapter is inserted to adjust the features suitable for the downstream\nCOD task. Extensive experiments on four challenging benchmark datasets\ndemonstrate that our method outperforms existing state-of-the-art COD models by\nlarge margins. Moreover, we design a multi-task learning scheme for tuning the\nadapter to exploit the shareable knowledge across different semantic classes.\nComprehensive experimental results showed that the generalization ability of\nour model can be substantially improved with multi-task adapter initialization\non source tasks and multi-task adaptation on target tasks.\n","authors":["Yinghui Xing","Dexuan Kong","Shizhou Zhang","Geng Chen","Lingyan Ran","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10685v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11233v1","updated":"2023-08-22T07:14:29Z","published":"2023-08-22T07:14:29Z","title":"Affordance segmentation of hand-occluded containers from exocentric\n images","summary":" Visual affordance segmentation identifies the surfaces of an object an agent\ncan interact with. 
Common challenges for the identification of affordances are\nthe variety of the geometry and physical properties of these surfaces as well\nas occlusions. In this paper, we focus on occlusions of an object that is\nhand-held by a person manipulating it. To address this challenge, we propose an\naffordance segmentation model that uses auxiliary branches to process the\nobject and hand regions separately. The proposed model learns affordance\nfeatures under hand-occlusion by weighting the feature map through hand and\nobject segmentation. To train the model, we annotated the visual affordances of\nan existing dataset with mixed-reality images of hand-held containers in\nthird-person (exocentric) images. Experiments on both real and mixed-reality\nimages show that our model achieves better affordance segmentation and\ngeneralisation than existing models.\n","authors":["Tommaso Apicella","Alessio Xompero","Edoardo Ragusa","Riccardo Berta","Andrea Cavallaro","Paolo Gastaldo"],"pdf_url":"https://arxiv.org/pdf/2308.11233v1.pdf","comment":"Paper accepted to Workshop on Assistive Computer Vision and Robotics\n (ACVR) in International Conference on Computer Vision (ICCV) 2023; 10 pages,\n 4 figures, 2 tables. Data, code, and trained models are available at\n https://apicis.github.io/projects/acanet.html"},{"id":"http://arxiv.org/abs/2308.06452v3","updated":"2023-08-22T07:11:04Z","published":"2023-08-12T03:13:38Z","title":"Improved YOLOv8 Detection Algorithm in Security Inspection Image","summary":" Security inspection is the first line of defense to ensure the safety of\npeople's lives and property, and intelligent security inspection is an\ninevitable trend in the future development of the security inspection industry.\nAiming at the problems of overlapping detection objects, false detection of\ncontraband, and missed detection in the process of X-ray image detection, an\nimproved X-ray contraband detection algorithm CSS-YOLO based on YOLOv8s is\nproposed.\n","authors":["Liyao Lu"],"pdf_url":"https://arxiv.org/pdf/2308.06452v3.pdf","comment":"23 pages,23 figures"},{"id":"http://arxiv.org/abs/2308.11223v1","updated":"2023-08-22T06:28:55Z","published":"2023-08-22T06:28:55Z","title":"LDP-Feat: Image Features with Local Differential Privacy","summary":" Modern computer vision services often require users to share raw feature\ndescriptors with an untrusted server. This presents an inherent privacy risk,\nas raw descriptors may be used to recover the source images from which they\nwere extracted. To address this issue, researchers recently proposed\nprivatizing image features by embedding them within an affine subspace\ncontaining the original feature as well as adversarial feature samples. In this\npaper, we propose two novel inversion attacks to show that it is possible to\n(approximately) recover the original image features from these embeddings,\nallowing us to recover privacy-critical image content. In light of such\nsuccesses and the lack of theoretical privacy guarantees afforded by existing\nvisual privacy methods, we further propose the first method to privatize image\nfeatures via local differential privacy, which, unlike prior approaches,\nprovides a guaranteed bound for privacy leakage regardless of the strength of\nthe attacks. 
In addition, our method yields strong performance in visual\nlocalization as a downstream task while enjoying the privacy guarantee.\n","authors":["Francesco Pittaluga","Bingbing Zhuang"],"pdf_url":"https://arxiv.org/pdf/2308.11223v1.pdf","comment":"11 pages, 4 figures, to be published in International Conference on\n Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2303.02608v2","updated":"2023-08-22T06:19:48Z","published":"2023-03-05T08:44:01Z","title":"Event-based Camera Simulation using Monte Carlo Path Tracing with\n Adaptive Denoising","summary":" This paper presents an algorithm to obtain an event-based video from noisy\nframes given by physics-based Monte Carlo path tracing over a synthetic 3D\nscene. Given the nature of dynamic vision sensor (DVS), rendering event-based\nvideo can be viewed as a process of detecting the changes from noisy brightness\nvalues. We extend a denoising method based on a weighted local regression (WLR)\nto detect the brightness changes rather than applying denoising to every pixel.\nSpecifically, we derive a threshold to determine the likelihood of event\noccurrence and reduce the number of times to perform the regression. Our method\nis robust to noisy video frames obtained from a few path-traced samples.\nDespite its efficiency, our method performs comparably to or even better than\nan approach that exhaustively denoises every frame.\n","authors":["Yuta Tsuji","Tatsuya Yatagawa","Hiroyuki Kubo","Shigeo Morishima"],"pdf_url":"https://arxiv.org/pdf/2303.02608v2.pdf","comment":"8 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2308.11206v1","updated":"2023-08-22T05:43:33Z","published":"2023-08-22T05:43:33Z","title":"DiffCloth: Diffusion Based Garment Synthesis and Manipulation via\n Structural Cross-modal Semantic Alignment","summary":" Cross-modal garment synthesis and manipulation will significantly benefit the\nway fashion designers generate garments and modify their designs via flexible\nlinguistic interfaces.Current approaches follow the general text-to-image\nparadigm and mine cross-modal relations via simple cross-attention modules,\nneglecting the structural correspondence between visual and textual\nrepresentations in the fashion design domain. In this work, we instead\nintroduce DiffCloth, a diffusion-based pipeline for cross-modal garment\nsynthesis and manipulation, which empowers diffusion models with flexible\ncompositionality in the fashion domain by structurally aligning the cross-modal\nsemantics. Specifically, we formulate the part-level cross-modal alignment as a\nbipartite matching problem between the linguistic Attribute-Phrases (AP) and\nthe visual garment parts which are obtained via constituency parsing and\nsemantic segmentation, respectively. To mitigate the issue of attribute\nconfusion, we further propose a semantic-bundled cross-attention to preserve\nthe spatial structure similarities between the attention maps of attribute\nadjectives and part nouns in each AP. Moreover, DiffCloth allows for\nmanipulation of the generated results by simply replacing APs in the text\nprompts. The manipulation-irrelevant regions are recognized by blended masks\nobtained from the bundled attention maps of the APs and kept unchanged.\nExtensive experiments on the CM-Fashion benchmark demonstrate that DiffCloth\nboth yields state-of-the-art garment synthesis results by leveraging the\ninherent structural information and supports flexible manipulation with region\nconsistency.\n","authors":["Xujie Zhang","Binbin Yang","Michael C. 
Kampffmeyer","Wenqing Zhang","Shiyue Zhang","Guansong Lu","Liang Lin","Hang Xu","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2308.11206v1.pdf","comment":"accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.11201v1","updated":"2023-08-22T05:36:39Z","published":"2023-08-22T05:36:39Z","title":"Masked Cross-image Encoding for Few-shot Segmentation","summary":" Few-shot segmentation (FSS) is a dense prediction task that aims to infer the\npixel-wise labels of unseen classes using only a limited number of annotated\nimages. The key challenge in FSS is to classify the labels of query pixels\nusing class prototypes learned from the few labeled support exemplars. Prior\napproaches to FSS have typically focused on learning class-wise descriptors\nindependently from support images, thereby ignoring the rich contextual\ninformation and mutual dependencies among support-query features. To address\nthis limitation, we propose a joint learning method termed Masked Cross-Image\nEncoding (MCE), which is designed to capture common visual properties that\ndescribe object details and to learn bidirectional inter-image dependencies\nthat enhance feature interaction. MCE is more than a visual representation\nenrichment module; it also considers cross-image mutual dependencies and\nimplicit guidance. Experiments on FSS benchmarks PASCAL-$5^i$ and COCO-$20^i$\ndemonstrate the advanced meta-learning ability of the proposed method.\n","authors":["Wenbo Xu","Huaxi Huang","Ming Cheng","Litao Yu","Qiang Wu","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11201v1.pdf","comment":"conference"},{"id":"http://arxiv.org/abs/2302.04308v2","updated":"2023-08-22T05:23:21Z","published":"2023-02-08T19:53:07Z","title":"Enhancing Modality-Agnostic Representations via Meta-Learning for Brain\n Tumor Segmentation","summary":" In medical vision, different imaging modalities provide complementary\ninformation. However, in practice, not all modalities may be available during\ninference or even training. Previous approaches, e.g., knowledge distillation\nor image synthesis, often assume the availability of full modalities for all\npatients during training; this is unrealistic and impractical due to the\nvariability in data collection across sites. We propose a novel approach to\nlearn enhanced modality-agnostic representations by employing a meta-learning\nstrategy in training, even when only limited full modality samples are\navailable. Meta-learning enhances partial modality representations to full\nmodality representations by meta-training on partial modality data and\nmeta-testing on limited full modality samples. Additionally, we co-supervise\nthis feature enrichment by introducing an auxiliary adversarial learning\nbranch. More specifically, a missing modality detector is used as a\ndiscriminator to mimic the full modality setting. 
Our segmentation framework\nsignificantly outperforms state-of-the-art brain tumor segmentation techniques\nin missing modality scenarios.\n","authors":["Aishik Konwer","Xiaoling Hu","Joseph Bae","Xuan Xu","Chao Chen","Prateek Prasanna"],"pdf_url":"https://arxiv.org/pdf/2302.04308v2.pdf","comment":"Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11199v1","updated":"2023-08-22T05:21:31Z","published":"2023-08-22T05:21:31Z","title":"ConcatPlexer: Additional Dim1 Batching for Faster ViTs","summary":" Transformers have demonstrated tremendous success not only in the natural\nlanguage processing (NLP) domain but also the field of computer vision,\nigniting various creative approaches and applications. Yet, the superior\nperformance and modeling flexibility of transformers came with a severe\nincrease in computation costs, and hence several works have proposed methods to\nreduce this burden. Inspired by a cost-cutting method originally proposed for\nlanguage models, Data Multiplexing (DataMUX), we propose a novel approach for\nefficient visual recognition that employs additional dim1 batching (i.e.,\nconcatenation) that greatly improves the throughput with little compromise in\nthe accuracy. We first introduce a naive adaptation of DataMux for vision\nmodels, Image Multiplexer, and devise novel components to overcome its\nweaknesses, rendering our final model, ConcatPlexer, at the sweet spot between\ninference speed and accuracy. The ConcatPlexer was trained on ImageNet1K and\nCIFAR100 dataset and it achieved 23.5% less GFLOPs than ViT-B/16 with 69.5% and\n83.4% validation accuracy, respectively.\n","authors":["Donghoon Han","Seunghyeon Seo","Donghyeon Jeon","Jiho Jang","Chaerin Kong","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.11199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11198v1","updated":"2023-08-22T05:17:41Z","published":"2023-08-22T05:17:41Z","title":"Novel-view Synthesis and Pose Estimation for Hand-Object Interaction\n from Sparse Views","summary":" Hand-object interaction understanding and the barely addressed novel view\nsynthesis are highly desired in the immersive communication, whereas it is\nchallenging due to the high deformation of hand and heavy occlusions between\nhand and object. In this paper, we propose a neural rendering and pose\nestimation system for hand-object interaction from sparse views, which can also\nenable 3D hand-object interaction editing. We share the inspiration from recent\nscene understanding work that shows a scene specific model built beforehand can\nsignificantly improve and unblock vision tasks especially when inputs are\nsparse, and extend it to the dynamic hand-object interaction scenario and\npropose to solve the problem in two stages. We first learn the shape and\nappearance prior knowledge of hands and objects separately with the neural\nrepresentation at the offline stage. During the online stage, we design a\nrendering-based joint model fitting framework to understand the dynamic\nhand-object interaction with the pre-built hand and object models as well as\ninteraction priors, which thereby overcomes penetration and separation issues\nbetween hand and object and also enables novel view synthesis. In order to get\nstable contact during the hand-object interaction process in a sequence, we\npropose a stable contact loss to make the contact region to be consistent.\nExperiments demonstrate that our method outperforms the state-of-the-art\nmethods. 
Code and dataset are available in project webpage\nhttps://iscas3dv.github.io/HO-NeRF.\n","authors":["Wentian Qu","Zhaopeng Cui","Yinda Zhang","Chenyu Meng","Cuixia Ma","Xiaoming Deng","Hongan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11194v1","updated":"2023-08-22T05:03:09Z","published":"2023-08-22T05:03:09Z","title":"ViLLA: Fine-Grained Vision-Language Representation Learning from\n Real-World Data","summary":" Vision-language models (VLMs), such as CLIP and ALIGN, are generally trained\non datasets consisting of image-caption pairs obtained from the web. However,\nreal-world multimodal datasets, such as healthcare data, are significantly more\ncomplex: each image (e.g. X-ray) is often paired with text (e.g. physician\nreport) that describes many distinct attributes occurring in fine-grained\nregions of the image. We refer to these samples as exhibiting high pairwise\ncomplexity, since each image-text pair can be decomposed into a large number of\nregion-attribute pairings. The extent to which VLMs can capture fine-grained\nrelationships between image regions and textual attributes when trained on such\ndata has not been previously evaluated. The first key contribution of this work\nis to demonstrate through systematic evaluations that as the pairwise\ncomplexity of the training dataset increases, standard VLMs struggle to learn\nregion-attribute relationships, exhibiting performance degradations of up to\n37% on retrieval tasks. In order to address this issue, we introduce ViLLA as\nour second key contribution. ViLLA, which is trained to capture fine-grained\nregion-attribute relationships from complex datasets, involves two components:\n(a) a lightweight, self-supervised mapping model to decompose image-text\nsamples into region-attribute pairs, and (b) a contrastive VLM to learn\nrepresentations from generated region-attribute pairs. We demonstrate with\nexperiments across four domains (synthetic, product, medical, and natural\nimages) that ViLLA outperforms comparable VLMs on fine-grained reasoning tasks,\nsuch as zero-shot object detection (up to 3.6 AP50 points on COCO and 0.6 mAP\npoints on LVIS) and retrieval (up to 14.2 R-Precision points).\n","authors":["Maya Varma","Jean-Benoit Delbrouck","Sarah Hooper","Akshay Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2308.11194v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10299v2","updated":"2023-08-22T04:25:44Z","published":"2023-08-20T15:38:40Z","title":"Boosting Adversarial Transferability by Block Shuffle and Rotation","summary":" Adversarial examples mislead deep neural networks with imperceptible\nperturbations and have brought significant threats to deep learning. An\nimportant aspect is their transferability, which refers to their ability to\ndeceive other models, thus enabling attacks in the black-box setting. Though\nvarious methods have been proposed to boost transferability, the performance\nstill falls short compared with white-box attacks. In this work, we observe\nthat existing input transformation based attacks, one of the mainstream\ntransfer-based attacks, result in different attention heatmaps on various\nmodels, which might limit the transferability. We also find that breaking the\nintrinsic relation of the image can disrupt the attention heatmap of the\noriginal image. Based on this finding, we propose a novel input transformation\nbased attack called block shuffle and rotation (BSR). 
Specifically, BSR splits\nthe input image into several blocks, then randomly shuffles and rotates these\nblocks to construct a set of new images for gradient calculation. Empirical\nevaluations on the ImageNet dataset demonstrate that BSR could achieve\nsignificantly better transferability than the existing input transformation\nbased methods under single-model and ensemble-model settings. Combining BSR\nwith the current input transformation method can further improve the\ntransferability, which significantly outperforms the state-of-the-art methods.\n","authors":["Kunyu Wang","Xuanran He","Wenxuan Wang","Xiaosen Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10299v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11186v1","updated":"2023-08-22T04:24:45Z","published":"2023-08-22T04:24:45Z","title":"Knowledge-Aware Prompt Tuning for Generalizable Vision-Language Models","summary":" Pre-trained vision-language models, e.g., CLIP, working with manually\ndesigned prompts have demonstrated great capacity of transfer learning.\nRecently, learnable prompts achieve state-of-the-art performance, which however\nare prone to overfit to seen classes, failing to generalize to unseen classes.\nIn this paper, we propose a Knowledge-Aware Prompt Tuning (KAPT) framework for\nvision-language models. Our approach takes inspiration from human intelligence\nin which external knowledge is usually incorporated into recognizing novel\ncategories of objects. Specifically, we design two complementary types of\nknowledge-aware prompts for the text encoder to leverage the distinctive\ncharacteristics of category-related external knowledge. The discrete prompt\nextracts the key information from descriptions of an object category, and the\nlearned continuous prompt captures overall contexts. We further design an\nadaptation head for the visual encoder to aggregate salient attentive visual\ncues, which establishes discriminative and task-aware visual representations.\nWe conduct extensive experiments on 11 widely-used benchmark datasets and the\nresults verify the effectiveness in few-shot image classification, especially\nin generalizing to unseen categories. Compared with the state-of-the-art CoCoOp\nmethod, KAPT exhibits favorable performance and achieves an absolute gain of\n3.22% on new classes and 2.57% in terms of harmonic mean.\n","authors":["Baoshuo Kan","Teng Wang","Wenpeng Lu","Xiantong Zhen","Weili Guan","Feng Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.11186v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11185v1","updated":"2023-08-22T04:23:59Z","published":"2023-08-22T04:23:59Z","title":"MEGA: Multimodal Alignment Aggregation and Distillation For Cinematic\n Video Segmentation","summary":" Previous research has studied the task of segmenting cinematic videos into\nscenes and into narrative acts. However, these studies have overlooked the\nessential task of multimodal alignment and fusion for effectively and\nefficiently processing long-form videos (>60min). In this paper, we introduce\nMultimodal alignmEnt aGgregation and distillAtion (MEGA) for cinematic\nlong-video segmentation. MEGA tackles the challenge by leveraging multiple\nmedia modalities. The method coarsely aligns inputs of variable lengths and\ndifferent modalities with alignment positional encoding. To maintain temporal\nsynchronization while reducing computation, we further introduce an enhanced\nbottleneck fusion layer which uses temporal alignment. 
Additionally, MEGA\nemploys a novel contrastive loss to synchronize and transfer labels across\nmodalities, enabling act segmentation from labeled synopsis sentences on video\nshots. Our experimental results show that MEGA outperforms state-of-the-art\nmethods on MovieNet dataset for scene segmentation (with an Average Precision\nimprovement of +1.19%) and on TRIPOD dataset for act segmentation (with a Total\nAgreement improvement of +5.51%)\n","authors":["Najmeh Sadoughi","Xinyu Li","Avijit Vajpayee","David Fan","Bing Shuai","Hector Santos-Villalobos","Vimal Bhat","Rohith MV"],"pdf_url":"https://arxiv.org/pdf/2308.11185v1.pdf","comment":"ICCV 2023 accepted"},{"id":"http://arxiv.org/abs/2308.11184v1","updated":"2023-08-22T04:20:18Z","published":"2023-08-22T04:20:18Z","title":"ReFit: Recurrent Fitting Network for 3D Human Recovery","summary":" We present Recurrent Fitting (ReFit), a neural network architecture for\nsingle-image, parametric 3D human reconstruction. ReFit learns a\nfeedback-update loop that mirrors the strategy of solving an inverse problem\nthrough optimization. At each iterative step, it reprojects keypoints from the\nhuman model to feature maps to query feedback, and uses a recurrent-based\nupdater to adjust the model to fit the image better. Because ReFit encodes\nstrong knowledge of the inverse problem, it is faster to train than previous\nregression models. At the same time, ReFit improves state-of-the-art\nperformance on standard benchmarks. Moreover, ReFit applies to other\noptimization settings, such as multi-view fitting and single-view shape\nfitting. Project website: https://yufu-wang.github.io/refit_humans/\n","authors":["Yufu Wang","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2308.11184v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11179v1","updated":"2023-08-22T04:10:14Z","published":"2023-08-22T04:10:14Z","title":"A three in one bottom-up framework for simultaneous semantic\n segmentation, instance segmentation and classification of multi-organ nuclei\n in digital cancer histology","summary":" Simultaneous segmentation and classification of nuclei in digital histology\nplay an essential role in computer-assisted cancer diagnosis; however, it\nremains challenging. The highest achieved binary and multi-class Panoptic\nQuality (PQ) remains as low as 0.68 bPQ and 0.49 mPQ, respectively. It is due\nto the higher staining variability, variability across the tissue, rough\nclinical conditions, overlapping nuclei, and nuclear class imbalance. The\ngeneric deep-learning methods usually rely on end-to-end models, which fail to\naddress these problems associated explicitly with digital histology. In our\nprevious work, DAN-NucNet, we resolved these issues for semantic segmentation\nwith an end-to-end model. This work extends our previous model to simultaneous\ninstance segmentation and classification. We introduce additional decoder heads\nwith independent weighted losses, which produce semantic segmentation, edge\nproposals, and classification maps. We use the outputs from the three-head\nmodel to apply post-processing to produce the final segmentation and\nclassification. Our multi-stage approach utilizes edge proposals and semantic\nsegmentations compared to direct segmentation and classification strategies\nfollowed by most state-of-the-art methods. Due to this, we demonstrate a\nsignificant performance improvement in producing high-quality instance\nsegmentation and nuclei classification. 
We have achieved a 0.841 Dice score for\nsemantic segmentation, 0.713 bPQ scores for instance segmentation, and 0.633\nmPQ for nuclei classification. Our proposed framework is generalized across 19\ntypes of tissues. Furthermore, the framework is less complex compared to the\nstate-of-the-art.\n","authors":["Ibtihaj Ahmad","Syed Muhammad Israr","Zain Ul Islam"],"pdf_url":"https://arxiv.org/pdf/2308.11179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10522v2","updated":"2023-08-22T04:03:19Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v2.pdf","comment":"This paper is accepted by the jourcal of Neural Networks (Elsevier)\n by 2023. A revised manuscript of arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2308.11171v1","updated":"2023-08-22T04:01:01Z","published":"2023-08-22T04:01:01Z","title":"ViCo: Engaging Video Comment Generation with Human Preference Rewards","summary":" Engaging video comments play an important role in video social media, as they\nare the carrier of feelings, thoughts, or humor of the audience. Preliminary\nworks have made initial exploration for video comment generation by adopting\ncaption-style encoder-decoder models. However, comment generation presents some\nunique challenges distinct from caption generation, which makes these methods\nsomewhat less effective at generating engaging comments. In contrast to the\nobjective and descriptive nature of captions, comments tend to be inherently\nsubjective, making it hard to quantify and evaluate the engagement of comments.\nFurthermore, the scarcity of truly engaging comments brings difficulty to\ncollecting enough high-quality training examples. In this paper, we propose\nViCo with three novel designs to tackle the above challenges for generating\nengaging Video Comments. 
Firstly, to quantify the engagement of comments, we\nutilize the number of \"likes\" each comment receives as a proxy of human\npreference after an appropriate debiasing procedure. Secondly, to automatically\nevaluate the engagement of comments, we train a reward model to align its\njudgment to the above proxy. Our user studies indicate that this reward model\neffectively aligns with human judgments. Lastly, to alleviate the scarcity of\nhigh-quality comments, an initial generator is trained on readily available but\nnoisy data to generate comments. Then the reward model is employed to offer\nfeedback on the generated comments, thus optimizing the initial generator. To\nfacilitate the research of video commenting, we collect a large video\ncomment-dataset (ViCo-20k) with rich metadata from a popular video website.\nExperiments on ViCo-20k show that the comments generated by our ViCo model\nexhibit the best performance in terms of both quantitative and qualitative\nresults, particularly when engagement is considered.\n","authors":["Yuchong Sun","Bei Liu","Xu Chen","Ruihua Song","Jianlong Fu"],"pdf_url":"https://arxiv.org/pdf/2308.11171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.02344v2","updated":"2023-08-22T03:55:42Z","published":"2021-09-06T10:32:24Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning captures comprehensive information from\nmultiple views of a shared context. Recent works intuitively apply contrastive\nlearning (CL) to learn representations, regarded as a pairwise manner, which is\nstill scalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; and evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works research the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the information theoretical perspective and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided heuristic Progressive Multi-view\nCoding (IPMC). In the distribution-tier, IPMC aligns the distribution between\nviews to reduce view-specific noise. In the set-tier, IPMC builds self-adjusted\npools for contrasting, which utilizes a view filter to adaptively modify the\npools. 
Lastly, in the instance-tier, we adopt a designed unified loss to learn\ndiscriminative representations and reduce the gradient interference.\nTheoretically and empirically, we demonstrate the superiority of IPMC over\nstate-of-the-art methods.\n","authors":["Jiangmeng Li","Wenwen Qiang","Hang Gao","Bing Su","Farid Razzak","Jie Hu","Changwen Zheng","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2109.02344v2.pdf","comment":"We have uploaded a new version of this paper in arXiv:2308.10522, so\n that we have to withdrawal this paper"},{"id":"http://arxiv.org/abs/2308.11166v1","updated":"2023-08-22T03:52:05Z","published":"2023-08-22T03:52:05Z","title":"Hierarchical Point-based Active Learning for Semi-supervised Point Cloud\n Semantic Segmentation","summary":" Impressive performance on point cloud semantic segmentation has been achieved\nby fully-supervised methods with large amounts of labelled data. As it is\nlabour-intensive to acquire large-scale point cloud data with point-wise\nlabels, many attempts have been made to explore learning 3D point cloud\nsegmentation with limited annotations. Active learning is one of the effective\nstrategies to achieve this purpose but is still under-explored. The most recent\nmethods of this kind measure the uncertainty of each pre-divided region for\nmanual labelling but they suffer from redundant information and require\nadditional efforts for region division. This paper aims at addressing this\nissue by developing a hierarchical point-based active learning strategy.\nSpecifically, we measure the uncertainty for each point by a hierarchical\nminimum margin uncertainty module which considers the contextual information at\nmultiple levels. Then, a feature-distance suppression strategy is designed to\nselect important and representative points for manual labelling. Besides, to\nbetter exploit the unlabelled data, we build a semi-supervised segmentation\nframework based on our active strategy. Extensive experiments on the S3DIS and\nScanNetV2 datasets demonstrate that the proposed framework achieves 96.5% and\n100% performance of fully-supervised baseline with only 0.07% and 0.1% training\ndata, respectively, outperforming the state-of-the-art weakly-supervised and\nactive learning methods. The code will be available at\nhttps://github.com/SmiletoE/HPAL.\n","authors":["Zongyi Xu","Bo Yuan","Shanshan Zhao","Qianni Zhang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2308.11166v1.pdf","comment":"International Conference on Computer Vision (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.11165v1","updated":"2023-08-22T03:46:24Z","published":"2023-08-22T03:46:24Z","title":"Improving Misaligned Multi-modality Image Fusion with One-stage\n Progressive Dense Registration","summary":" Misalignments between multi-modality images pose challenges in image fusion,\nmanifesting as structural distortions and edge ghosts. Existing efforts\ncommonly resort to registering first and fusing later, typically employing two\ncascaded stages for registration,i.e., coarse registration and fine\nregistration. Both stages directly estimate the respective target deformation\nfields. In this paper, we argue that the separated two-stage registration is\nnot compact, and the direct estimation of the target deformation fields is not\naccurate enough. 
To address these challenges, we propose a Cross-modality\nMulti-scale Progressive Dense Registration (C-MPDR) scheme, which accomplishes\nthe coarse-to-fine registration exclusively using a one-stage optimization,\nthus improving the fusion performance of misaligned multi-modality images.\nSpecifically, two pivotal components are involved, a dense Deformation Field\nFusion (DFF) module and a Progressive Feature Fine (PFF) module. The DFF\naggregates the predicted multi-scale deformation sub-fields at the current\nscale, while the PFF progressively refines the remaining misaligned features.\nBoth work together to accurately estimate the final deformation fields. In\naddition, we develop a Transformer-Conv-based Fusion (TCF) subnetwork that\nconsiders local and long-range feature dependencies, allowing us to capture\nmore informative features from the registered infrared and visible images for\nthe generation of high-quality fused images. Extensive experimental analysis\ndemonstrates the superiority of the proposed method in the fusion of misaligned\ncross-modality images.\n","authors":["Di Wang","Jinyuan Liu","Long Ma","Risheng Liu","Xin Fan"],"pdf_url":"https://arxiv.org/pdf/2308.11165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11164v1","updated":"2023-08-22T03:45:13Z","published":"2023-08-22T03:45:13Z","title":"Decoupled Contrastive Multi-view Clustering with High-order Random Walks","summary":" In recent, some robust contrastive multi-view clustering (MvC) methods have\nbeen proposed, which construct data pairs from neighborhoods to alleviate the\nfalse negative issue, i.e., some intra-cluster samples are wrongly treated as\nnegative pairs. Although promising performance has been achieved by these\nmethods, the false negative issue is still far from addressed and the false\npositive issue emerges because all in- and out-of-neighborhood samples are\nsimply treated as positive and negative, respectively. To address the issues,\nwe propose a novel robust method, dubbed decoupled contrastive multi-view\nclustering with high-order random walks (DIVIDE). In brief, DIVIDE leverages\nrandom walks to progressively identify data pairs in a global instead of local\nmanner. As a result, DIVIDE could identify in-neighborhood negatives and\nout-of-neighborhood positives. Moreover, DIVIDE embraces a novel MvC\narchitecture to perform inter- and intra-view contrastive learning in different\nembedding spaces, thus boosting clustering performance and embracing the\nrobustness against missing views. To verify the efficacy of DIVIDE, we carry\nout extensive experiments on four benchmark datasets comparing with nine\nstate-of-the-art MvC methods in both complete and incomplete MvC settings.\n","authors":["Yiding Lu","Yijie Lin","Mouxing Yang","Dezhong Peng","Peng Hu","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2308.11164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11162v1","updated":"2023-08-22T03:40:46Z","published":"2023-08-22T03:40:46Z","title":"A Preliminary Investigation into Search and Matching for Tumour\n Discrimination in WHO Breast Taxonomy Using Deep Networks","summary":" Breast cancer is one of the most common cancers affecting women worldwide.\nThey include a group of malignant neoplasms with a variety of biological,\nclinical, and histopathological characteristics. There are more than 35\ndifferent histological forms of breast lesions that can be classified and\ndiagnosed histologically according to cell morphology, growth, and architecture\npatterns. 
Recently, deep learning, in the field of artificial intelligence, has\ndrawn a lot of attention for the computerized representation of medical images.\nSearchable digital atlases can provide pathologists with patch matching tools\nallowing them to search among evidently diagnosed and treated archival cases, a\ntechnology that may be regarded as computational second opinion. In this study,\nwe indexed and analyzed the WHO breast taxonomy (Classification of Tumours 5th\nEd.) spanning 35 tumour types. We visualized all tumour types using deep\nfeatures extracted from a state-of-the-art deep learning model, pre-trained on\nmillions of diagnostic histopathology images from the TCGA repository.\nFurthermore, we test the concept of a digital \"atlas\" as a reference for search\nand matching with rare test cases. The patch similarity search within the WHO\nbreast taxonomy data reached over 88% accuracy when validating through\n\"majority vote\" and more than 91% accuracy when validating using top-n tumour\ntypes. These results show for the first time that complex relationships among\ncommon and rare breast lesions can be investigated using an indexed digital\narchive.\n","authors":["Abubakr Shafique","Ricardo Gonzalez","Liron Pantanowitz","Puay Hoon Tan","Alberto Machado","Ian A Cree","Hamid R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2308.11162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11159v1","updated":"2023-08-22T03:31:52Z","published":"2023-08-22T03:31:52Z","title":"SwinV2DNet: Pyramid and Self-Supervision Compounded Feature Learning for\n Remote Sensing Images Change Detection","summary":" Among the current mainstream change detection networks, transformer is\ndeficient in the ability to capture accurate low-level details, while\nconvolutional neural network (CNN) is wanting in the capacity to understand\nglobal information and establish remote spatial relationships. Meanwhile, both\nof the widely used early fusion and late fusion frameworks are not able to well\nlearn complete change features. Therefore, based on swin transformer V2 (Swin\nV2) and VGG16, we propose an end-to-end compounded dense network SwinV2DNet to\ninherit the advantages of both transformer and CNN and overcome the\nshortcomings of existing networks in feature learning. Firstly, it captures the\nchange relationship features through the densely connected Swin V2 backbone,\nand provides the low-level pre-changed and post-changed features through a CNN\nbranch. Based on these three change features, we accomplish accurate change\ndetection results. Secondly, combined with transformer and CNN, we propose\nmixed feature pyramid (MFP) which provides inter-layer interaction information\nand intra-layer multi-scale information for complete feature learning. MFP is a\nplug and play module which is experimentally proven to be also effective in\nother change detection networks. Further more, we impose a self-supervision\nstrategy to guide a new CNN branch, which solves the untrainable problem of the\nCNN branch and provides the semantic change information for the features of\nencoder. The state-of-the-art (SOTA) change detection scores and fine-grained\nchange maps were obtained compared with other advanced methods on four commonly\nused public remote sensing datasets. 
The code is available at\nhttps://github.com/DalongZ/SwinV2DNet.\n","authors":["Dalong Zheng","Zebin Wu","Jia Liu","Zhihui Wei"],"pdf_url":"https://arxiv.org/pdf/2308.11159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11158v1","updated":"2023-08-22T03:31:40Z","published":"2023-08-22T03:31:40Z","title":"Domain Generalization via Rationale Invariance","summary":" This paper offers a new perspective to ease the challenge of domain\ngeneralization, which involves maintaining robust results even in unseen\nenvironments. Our design focuses on the decision-making process in the final\nclassifier layer. Specifically, we propose treating the element-wise\ncontributions to the final results as the rationale for making a decision and\nrepresenting the rationale for each sample as a matrix. For a well-generalized\nmodel, we suggest the rationale matrices for samples belonging to the same\ncategory should be similar, indicating the model relies on domain-invariant\nclues to make decisions, thereby ensuring robust results. To implement this\nidea, we introduce a rationale invariance loss as a simple regularization\ntechnique, requiring only a few lines of code. Our experiments demonstrate that\nthe proposed approach achieves competitive results across various datasets,\ndespite its simplicity. Code is available at\n\\url{https://github.com/liangchen527/RIDG}.\n","authors":["Liang Chen","Yong Zhang","Yibing Song","Anton van den Hengel","Lingqiao Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11158v1.pdf","comment":"Accepted in ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11157v1","updated":"2023-08-22T03:30:22Z","published":"2023-08-22T03:30:22Z","title":"TOPIC: A Parallel Association Paradigm for Multi-Object Tracking under\n Complex Motions and Diverse Scenes","summary":" Video data and algorithms have been driving advances in multi-object tracking\n(MOT). While existing MOT datasets focus on occlusion and appearance\nsimilarity, complex motion patterns are widespread yet overlooked. To address\nthis issue, we introduce a new dataset called BEE23 to highlight complex\nmotions. Identity association algorithms have long been the focus of MOT\nresearch. Existing trackers can be categorized into two association paradigms:\nsingle-feature paradigm (based on either motion or appearance feature) and\nserial paradigm (one feature serves as secondary while the other is primary).\nHowever, these paradigms are incapable of fully utilizing different features.\nIn this paper, we propose a parallel paradigm and present the Two rOund\nParallel matchIng meChanism (TOPIC) to implement it. The TOPIC leverages both\nmotion and appearance features and can adaptively select the preferable one as\nthe assignment metric based on motion level. Moreover, we provide an\nAttention-based Appearance Reconstruct Module (AARM) to reconstruct appearance\nfeature embeddings, thus enhancing the representation of appearance features.\nComprehensive experiments show that our approach achieves state-of-the-art\nperformance on four public datasets and BEE23. Notably, our proposed parallel\nparadigm surpasses the performance of existing association paradigms by a large\nmargin, e.g., reducing false negatives by 12% to 51% compared to the\nsingle-feature association paradigm. 
The introduced dataset and association\nparadigm in this work offers a fresh perspective for advancing the MOT field.\nThe source code and dataset are available at\nhttps://github.com/holmescao/TOPICTrack.\n","authors":["Xiaoyan Cao","Yiyao Zheng","Yao Yao","Huapeng Qin","Xiaoyu Cao","Shihui Guo"],"pdf_url":"https://arxiv.org/pdf/2308.11157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10608v2","updated":"2023-08-22T03:23:35Z","published":"2023-08-21T10:16:52Z","title":"FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly","summary":" While text-3D editing has made significant strides in leveraging score\ndistillation sampling, emerging approaches still fall short in delivering\nseparable, precise and consistent outcomes that are vital to content creation.\nIn response, we introduce FocalDreamer, a framework that merges base shape with\neditable parts according to text prompts for fine-grained editing within\ndesired regions. Specifically, equipped with geometry union and dual-path\nrendering, FocalDreamer assembles independent 3D parts into a complete object,\ntailored for convenient instance reuse and part-wise control. We propose\ngeometric focal loss and style consistency regularization, which encourage\nfocal fusion and congruent overall appearance. Furthermore, FocalDreamer\ngenerates high-fidelity geometry and PBR textures which are compatible with\nwidely-used graphics engines. Extensive experiments have highlighted the\nsuperior editing capabilities of FocalDreamer in both quantitative and\nqualitative evaluations.\n","authors":["Yuhan Li","Yishun Dou","Yue Shi","Yu Lei","Xuanhong Chen","Yi Zhang","Peng Zhou","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2308.10608v2.pdf","comment":"Project website: https://focaldreamer.github.io"},{"id":"http://arxiv.org/abs/2308.10755v2","updated":"2023-08-22T02:57:45Z","published":"2023-08-21T14:40:48Z","title":"WanJuan: A Comprehensive Multimodal Dataset for Advancing English and\n Chinese Large Models","summary":" The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the\ndevelopment of large models, leading to the creation of numerous impressive\nlarge language models(LLMs) and multimodal large language models (MLLMs). These\ncutting-edge models owe their remarkable performance to high-quality data.\nHowever, the details of the training data used in leading paradigms are often\nkept confidential. This lack of transparency, coupled with the scarcity of\nopen-source data, impedes further developments within the community. As a\nresponse, this paper presents \"Wan Juan\", a large-scale multimodal dataset\ncomposed of both Chinese and English data, collected from a wide range of web\nsources. The dataset incorporates text, image-text, and video modalities, with\na total volume exceeding 2TB. It was utilized in the training of InternLM, a\nmodel that demonstrated significant advantages in multi-dimensional evaluations\nwhen compared to models of a similar scale. 
All data can be accessed at\nhttps://opendatalab.org.cn/WanJuan1.0.\n","authors":["Conghui He","Zhenjiang Jin","Chao Xu","Jiantao Qiu","Bin Wang","Wei Li","Hang Yan","Jiaqi Wang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.10755v2.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.10195v2","updated":"2023-08-22T02:55:39Z","published":"2023-08-20T07:56:34Z","title":"WMFormer++: Nested Transformer for Visible Watermark Removal via Implict\n Joint Learning","summary":" Watermarking serves as a widely adopted approach to safeguard media\ncopyright. In parallel, the research focus has extended to watermark removal\ntechniques, offering an adversarial means to enhance watermark robustness and\nfoster advancements in the watermarking field. Existing watermark removal\nmethods mainly rely on UNet with task-specific decoder branches--one for\nwatermark localization and the other for background image restoration. However,\nwatermark localization and background restoration are not isolated tasks;\nprecise watermark localization inherently implies regions necessitating\nrestoration, and the background restoration process contributes to more\naccurate watermark localization. To holistically integrate information from\nboth branches, we introduce an implicit joint learning paradigm. This empowers\nthe network to autonomously navigate the flow of information between implicit\nbranches through a gate mechanism. Furthermore, we employ cross-channel\nattention to facilitate local detail restoration and holistic structural\ncomprehension, while harnessing nested structures to integrate multi-scale\ninformation. Extensive experiments are conducted on various challenging\nbenchmarks to validate the effectiveness of our proposed method. The results\ndemonstrate our approach's remarkable superiority, surpassing existing\nstate-of-the-art methods by a large margin.\n","authors":["Dongjian Huo","Zehong Zhang","Hanjing Su","Guanbin Li","Chaowei Fang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11144v1","updated":"2023-08-22T02:54:42Z","published":"2023-08-22T02:54:42Z","title":"Exploring Unsupervised Cell Recognition with Prior Self-activation Maps","summary":" The success of supervised deep learning models on cell recognition tasks\nrelies on detailed annotations. Many previous works have managed to reduce the\ndependency on labels. However, considering the large number of cells contained\nin a patch, costly and inefficient labeling is still inevitable. To this end,\nwe explored label-free methods for cell recognition. Prior self-activation maps\n(PSM) are proposed to generate pseudo masks as training targets. To be\nspecific, an activation network is trained with self-supervised learning. The\ngradient information in the shallow layers of the network is aggregated to\ngenerate prior self-activation maps. Afterward, a semantic clustering module is\nthen introduced as a pipeline to transform PSMs to pixel-level semantic pseudo\nmasks for downstream tasks. We evaluated our method on two histological\ndatasets: MoNuSeg (cell segmentation) and BCData (multi-class cell detection).\nCompared with other fully-supervised and weakly-supervised methods, our method\ncan achieve competitive performance without any manual annotations. Our simple\nbut effective framework can also achieve multi-class cell detection which can\nnot be done by existing unsupervised methods. 
The results show the potential of\nPSMs that might inspire other research to deal with the hunger for labels in\nmedical area.\n","authors":["Pingyi Chen","Chenglu Zhu","Zhongyi Shui","Jiatong Cai","Sunyi Zheng","Shichuan Zhang","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11144v1.pdf","comment":"MICCAI 2023. arXiv admin note: substantial text overlap with\n arXiv:2210.07862"},{"id":"http://arxiv.org/abs/2308.11140v1","updated":"2023-08-22T02:44:03Z","published":"2023-08-22T02:44:03Z","title":"High Dynamic Range Imaging of Dynamic Scenes with Saturation\n Compensation but without Explicit Motion Compensation","summary":" High dynamic range (HDR) imaging is a highly challenging task since a large\namount of information is lost due to the limitations of camera sensors. For HDR\nimaging, some methods capture multiple low dynamic range (LDR) images with\naltering exposures to aggregate more information. However, these approaches\nintroduce ghosting artifacts when significant inter-frame motions are present.\nMoreover, although multi-exposure images are given, we have little information\nin severely over-exposed areas. Most existing methods focus on motion\ncompensation, i.e., alignment of multiple LDR shots to reduce the ghosting\nartifacts, but they still produce unsatisfying results. These methods also\nrather overlook the need to restore the saturated areas. In this paper, we\ngenerate well-aligned multi-exposure features by reformulating a motion\nalignment problem into a simple brightness adjustment problem. In addition, we\npropose a coarse-to-fine merging strategy with explicit saturation\ncompensation. The saturated areas are reconstructed with similar well-exposed\ncontent using adaptive contextual attention. We demonstrate that our method\noutperforms the state-of-the-art methods regarding qualitative and quantitative\nevaluations.\n","authors":["Haesoo Chung","Nam Ik Cho"],"pdf_url":"https://arxiv.org/pdf/2308.11140v1.pdf","comment":"WACV 2022"},{"id":"http://arxiv.org/abs/2308.09878v2","updated":"2023-08-22T02:32:01Z","published":"2023-08-19T02:11:49Z","title":"DatasetEquity: Are All Samples Created Equal? In The Quest For Equity\n Within Datasets","summary":" Data imbalance is a well-known issue in the field of machine learning,\nattributable to the cost of data collection, the difficulty of labeling, and\nthe geographical distribution of the data. In computer vision, bias in data\ndistribution caused by image appearance remains highly unexplored. Compared to\ncategorical distributions using class labels, image appearance reveals complex\nrelationships between objects beyond what class labels provide. Clustering deep\nperceptual features extracted from raw pixels gives a richer representation of\nthe data. This paper presents a novel method for addressing data imbalance in\nmachine learning. The method computes sample likelihoods based on image\nappearance using deep perceptual embeddings and clustering. It then uses these\nlikelihoods to weigh samples differently during training with a proposed\n$\\textbf{Generalized Focal Loss}$ function. This loss can be easily integrated\nwith deep learning algorithms. Experiments validate the method's effectiveness\nacross autonomous driving vision datasets including KITTI and nuScenes. The\nloss function improves state-of-the-art 3D object detection methods, achieving\nover $200\\%$ AP gains on under-represented classes (Cyclist) in the KITTI\ndataset. 
The results demonstrate the method is generalizable, complements\nexisting techniques, and is particularly beneficial for smaller datasets and\nrare classes. Code is available at:\nhttps://github.com/towardsautonomy/DatasetEquity\n","authors":["Shubham Shrivastava","Xianling Zhang","Sushruth Nagesh","Armin Parchami"],"pdf_url":"https://arxiv.org/pdf/2308.09878v2.pdf","comment":"ICCV 2023 Workshop"},{"id":"http://arxiv.org/abs/2308.10647v2","updated":"2023-08-22T02:32:01Z","published":"2023-08-21T11:35:28Z","title":"bbOCR: An Open-source Multi-domain OCR Pipeline for Bengali Documents","summary":" Despite the existence of numerous Optical Character Recognition (OCR) tools,\nthe lack of comprehensive open-source systems hampers the progress of document\ndigitization in various low-resource languages, including Bengali. Low-resource\nlanguages, especially those with an alphasyllabary writing system, suffer from\nthe lack of large-scale datasets for various document OCR components such as\nword-level OCR, document layout extraction, and distortion correction; which\nare available as individual modules in high-resource languages. In this paper,\nwe introduce Bengali$.$AI-BRACU-OCR (bbOCR): an open-source scalable document\nOCR system that can reconstruct Bengali documents into a structured searchable\ndigitized format that leverages a novel Bengali text recognition model and two\nnovel synthetic datasets. We present extensive component-level and system-level\nevaluation: both use a novel diversified evaluation dataset and comprehensive\nevaluation metrics. Our extensive evaluation suggests that our proposed\nsolution is preferable over the current state-of-the-art Bengali OCR systems.\nThe source codes and datasets are available here:\nhttps://bengaliai.github.io/bbocr.\n","authors":["Imam Mohammad Zulkarnain","Shayekh Bin Islam","Md. Zami Al Zunaed Farabe","Md. Mehedi Hasan Shawon","Jawaril Munshad Abedin","Beig Rajibul Hasan","Marsia Haque","Istiak Shihab","Syed Mobassir","MD. Nazmuddoha Ansary","Asif Sushmit","Farig Sadeque"],"pdf_url":"https://arxiv.org/pdf/2308.10647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11130v1","updated":"2023-08-22T02:23:28Z","published":"2023-08-22T02:23:28Z","title":"Efficient View Synthesis with Neural Radiance Distribution Field","summary":" Recent work on Neural Radiance Fields (NeRF) has demonstrated significant\nadvances in high-quality view synthesis. A major limitation of NeRF is its low\nrendering efficiency due to the need for multiple network forwardings to render\na single pixel. Existing methods to improve NeRF either reduce the number of\nrequired samples or optimize the implementation to accelerate the network\nforwarding. Despite these efforts, the problem of multiple sampling persists\ndue to the intrinsic representation of radiance fields. In contrast, Neural\nLight Fields (NeLF) reduce the computation cost of NeRF by querying only one\nsingle network forwarding per pixel. To achieve a close visual quality to NeRF,\nexisting NeLF methods require significantly larger network capacities which\nlimits their rendering efficiency in practice. In this work, we propose a new\nrepresentation called Neural Radiance Distribution Field (NeRDF) that targets\nefficient view synthesis in real-time. Specifically, we use a small network\nsimilar to NeRF while preserving the rendering speed with a single network\nforwarding per pixel as in NeLF. 
The key is to model the radiance distribution\nalong each ray with frequency basis and predict frequency weights using the\nnetwork. Pixel values are then computed via volume rendering on radiance\ndistributions. Experiments show that our proposed method offers a better\ntrade-off among speed, quality, and network size than existing methods: we\nachieve a ~254x speed-up over NeRF with similar network size, with only a\nmarginal performance decline. Our project page is at\nyushuang-wu.github.io/NeRDF.\n","authors":["Yushuang Wu","Xiao Li","Jinglu Wang","Xiaoguang Han","Shuguang Cui","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2308.11130v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2305.06252v3","updated":"2023-08-22T02:17:16Z","published":"2023-05-10T15:33:15Z","title":"Embedded Feature Similarity Optimization with Specific Parameter\n Initialization for 2D/3D Medical Image Registration","summary":" We present a novel deep learning-based framework: Embedded Feature Similarity\nOptimization with Specific Parameter Initialization (SOPI) for 2D/3D medical\nimage registration which is a most challenging problem due to the difficulty\nsuch as dimensional mismatch, heavy computation load and lack of golden\nevaluation standard. The framework we design includes a parameter specification\nmodule to efficiently choose initialization pose parameter and a\nfine-registration module to align images. The proposed framework takes\nextracting multi-scale features into consideration using a novel composite\nconnection encoder with special training techniques. We compare the method with\nboth learning-based methods and optimization-based methods on a in-house\nCT/X-ray dataset as well as simulated data to further evaluate performance. Our\nexperiments demonstrate that the method in this paper has improved the\nregistration performance, and thereby outperforms the existing methods in terms\nof accuracy and running time. We also show the potential of the proposed method\nas an initial pose estimator. The code is available at\nhttps://github.com/m1nhengChen/SOPI\n","authors":["Minheng Chen","Zhirun Zhang","Shuheng Gu","Youyong Kong"],"pdf_url":"https://arxiv.org/pdf/2305.06252v3.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2203.11725v2","updated":"2023-08-22T02:16:37Z","published":"2022-03-22T13:32:42Z","title":"Unsupervised Anomaly Detection in Medical Images with a Memory-augmented\n Multi-level Cross-attentional Masked Autoencoder","summary":" Unsupervised anomaly detection (UAD) aims to find anomalous images by\noptimising a detector using a training set that contains only normal images.\nUAD approaches can be based on reconstruction methods, self-supervised\napproaches, and Imagenet pre-trained models. Reconstruction methods, which\ndetect anomalies from image reconstruction errors, are advantageous because\nthey do not rely on the design of problem-specific pretext tasks needed by\nself-supervised approaches, and on the unreliable translation of models\npre-trained from non-medical datasets. However, reconstruction methods may fail\nbecause they can have low reconstruction errors even for anomalous images. In\nthis paper, we introduce a new reconstruction-based UAD approach that addresses\nthis low-reconstruction error issue for anomalous images. 
Our UAD approach, the\nmemory-augmented multi-level cross-attentional masked autoencoder (MemMC-MAE),\nis a transformer-based approach, consisting of a novel memory-augmented\nself-attention operator for the encoder and a new multi-level cross-attention\noperator for the decoder. MemMCMAE masks large parts of the input image during\nits reconstruction, reducing the risk that it will produce low reconstruction\nerrors because anomalies are likely to be masked and cannot be reconstructed.\nHowever, when the anomaly is not masked, then the normal patterns stored in the\nencoder's memory combined with the decoder's multi-level cross attention will\nconstrain the accurate reconstruction of the anomaly. We show that our method\nachieves SOTA anomaly detection and localisation on colonoscopy, pneumonia, and\ncovid-19 chest x-ray datasets.\n","authors":["Yu Tian","Guansong Pang","Yuyuan Liu","Chong Wang","Yuanhong Chen","Fengbei Liu","Rajvinder Singh","Johan W Verjans","Mengyu Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2203.11725v2.pdf","comment":"Accepted to MICCAI MLMI2023"},{"id":"http://arxiv.org/abs/2307.14571v2","updated":"2023-08-22T02:08:51Z","published":"2023-07-27T01:20:47Z","title":"Robust Detection, Association, and Localization of Vehicle Lights: A\n Context-Based Cascaded CNN Approach and Evaluations","summary":" Vehicle light detection, association, and localization are required for\nimportant downstream safe autonomous driving tasks, such as predicting a\nvehicle's light state to determine if the vehicle is making a lane change or\nturning. Currently, many vehicle light detectors use single-stage detectors\nwhich predict bounding boxes to identify a vehicle light, in a manner decoupled\nfrom vehicle instances. In this paper, we present a method for detecting a\nvehicle light given an upstream vehicle detection and approximation of a\nvisible light's center. Our method predicts four approximate corners associated\nwith each vehicle light. We experiment with CNN architectures, data\naugmentation, and contextual preprocessing methods designed to reduce\nsurrounding-vehicle confusion. We achieve an average distance error from the\nground truth corner of 4.77 pixels, about 16.33% of the size of the vehicle\nlight on average. We train and evaluate our model on the LISA Lights Dataset,\nallowing us to thoroughly evaluate our vehicle light corner detection model on\na large variety of vehicle light shapes and lighting conditions. We propose\nthat this model can be integrated into a pipeline with vehicle detection and\nvehicle light center detection to make a fully-formed vehicle light detection\nnetwork, valuable to identifying trajectory-informative signals in driving\nscenes.\n","authors":["Akshay Gopalkrishnan","Ross Greer","Maitrayee Keskar","Mohan Trivedi"],"pdf_url":"https://arxiv.org/pdf/2307.14571v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11123v1","updated":"2023-08-22T02:06:27Z","published":"2023-08-22T02:06:27Z","title":"Hey That's Mine Imperceptible Watermarks are Preserved in Diffusion\n Generated Outputs","summary":" Generative models have seen an explosion in popularity with the release of\nhuge generative Diffusion models like Midjourney and Stable Diffusion to the\npublic. Because of this new ease of access, questions surrounding the automated\ncollection of data and issues regarding content ownership have started to\nbuild. In this paper we present new work which aims to provide ways of\nprotecting content when shared to the public. 
We show that a generative\nDiffusion model trained on data that has been imperceptibly watermarked will\ngenerate new images with these watermarks present. We further show that if a\ngiven watermark is correlated with a certain feature of the training data, the\ngenerated images will also have this correlation. Using statistical tests we\nshow that we are able to determine whether a model has been trained on marked\ndata, and what data was marked. As a result our system offers a solution to\nprotect intellectual property when sharing content online.\n","authors":["Luke Ditria","Tom Drummond"],"pdf_url":"https://arxiv.org/pdf/2308.11123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11119v1","updated":"2023-08-22T01:55:03Z","published":"2023-08-22T01:55:03Z","title":"Random Word Data Augmentation with CLIP for Zero-Shot Anomaly Detection","summary":" This paper presents a novel method that leverages a visual-language model,\nCLIP, as a data source for zero-shot anomaly detection. Tremendous efforts have\nbeen put towards developing anomaly detectors due to their potential industrial\napplications. Considering the difficulty in acquiring various anomalous samples\nfor training, most existing methods train models with only normal samples and\nmeasure discrepancies from the distribution of normal samples during inference,\nwhich requires training a model for each object category. The problem of this\ninefficient training requirement has been tackled by designing a CLIP-based\nanomaly detector that applies prompt-guided classification to each part of an\nimage in a sliding window manner. However, the method still suffers from the\nlabor of careful prompt ensembling with known object categories. To overcome\nthe issues above, we propose leveraging CLIP as a data source for training. Our\nmethod generates text embeddings with the text encoder in CLIP with typical\nprompts that include words of normal and anomaly. In addition to these words,\nwe insert several randomly generated words into prompts, which enables the\nencoder to generate a diverse set of normal and anomalous samples. Using the\ngenerated embeddings as training data, a feed-forward neural network learns to\nextract features of normal and anomaly from CLIP's embeddings, and as a result,\na category-agnostic anomaly detector can be obtained without any training\nimages. Experimental results demonstrate that our method achieves\nstate-of-the-art performance without laborious prompt ensembling in zero-shot\nsetups.\n","authors":["Masato Tamura"],"pdf_url":"https://arxiv.org/pdf/2308.11119v1.pdf","comment":"Accepted to BMVC2023"},{"id":"http://arxiv.org/abs/2308.11116v1","updated":"2023-08-22T01:43:00Z","published":"2023-08-22T01:43:00Z","title":"LAN-HDR: Luminance-based Alignment Network for High Dynamic Range Video\n Reconstruction","summary":" As demands for high-quality videos continue to rise, high-resolution and\nhigh-dynamic range (HDR) imaging techniques are drawing attention. To generate\nan HDR video from low dynamic range (LDR) images, one of the critical steps is\nthe motion compensation between LDR frames, for which most existing works\nemployed the optical flow algorithm. However, these methods suffer from flow\nestimation errors when saturation or complicated motions exist. In this paper,\nwe propose an end-to-end HDR video composition framework, which aligns LDR\nframes in the feature space and then merges aligned features into an HDR frame,\nwithout relying on pixel-domain optical flow. 
Specifically, we propose a\nluminance-based alignment network for HDR (LAN-HDR) consisting of an alignment\nmodule and a hallucination module. The alignment module aligns a frame to the\nadjacent reference by evaluating luminance-based attention, excluding color\ninformation. The hallucination module generates sharp details, especially for\nwashed-out areas due to saturation. The aligned and hallucinated features are\nthen blended adaptively to complement each other. Finally, we merge the\nfeatures to generate a final HDR frame. In training, we adopt a temporal loss,\nin addition to frame reconstruction losses, to enhance temporal consistency and\nthus reduce flickering. Extensive experiments demonstrate that our method\nperforms better or comparable to state-of-the-art methods on several\nbenchmarks.\n","authors":["Haesoo Chung","Nam Ik Cho"],"pdf_url":"https://arxiv.org/pdf/2308.11116v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2210.09996v3","updated":"2023-08-22T01:40:44Z","published":"2022-10-18T17:01:35Z","title":"Perceptual Grouping in Contrastive Vision-Language Models","summary":" Recent advances in zero-shot image recognition suggest that vision-language\nmodels learn generic visual representations with a high degree of semantic\ninformation that may be arbitrarily probed with natural language phrases.\nUnderstanding an image, however, is not just about understanding what content\nresides within an image, but importantly, where that content resides. In this\nwork we examine how well vision-language models are able to understand where\nobjects reside within an image and group together visually related parts of the\nimagery. We demonstrate how contemporary vision and language representation\nlearning models based on contrastive losses and large web-based data capture\nlimited object localization information. We propose a minimal set of\nmodifications that results in models that uniquely learn both semantic and\nspatial information. We measure this performance in terms of zero-shot image\nrecognition, unsupervised bottom-up and top-down semantic segmentations, as\nwell as robustness analyses. We find that the resulting model achieves\nstate-of-the-art results in terms of unsupervised segmentation, and demonstrate\nthat the learned representations are uniquely robust to spurious correlations\nin datasets designed to probe the causal behavior of vision models.\n","authors":["Kanchana Ranasinghe","Brandon McKinzie","Sachin Ravi","Yinfei Yang","Alexander Toshev","Jonathon Shlens"],"pdf_url":"https://arxiv.org/pdf/2210.09996v3.pdf","comment":"Accepted and presented at ICCV 2023"},{"id":"http://arxiv.org/abs/2210.10959v6","updated":"2023-08-22T01:31:55Z","published":"2022-10-20T02:00:58Z","title":"Geo6D: Geometric Constraints Learning for 6D Pose Estimation","summary":" Numerous 6D pose estimation methods have been proposed that employ end-to-end\nregression to directly estimate the target pose parameters. Since the visible\nfeatures of objects are implicitly influenced by their poses, the network\nallows inferring the pose by analyzing the differences in features in the\nvisible region. However, due to the unpredictable and unrestricted range of\npose variations, the implicitly learned visible feature-pose constraints are\ninsufficiently covered by the training samples, making the network vulnerable\nto unseen object poses. To tackle these challenges, we proposed a novel\ngeometric constraints learning approach called Geo6D for direct regression 6D\npose estimation methods. 
It introduces a pose transformation formula expressed\nin relative offset representation, which is leveraged as geometric constraints\nto reconstruct the input and output targets of the network. These reconstructed\ndata enable the network to estimate the pose based on explicit geometric\nconstraints and relative offset representation mitigates the issue of the pose\ndistribution gap. Extensive experimental results show that when equipped with\nGeo6D, the direct 6D methods achieve state-of-the-art performance on multiple\ndatasets and demonstrate significant effectiveness, even with only 10% amount\nof data.\n","authors":["Jianqiu Chen","Mingshan Sun","Ye Zheng","Tianpeng Bao","Zhenyu He","Donghai Li","Guoqiang Jin","Rui Zhao","Liwei Wu","Xiaoke Jiang"],"pdf_url":"https://arxiv.org/pdf/2210.10959v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11112v1","updated":"2023-08-22T01:27:04Z","published":"2023-08-22T01:27:04Z","title":"Development of a Novel Quantum Pre-processing Filter to Improve Image\n Classification Accuracy of Neural Network Models","summary":" This paper proposes a novel quantum pre-processing filter (QPF) to improve\nthe image classification accuracy of neural network (NN) models. A simple four\nqubit quantum circuit that uses Y rotation gates for encoding and two\ncontrolled NOT gates for creating correlation among the qubits is applied as a\nfeature extraction filter prior to passing data into the fully connected NN\narchitecture. By applying the QPF approach, the results show that the image\nclassification accuracy based on the MNIST (handwritten 10 digits) and the\nEMNIST (handwritten 47 class digits and letters) datasets can be improved, from\n92.5% to 95.4% and from 68.9% to 75.9%, respectively. These improvements were\nobtained without introducing extra model parameters or optimizations in the\nmachine learning process. However, tests performed on the developed QPF\napproach against a relatively complex GTSRB dataset with 43 distinct class\nreal-life traffic sign images showed a degradation in the classification\naccuracy. Considering this result, further research into the understanding and\nthe design of a more suitable quantum circuit approach for image classification\nneural networks could be explored utilizing the baseline method proposed in\nthis paper.\n","authors":["Farina Riaz","Shahab Abdulla","Hajime Suzuki","Srinjoy Ganguly","Ravinesh C. Deo","Susan Hopkins"],"pdf_url":"https://arxiv.org/pdf/2308.11112v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.11111v1","updated":"2023-08-22T01:24:14Z","published":"2023-08-22T01:24:14Z","title":"CAME: Contrastive Automated Model Evaluation","summary":" The Automated Model Evaluation (AutoEval) framework entertains the\npossibility of evaluating a trained machine learning model without resorting to\na labeled testing set. Despite the promise and some decent results, the\nexisting AutoEval methods heavily rely on computing distribution shifts between\nthe unlabelled testing set and the training set. We believe this reliance on\nthe training set becomes another obstacle in shipping this technology to\nreal-world ML development. In this work, we propose Contrastive Automatic Model\nEvaluation (CAME), a novel AutoEval framework that is rid of involving training\nset in the loop. The core idea of CAME bases on a theoretical analysis which\nbonds the model performance with a contrastive loss. 
Further, with extensive\nempirical validation, we manage to set up a predictable relationship between\nthe two, simply by deducing on the unlabeled/unseen testing set. The resulting\nframework CAME establishes a new SOTA results for AutoEval by surpassing prior\nwork significantly.\n","authors":["Ru Peng","Qiuyang Duan","Haobo Wang","Jiachen Ma","Yanbo Jiang","Yongjun Tu","Xiu Jiang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11111v1.pdf","comment":"ICCV2023 main conference"},{"id":"http://arxiv.org/abs/2308.09949v2","updated":"2023-08-22T01:21:48Z","published":"2023-08-19T08:56:35Z","title":"Scene-Aware Feature Matching","summary":" Current feature matching methods focus on point-level matching, pursuing\nbetter representation learning of individual features, but lacking further\nunderstanding of the scene. This results in significant performance degradation\nwhen handling challenging scenes such as scenes with large viewpoint and\nillumination changes. To tackle this problem, we propose a novel model named\nSAM, which applies attentional grouping to guide Scene-Aware feature Matching.\nSAM handles multi-level features, i.e., image tokens and group tokens, with\nattention layers, and groups the image tokens with the proposed token grouping\nmodule. Our model can be trained by ground-truth matches only and produce\nreasonable grouping results. With the sense-aware grouping guidance, SAM is not\nonly more accurate and robust but also more interpretable than conventional\nfeature matching models. Sufficient experiments on various applications,\nincluding homography estimation, pose estimation, and image matching,\ndemonstrate that our model achieves state-of-the-art performance.\n","authors":["Xiaoyong Lu","Yaping Yan","Tong Wei","Songlin Du"],"pdf_url":"https://arxiv.org/pdf/2308.09949v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2305.06077v2","updated":"2023-08-22T01:06:42Z","published":"2023-05-10T11:57:49Z","title":"Relightify: Relightable 3D Faces from a Single Image via Diffusion\n Models","summary":" Following the remarkable success of diffusion models on image generation,\nrecent works have also demonstrated their impressive ability to address a\nnumber of inverse problems in an unsupervised way, by properly constraining the\nsampling process based on a conditioning input. Motivated by this, in this\npaper, we present the first approach to use diffusion models as a prior for\nhighly accurate 3D facial BRDF reconstruction from a single image. We start by\nleveraging a high-quality UV dataset of facial reflectance (diffuse and\nspecular albedo and normals), which we render under varying illumination\nsettings to simulate natural RGB textures and, then, train an unconditional\ndiffusion model on concatenated pairs of rendered textures and reflectance\ncomponents. At test time, we fit a 3D morphable model to the given image and\nunwrap the face in a partial UV texture. By sampling from the diffusion model,\nwhile retaining the observed texture part intact, the model inpaints not only\nthe self-occluded areas but also the unknown reflectance components, in a\nsingle sequence of denoising steps. In contrast to existing methods, we\ndirectly acquire the observed texture from the input image, thus, resulting in\nmore faithful and consistent reflectance estimation. 
Through a series of\nqualitative and quantitative comparisons, we demonstrate superior performance\nin both texture completion as well as reflectance reconstruction tasks.\n","authors":["Foivos Paraperas Papantoniou","Alexandros Lattas","Stylianos Moschoglou","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2305.06077v2.pdf","comment":"ICCV 2023, 15 pages, 14 figures. Project page:\n https://foivospar.github.io/Relightify/"},{"id":"http://arxiv.org/abs/2308.11107v1","updated":"2023-08-22T01:05:31Z","published":"2023-08-22T01:05:31Z","title":"Classification of the lunar surface pattern by AI architectures: Does AI\n see a rabbit in the Moon?","summary":" In Asian countries, there is a tradition that a rabbit (the Moon rabbit)\nlives on the Moon. As the origin of this tradition, usually, two reasons are\nmentioned. One reason is that the color pattern of the lunar surface is similar\nto the shape of a rabbit. The other reason is that both the Moon and rabbit are\nsymbols of fertility because the Moon appears and disappears (i.e., waxing and\nwaning) cyclically, and rabbits bear children frequently. Considering the\nlatter reason, is the lunar surface color pattern not similar to a rabbit?\nHere, the similarity between rabbit and the lunar surface pattern was evaluated\nusing seven AI architectures. In the test by CLIP, assuming that people look at\nthe Moon in the early evening frequently, the lunar surface is more similar to\na rabbit than a face at low latitude regions, while it can be classified as\nface as latitude increases, which is consistent with that the oldest literature\nabout the Moon rabbit was written in India and that there is a culture of\nhuman's face in the Moon in Europe. Tested with ImageNet weights, ConvNeXt and\nCLIP sometimes classified the lunar surface pattern into rabbit with relatively\nhigh probabilities. Cultures are generated by our attitude to the environment.\nBoth dynamic and static similarities may be required to induce our imagination.\n","authors":["Daigo Shoji"],"pdf_url":"https://arxiv.org/pdf/2308.11107v1.pdf","comment":"15 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2308.11106v1","updated":"2023-08-22T01:02:15Z","published":"2023-08-22T01:02:15Z","title":"Recursive Video Lane Detection","summary":" A novel algorithm to detect road lanes in videos, called recursive video lane\ndetector (RVLD), is proposed in this paper, which propagates the state of a\ncurrent frame recursively to the next frame. RVLD consists of an intra-frame\nlane detector (ILD) and a predictive lane detector (PLD). First, we design ILD\nto localize lanes in a still frame. Second, we develop PLD to exploit the\ninformation of the previous frame for lane detection in a current frame. To\nthis end, we estimate a motion field and warp the previous output to the\ncurrent frame. Using the warped information, we refine the feature map of the\ncurrent frame to detect lanes more reliably. Experimental results show that\nRVLD outperforms existing detectors on video lane datasets. 
Our codes are\navailable at https://github.com/dongkwonjin/RVLD.\n","authors":["Dongkwon Jin","Dahyun Kim","Chang-Su Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11106v1.pdf","comment":"ICCV 2023 accepted"},{"id":"http://arxiv.org/abs/2308.11096v1","updated":"2023-08-22T00:40:37Z","published":"2023-08-22T00:40:37Z","title":"MosaiQ: Quantum Generative Adversarial Networks for Image Generation on\n NISQ Computers","summary":" Quantum machine learning and vision have come to the fore recently, with\nhardware advances enabling rapid advancement in the capabilities of quantum\nmachines. Recently, quantum image generation has been explored with many\npotential advantages over non-quantum techniques; however, previous techniques\nhave suffered from poor quality and robustness. To address these problems, we\nintroduce, MosaiQ, a high-quality quantum image generation GAN framework that\ncan be executed on today's Near-term Intermediate Scale Quantum (NISQ)\ncomputers.\n","authors":["Daniel Silver","Tirthak Patel","William Cutler","Aditya Ranjan","Harshitta Gandhi","Devesh Tiwari"],"pdf_url":"https://arxiv.org/pdf/2308.11096v1.pdf","comment":"Accepted to appear at ICCV'23"},{"id":"http://arxiv.org/abs/2308.09779v2","updated":"2023-08-22T00:27:55Z","published":"2023-08-18T18:59:27Z","title":"EAVL: Explicitly Align Vision and Language for Referring Image\n Segmentation","summary":" Referring image segmentation aims to segment an object mentioned in natural\nlanguage from an image. A main challenge is language-related localization,\nwhich means locating the object with the relevant language. Previous approaches\nmainly focus on the fusion of vision and language features without fully\naddressing language-related localization. In previous approaches, fused\nvision-language features are directly fed into a decoder and pass through a\nconvolution with a fixed kernel to obtain the result, which follows a similar\npattern as traditional image segmentation. This approach does not explicitly\nalign language and vision features in the segmentation stage, resulting in a\nsuboptimal language-related localization. Different from previous methods, we\npropose Explicitly Align the Vision and Language for Referring Image\nSegmentation (EAVL). Instead of using a fixed convolution kernel, we propose an\nAligner which explicitly aligns the vision and language features in the\nsegmentation stage. Specifically, a series of unfixed convolution kernels are\ngenerated based on the input l, and then are use to explicitly align the vision\nand language features. To achieve this, We generate multiple queries that\nrepresent different emphases of the language expression. These queries are\ntransformed into a series of query-based convolution kernels. Then, we utilize\nthese kernels to do convolutions in the segmentation stage and obtain a series\nof segmentation masks. The final result is obtained through the aggregation of\nall masks. Our method can not only fuse vision and language features\neffectively but also exploit their potential in the segmentation stage. And\nmost importantly, we explicitly align language features of different emphases\nwith the image features to achieve language-related localization. Our method\nsurpasses previous state-of-the-art methods on RefCOCO, RefCOCO+, and G-Ref by\nlarge margins.\n","authors":["Yichen Yan","Xingjian He","Wenxuan Wang","Sihan Chen","Jing Liu"],"pdf_url":"https://arxiv.org/pdf/2308.09779v2.pdf","comment":"10 pages, 4 figures. 
arXiv admin note: text overlap with\n arXiv:2305.14969"},{"id":"http://arxiv.org/abs/2308.11093v1","updated":"2023-08-22T00:21:32Z","published":"2023-08-22T00:21:32Z","title":"Video OWL-ViT: Temporally-consistent open-world localization in video","summary":" We present an architecture and a training recipe that adapts pre-trained\nopen-world image models to localization in videos. Understanding the open\nvisual world (without being constrained by fixed label spaces) is crucial for\nmany real-world vision tasks. Contrastive pre-training on large image-text\ndatasets has recently led to significant improvements for image-level tasks.\nFor more structured tasks involving object localization applying pre-trained\nmodels is more challenging. This is particularly true for video tasks, where\ntask-specific data is limited. We show successful transfer of open-world models\nby building on the OWL-ViT open-vocabulary detection model and adapting it to\nvideo by adding a transformer decoder. The decoder propagates object\nrepresentations recurrently through time by using the output tokens for one\nframe as the object queries for the next. Our model is end-to-end trainable on\nvideo data and enjoys improved temporal consistency compared to\ntracking-by-detection baselines, while retaining the open-world capabilities of\nthe backbone detector. We evaluate our model on the challenging TAO-OW\nbenchmark and demonstrate that open-world capabilities, learned from\nlarge-scale image-text pre-training, can be transferred successfully to\nopen-world localization across diverse videos.\n","authors":["Georg Heigold","Matthias Minderer","Alexey Gritsenko","Alex Bewley","Daniel Keysers","Mario Lučić","Fisher Yu","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2308.11093v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11090v1","updated":"2023-08-22T00:10:23Z","published":"2023-08-22T00:10:23Z","title":"Addressing Fairness and Explainability in Image Classification Using\n Optimal Transport","summary":" Algorithmic Fairness and the explainability of potentially unfair outcomes\nare crucial for establishing trust and accountability of Artificial\nIntelligence systems in domains such as healthcare and policing. Though\nsignificant advances have been made in each of the fields separately, achieving\nexplainability in fairness applications remains challenging, particularly so in\ndomains where deep neural networks are used. At the same time, ethical\ndata-mining has become ever more relevant, as it has been shown countless times\nthat fairness-unaware algorithms result in biased outcomes. Current approaches\nfocus on mitigating biases in the outcomes of the model, but few attempts have\nbeen made to try to explain \\emph{why} a model is biased. To bridge this gap,\nwe propose a comprehensive approach that leverages optimal transport theory to\nuncover the causes and implications of biased regions in images, which easily\nextends to tabular data as well. Through the use of Wasserstein barycenters, we\nobtain scores that are independent of a sensitive variable but keep their\nmarginal orderings. This step ensures predictive accuracy but also helps us to\nrecover the regions most associated with the generation of the biases. 
Our\nfindings hold significant implications for the development of trustworthy and\nunbiased AI systems, fostering transparency, accountability, and fairness in\ncritical decision-making scenarios across diverse domains.\n","authors":["Philipp Ratz","François Hu","Arthur Charpentier"],"pdf_url":"https://arxiv.org/pdf/2308.11090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06840v2","updated":"2023-08-22T23:19:03Z","published":"2023-03-13T04:06:42Z","title":"DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion","summary":" Multi-modality image fusion aims to combine different modalities to produce\nfused images that retain the complementary features of each modality, such as\nfunctional highlights and texture details. To leverage strong generative priors\nand address challenges such as unstable training and lack of interpretability\nfor GAN-based generative methods, we propose a novel fusion algorithm based on\nthe denoising diffusion probabilistic model (DDPM). The fusion task is\nformulated as a conditional generation problem under the DDPM sampling\nframework, which is further divided into an unconditional generation subproblem\nand a maximum likelihood subproblem. The latter is modeled in a hierarchical\nBayesian manner with latent variables and inferred by the\nexpectation-maximization (EM) algorithm. By integrating the inference solution\ninto the diffusion sampling iteration, our method can generate high-quality\nfused images with natural image generative priors and cross-modality\ninformation from source images. Note that all we required is an unconditional\npre-trained generative model, and no fine-tuning is needed. Our extensive\nexperiments indicate that our approach yields promising fusion results in\ninfrared-visible image fusion and medical image fusion. The code is available\nat \\url{https://github.com/Zhaozixiang1228/MMIF-DDFM}.\n","authors":["Zixiang Zhao","Haowen Bai","Yuanzhi Zhu","Jiangshe Zhang","Shuang Xu","Yulun Zhang","Kai Zhang","Deyu Meng","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.06840v2.pdf","comment":"Accepted by ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2308.11822v1","updated":"2023-08-22T23:02:06Z","published":"2023-08-22T23:02:06Z","title":"PatchBackdoor: Backdoor Attack against Deep Neural Networks without\n Model Modification","summary":" Backdoor attack is a major threat to deep learning systems in safety-critical\nscenarios, which aims to trigger misbehavior of neural network models under\nattacker-controlled conditions. However, most backdoor attacks have to modify\nthe neural network models through training with poisoned data and/or direct\nmodel editing, which leads to a common but false belief that backdoor attack\ncan be easily avoided by properly protecting the model. In this paper, we show\nthat backdoor attacks can be achieved without any model modification. Instead\nof injecting backdoor logic into the training data or the model, we propose to\nplace a carefully-designed patch (namely backdoor patch) in front of the\ncamera, which is fed into the model together with the input images. The patch\ncan be trained to behave normally at most of the time, while producing wrong\nprediction when the input image contains an attacker-controlled trigger object.\nOur main techniques include an effective training method to generate the\nbackdoor patch and a digital-physical transformation modeling method to enhance\nthe feasibility of the patch in real deployments. 
Extensive experiments show\nthat PatchBackdoor can be applied to common deep learning models (VGG,\nMobileNet, ResNet) with an attack success rate of 93% to 99% on classification\ntasks. Moreover, we implement PatchBackdoor in real-world scenarios and show\nthat the attack is still threatening.\n","authors":["Yizhen Yuan","Rui Kong","Shenghao Xie","Yuanchun Li","Yunxin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11822v1.pdf","comment":"accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2208.13930v5","updated":"2023-08-22T21:33:48Z","published":"2022-08-29T23:57:55Z","title":"SAFE: Sensitivity-Aware Features for Out-of-Distribution Object\n Detection","summary":" We address the problem of out-of-distribution (OOD) detection for the task of\nobject detection. We show that residual convolutional layers with batch\nnormalisation produce Sensitivity-Aware FEatures (SAFE) that are consistently\npowerful for distinguishing in-distribution from out-of-distribution\ndetections. We extract SAFE vectors for every detected object, and train a\nmultilayer perceptron on the surrogate task of distinguishing adversarially\nperturbed from clean in-distribution examples. This circumvents the need for\nrealistic OOD training data, computationally expensive generative models, or\nretraining of the base object detector. SAFE outperforms the state-of-the-art\nOOD object detectors on multiple benchmarks by large margins, e.g. reducing the\nFPR95 by an absolute 30.6% from 48.3% to 17.7% on the OpenImages dataset.\n","authors":["Samuel Wilson","Tobias Fischer","Feras Dayoub","Dimity Miller","Niko Sünderhauf"],"pdf_url":"https://arxiv.org/pdf/2208.13930v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11797v1","updated":"2023-08-22T21:29:55Z","published":"2023-08-22T21:29:55Z","title":"CLIP Multi-modal Hashing: A new baseline CLIPMH","summary":" The multi-modal hashing method is widely used in multimedia retrieval. It can\nfuse multi-source data to generate binary hash code. However, the current\nmulti-modal methods have the problem of low retrieval accuracy. The reason is\nthat the individual backbone networks have limited feature expression\ncapabilities and are not jointly pre-trained on large-scale unsupervised\nmulti-modal data. To solve this problem, we propose a new baseline CLIP\nMulti-modal Hashing (CLIPMH) method. It uses CLIP model to extract text and\nimage features, and then fuse to generate hash code. CLIP improves the\nexpressiveness of each modal feature. In this way, it can greatly improve the\nretrieval performance of multi-modal hashing methods. In comparison to\nstate-of-the-art unsupervised and supervised multi-modal hashing methods,\nexperiments reveal that the proposed CLIPMH can significantly enhance\nperformance (Maximum increase of 8.38%). CLIP also has great advantages over\nthe text and visual backbone networks commonly used before.\n","authors":["Jian Zhu","Mingkai Sheng","Mingda Ke","Zhangmin Huang","Jingfei Chang"],"pdf_url":"https://arxiv.org/pdf/2308.11797v1.pdf","comment":"submit to ICASSP2024"},{"id":"http://arxiv.org/abs/2308.11796v1","updated":"2023-08-22T21:28:58Z","published":"2023-08-22T21:28:58Z","title":"Time Does Tell: Self-Supervised Time-Tuning of Dense Image\n Representations","summary":" Spatially dense self-supervised learning is a rapidly growing problem domain\nwith promising applications for unsupervised segmentation and pretraining for\ndense downstream tasks. 
Despite the abundance of temporal data in the form of\nvideos, this information-rich source has been largely overlooked. Our paper\naims to address this gap by proposing a novel approach that incorporates\ntemporal consistency in dense self-supervised learning. While methods designed\nsolely for images face difficulties in achieving even the same performance on\nvideos, our method improves not only the representation quality for videos-but\nalso images. Our approach, which we call time-tuning, starts from\nimage-pretrained models and fine-tunes them with a novel self-supervised\ntemporal-alignment clustering loss on unlabeled videos. This effectively\nfacilitates the transfer of high-level information from videos to image\nrepresentations. Time-tuning improves the state-of-the-art by 8-10% for\nunsupervised semantic segmentation on videos and matches it for images. We\nbelieve this method paves the way for further self-supervised scaling by\nleveraging the abundant availability of videos. The implementation can be found\nhere : https://github.com/SMSD75/Timetuning\n","authors":["Mohammadreza Salehi","Efstratios Gavves","Cees G. M. Snoek","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2308.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11793v1","updated":"2023-08-22T21:18:54Z","published":"2023-08-22T21:18:54Z","title":"Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer\n with Mixture-of-View-Experts","summary":" Cross-scene generalizable NeRF models, which can directly synthesize novel\nviews of unseen scenes, have become a new spotlight of the NeRF field. Several\nexisting attempts rely on increasingly end-to-end \"neuralized\" architectures,\ni.e., replacing scene representation and/or rendering modules with performant\nneural networks such as transformers, and turning novel view synthesis into a\nfeed-forward inference pipeline. While those feedforward \"neuralized\"\narchitectures still do not fit diverse scenes well out of the box, we propose\nto bridge them with the powerful Mixture-of-Experts (MoE) idea from large\nlanguage models (LLMs), which has demonstrated superior generalization ability\nby balancing between larger overall model capacity and flexible per-instance\nspecialization. Starting from a recent generalizable NeRF architecture called\nGNT, we first demonstrate that MoE can be neatly plugged in to enhance the\nmodel. We further customize a shared permanent expert and a geometry-aware\nconsistency loss to enforce cross-scene consistency and spatial smoothness\nrespectively, which are essential for generalizable view synthesis. Our\nproposed model, dubbed GNT with Mixture-of-View-Experts (GNT-MOVE), has\nexperimentally shown state-of-the-art results when transferring to unseen\nscenes, indicating remarkably better cross-scene generalization in both\nzero-shot and few-shot settings. Our codes are available at\nhttps://github.com/VITA-Group/GNT-MOVE.\n","authors":["Wenyan Cong","Hanxue Liang","Peihao Wang","Zhiwen Fan","Tianlong Chen","Mukund Varma","Yi Wang","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11793v1.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.11788v1","updated":"2023-08-22T21:03:58Z","published":"2023-08-22T21:03:58Z","title":"An extensible point-based method for data chart value detection","summary":" We present an extensible method for identifying semantic points to reverse\nengineer (i.e. extract the values of) data charts, particularly those in\nscientific articles. 
Our method uses a point proposal network (akin to region\nproposal networks for object detection) to directly predict the position of\npoints of interest in a chart, and it is readily extensible to multiple chart\ntypes and chart elements. We focus on complex bar charts in the scientific\nliterature, on which our model is able to detect salient points with an\naccuracy of 0.8705 F1 (@1.5-cell max deviation); it achieves 0.9810 F1 on\nsynthetically-generated charts similar to those used in prior works. We also\nexplore training exclusively on synthetic data with novel augmentations,\nreaching surprisingly competent performance in this way (0.6621 F1) on real\ncharts with widely varying appearance, and we further demonstrate our unchanged\nmethod applied directly to synthetic pie charts (0.8343 F1). Datasets, trained\nmodels, and evaluation code are available at\nhttps://github.com/BNLNLP/PPN_model.\n","authors":["Carlos Soto","Shinjae Yoo"],"pdf_url":"https://arxiv.org/pdf/2308.11788v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11783v1","updated":"2023-08-22T20:43:31Z","published":"2023-08-22T20:43:31Z","title":"Coarse-to-Fine Multi-Scene Pose Regression with Transformers","summary":" Absolute camera pose regressors estimate the position and orientation of a\ncamera given the captured image alone. Typically, a convolutional backbone with\na multi-layer perceptron (MLP) head is trained using images and pose labels to\nembed a single reference scene at a time. Recently, this scheme was extended to\nlearn multiple scenes by replacing the MLP head with a set of fully connected\nlayers. In this work, we propose to learn multi-scene absolute camera pose\nregression with Transformers, where encoders are used to aggregate activation\nmaps with self-attention and decoders transform latent features and scenes\nencoding into pose predictions. This allows our model to focus on general\nfeatures that are informative for localization, while embedding multiple scenes\nin parallel. We extend our previous MS-Transformer approach\n\\cite{shavit2021learning} by introducing a mixed classification-regression\narchitecture that improves the localization accuracy. Our method is evaluated\non commonly benchmark indoor and outdoor datasets and has been shown to exceed\nboth multi-scene and state-of-the-art single-scene absolute pose regressors.\n","authors":["Yoli Shavit","Ron Ferens","Yosi Keller"],"pdf_url":"https://arxiv.org/pdf/2308.11783v1.pdf","comment":"Accepted to IEEE Transactions on Pattern Analysis and Machine\n Intelligence (TPAMI). arXiv admin note: substantial text overlap with\n arXiv:2103.11468"},{"id":"http://arxiv.org/abs/2308.11778v1","updated":"2023-08-22T20:36:16Z","published":"2023-08-22T20:36:16Z","title":"Understanding Hessian Alignment for Domain Generalization","summary":" Out-of-distribution (OOD) generalization is a critical ability for deep\nlearning models in many real-world scenarios including healthcare and\nautonomous vehicles. Recently, different techniques have been proposed to\nimprove OOD generalization. Among these methods, gradient-based regularizers\nhave shown promising performance compared with other competitors. Despite this\nsuccess, our understanding of the role of Hessian and gradient alignment in\ndomain generalization is still limited. To address this shortcoming, we analyze\nthe role of the classifier's head Hessian matrix and gradient in domain\ngeneralization using recent OOD theory of transferability. 
Theoretically, we\nshow that spectral norm between the classifier's head Hessian matrices across\ndomains is an upper bound of the transfer measure, a notion of distance between\ntarget and source domains. Furthermore, we analyze all the attributes that get\naligned when we encourage similarity between Hessians and gradients. Our\nanalysis explains the success of many regularizers like CORAL, IRM, V-REx,\nFish, IGA, and Fishr as they regularize part of the classifier's head Hessian\nand/or gradient. Finally, we propose two simple yet effective methods to match\nthe classifier's head Hessians and gradients in an efficient way, based on the\nHessian Gradient Product (HGP) and Hutchinson's method (Hutchinson), and\nwithout directly calculating Hessians. We validate the OOD generalization\nability of proposed methods in different scenarios, including transferability,\nsevere correlation shift, label shift and diversity shift. Our results show\nthat Hessian alignment methods achieve promising performance on various OOD\nbenchmarks. The code is available at\n\\url{https://github.com/huawei-noah/Federated-Learning/tree/main/HessianAlignment}.\n","authors":["Sobhan Hemati","Guojun Zhang","Amir Estiri","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11778v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11776v1","updated":"2023-08-22T20:35:24Z","published":"2023-08-22T20:35:24Z","title":"WS-SfMLearner: Self-supervised Monocular Depth and Ego-motion Estimation\n on Surgical Videos with Unknown Camera Parameters","summary":" Depth estimation in surgical video plays a crucial role in many image-guided\nsurgery procedures. However, it is difficult and time consuming to create depth\nmap ground truth datasets in surgical videos due in part to inconsistent\nbrightness and noise in the surgical scene. Therefore, building an accurate and\nrobust self-supervised depth and camera ego-motion estimation system is gaining\nmore attention from the computer vision community. Although several\nself-supervision methods alleviate the need for ground truth depth maps and\nposes, they still need known camera intrinsic parameters, which are often\nmissing or not recorded. Moreover, the camera intrinsic prediction methods in\nexisting works depend heavily on the quality of datasets. In this work, we\naimed to build a self-supervised depth and ego-motion estimation system which\ncan predict not only accurate depth maps and camera pose, but also camera\nintrinsic parameters. We proposed a cost-volume-based supervision manner to\ngive the system auxiliary supervision for camera parameters prediction. The\nexperimental results showed that the proposed method improved the accuracy of\nestimated camera parameters, ego-motion, and depth estimation.\n","authors":["Ange Lou","Jack Noble"],"pdf_url":"https://arxiv.org/pdf/2308.11776v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11774v1","updated":"2023-08-22T20:31:00Z","published":"2023-08-22T20:31:00Z","title":"SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene\n Reconstruction by Neural Radiance Field (NeRF)","summary":" The accurate reconstruction of surgical scenes from surgical videos is\ncritical for various applications, including intraoperative navigation and\nimage-guided robotic surgery automation. However, previous approaches, mainly\nrelying on depth estimation, have limited effectiveness in reconstructing\nsurgical scenes with moving surgical tools. 
To address this limitation and\nprovide accurate 3D position prediction for surgical tools in all frames, we\npropose a novel approach called SAMSNeRF that combines Segment Anything Model\n(SAM) and Neural Radiance Field (NeRF) techniques. Our approach generates\naccurate segmentation masks of surgical tools using SAM, which guides the\nrefinement of the dynamic surgical scene reconstruction by NeRF. Our\nexperimental results on public endoscopy surgical videos demonstrate that our\napproach successfully reconstructs high-fidelity dynamic surgical scenes and\naccurately reflects the spatial information of surgical tools. Our proposed\napproach can significantly enhance surgical navigation and automation by\nproviding surgeons with accurate 3D position information of surgical tools\nduring surgery. The source code will be released soon.\n","authors":["Ange Lou","Yamin Li","Xing Yao","Yike Zhang","Jack Noble"],"pdf_url":"https://arxiv.org/pdf/2308.11774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11771v1","updated":"2023-08-22T20:24:24Z","published":"2023-08-22T20:24:24Z","title":"3ET: Efficient Event-based Eye Tracking using a Change-Based ConvLSTM\n Network","summary":" This paper presents a sparse Change-Based Convolutional Long Short-Term\nMemory (CB-ConvLSTM) model for event-based eye tracking, key for\nnext-generation wearable healthcare technology such as AR/VR headsets. We\nleverage the benefits of retina-inspired event cameras, namely their\nlow-latency response and sparse output event stream, over traditional\nframe-based cameras. Our CB-ConvLSTM architecture efficiently extracts\nspatio-temporal features for pupil tracking from the event stream,\noutperforming conventional CNN structures. Utilizing a delta-encoded recurrent\npath enhancing activation sparsity, CB-ConvLSTM reduces arithmetic operations\nby approximately 4.7$\\times$ without losing accuracy when tested on a\n\\texttt{v2e}-generated event dataset of labeled pupils. This increase in\nefficiency makes it ideal for real-time eye tracking in resource-constrained\ndevices. The project code and dataset are openly available at\n\\url{https://github.com/qinche106/cb-convlstm-eyetracking}.\n","authors":["Qinyu Chen","Zuowen Wang","Shih-Chii Liu","Chang Gao"],"pdf_url":"https://arxiv.org/pdf/2308.11771v1.pdf","comment":"To be published at the 2023 IEEE Biomedical Circuits and Systems\n (BioCAS) Conference"},{"id":"http://arxiv.org/abs/2308.11757v1","updated":"2023-08-22T19:58:02Z","published":"2023-08-22T19:58:02Z","title":"Weakly Supervised Face and Whole Body Recognition in Turbulent\n Environments","summary":" Face and person recognition have recently achieved remarkable success under\nchallenging scenarios, such as off-pose and cross-spectrum matching. However,\nlong-range recognition systems are often hindered by atmospheric turbulence,\nleading to spatially and temporally varying distortions in the image. Current\nsolutions rely on generative models to reconstruct a turbulent-free image, but\noften preserve photo-realism instead of discriminative features that are\nessential for recognition. This can be attributed to the lack of large-scale\ndatasets of turbulent and pristine paired images, necessary for optimal\nreconstruction. To address this issue, we propose a new weakly supervised\nframework that employs a parameter-efficient self-attention module to generate\ndomain agnostic representations, aligning turbulent and pristine images into a\ncommon subspace. 
Additionally, we introduce a new tilt map estimator that\npredicts geometric distortions observed in turbulent images. This estimate is\nused to re-rank gallery matches, resulting in up to 13.86\\% improvement in\nrank-1 accuracy. Our method does not require synthesizing turbulent-free images\nor ground-truth paired images, and requires significantly fewer annotated\nsamples, enabling more practical and rapid utility of increasingly large\ndatasets. We analyze our framework using two datasets -- Long-Range Face\nIdentification Dataset (LRFID) and BRIAR Government Collection 1 (BGC1) --\nachieving enhanced discriminability under varying turbulence and standoff\ndistance.\n","authors":["Kshitij Nikhal","Benjamin S. Riggan"],"pdf_url":"https://arxiv.org/pdf/2308.11757v1.pdf","comment":"IJCB 2023"},{"id":"http://arxiv.org/abs/2211.08772v3","updated":"2023-08-22T19:45:17Z","published":"2022-11-16T09:00:20Z","title":"MIMT: Multi-Illuminant Color Constancy via Multi-Task Local Surface and\n Light Color Learning","summary":" The assumption of a uniform light color distribution is no longer applicable\nin scenes that have multiple light colors. Most color constancy methods are\ndesigned to deal with a single light color, and thus are erroneous when applied\nto multiple light colors. The spatial variability in multiple light colors\ncauses the color constancy problem to be more challenging and requires the\nextraction of local surface/light information. Motivated by this, we introduce\na multi-task learning method to discount multiple light colors in a single\ninput image. To have better cues of the local surface/light colors under\nmultiple light color conditions, we design a novel multi-task learning\nframework. Our framework includes auxiliary tasks of achromatic-pixel detection\nand surface-color similarity prediction, providing better cues for local light\nand surface colors, respectively. Moreover, to ensure that our model maintains\nthe constancy of surface colors regardless of the variations of light colors, a\nnovel local surface color feature preservation scheme is developed. We\ndemonstrate that our model achieves 47.1% improvement (from 4.69 mean angular\nerror to 2.48) compared to a state-of-the-art multi-illuminant color constancy\nmethod on a multi-illuminant dataset (LSMI).\n","authors":["Shuwei Li","Jikai Wang","Michael S. Brown","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2211.08772v3.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.11744v1","updated":"2023-08-22T19:09:56Z","published":"2023-08-22T19:09:56Z","title":"Efficient Controllable Multi-Task Architectures","summary":" We aim to train a multi-task model such that users can adjust the desired\ncompute budget and relative importance of task performances after deployment,\nwithout retraining. This enables optimizing performance for dynamically varying\nuser needs, without heavy computational overhead to train and save models for\nvarious scenarios. To this end, we propose a multi-task model consisting of a\nshared encoder and task-specific decoders where both encoder and decoder\nchannel widths are slimmable. Our key idea is to control the task importance by\nvarying the capacities of task-specific decoders, while controlling the total\ncomputational cost by jointly adjusting the encoder capacity. 
This improves\noverall accuracy by allowing a stronger encoder for a given budget, increases\ncontrol over computational cost, and delivers high-quality slimmed\nsub-architectures based on user's constraints. Our training strategy involves a\nnovel 'Configuration-Invariant Knowledge Distillation' loss that enforces\nbackbone representations to be invariant under different runtime width\nconfigurations to enhance accuracy. Further, we present a simple but effective\nsearch algorithm that translates user constraints to runtime width\nconfigurations of both the shared encoder and task decoders, for sampling the\nsub-architectures. The key rule for the search algorithm is to provide a larger\ncomputational budget to the higher preferred task decoder, while searching a\nshared encoder configuration that enhances the overall MTL performance. Various\nexperiments on three multi-task benchmarks (PASCALContext, NYUDv2, and\nCIFAR100-MTL) with diverse backbone architectures demonstrate the advantage of\nour approach. For example, our method shows a higher controllability by ~33.5%\nin the NYUD-v2 dataset over prior methods, while incurring much less compute\ncost.\n","authors":["Abhishek Aich","Samuel Schulter","Amit K. Roy-Chowdhury","Manmohan Chandraker","Yumin Suh"],"pdf_url":"https://arxiv.org/pdf/2308.11744v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2203.16475v4","updated":"2023-08-22T19:00:49Z","published":"2022-03-30T17:12:18Z","title":"Concept Evolution in Deep Learning Training: A Unified Interpretation\n Framework and Discoveries","summary":" We present ConceptEvo, a unified interpretation framework for deep neural\nnetworks (DNNs) that reveals the inception and evolution of learned concepts\nduring training. Our work addresses a critical gap in DNN interpretation\nresearch, as existing methods primarily focus on post-training interpretation.\nConceptEvo introduces two novel technical contributions: (1) an algorithm that\ngenerates a unified semantic space, enabling side-by-side comparison of\ndifferent models during training, and (2) an algorithm that discovers and\nquantifies important concept evolutions for class predictions. Through a\nlarge-scale human evaluation and quantitative experiments, we demonstrate that\nConceptEvo successfully identifies concept evolutions across different models,\nwhich are not only comprehensible to humans but also crucial for class\npredictions. ConceptEvo is applicable to both modern DNN architectures, such as\nConvNeXt, and classic DNNs, such as VGGs and InceptionV3.\n","authors":["Haekyu Park","Seongmin Lee","Benjamin Hoover","Austin P. Wright","Omar Shaikh","Rahul Duggal","Nilaksh Das","Kevin Li","Judy Hoffman","Duen Horng Chau"],"pdf_url":"https://arxiv.org/pdf/2203.16475v4.pdf","comment":"Accepted at CIKM'23"},{"id":"http://arxiv.org/abs/2308.11737v1","updated":"2023-08-22T18:57:07Z","published":"2023-08-22T18:57:07Z","title":"Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape","summary":" Accurately estimating the 3D pose and shape is an essential step towards\nunderstanding animal behavior, and can potentially benefit many downstream\napplications, such as wildlife conservation. However, research in this area is\nheld back by the lack of a comprehensive and diverse dataset with high-quality\n3D pose and shape annotations. In this paper, we propose Animal3D, the first\ncomprehensive dataset for mammal animal 3D pose and shape estimation. 
Animal3D\nconsists of 3379 images collected from 40 mammal species, high-quality\nannotations of 26 keypoints, and importantly the pose and shape parameters of\nthe SMAL model. All annotations were labeled and checked manually in a\nmulti-stage process to ensure highest quality results. Based on the Animal3D\ndataset, we benchmark representative shape and pose estimation models at: (1)\nsupervised learning from only the Animal3D data, (2) synthetic to real transfer\nfrom synthetically generated images, and (3) fine-tuning human pose and shape\nestimation models. Our experimental results demonstrate that predicting the 3D\nshape and pose of animals across species remains a very challenging task,\ndespite significant advances in human pose estimation. Our results further\ndemonstrate that synthetic pre-training is a viable strategy to boost the model\nperformance. Overall, Animal3D opens new directions for facilitating future\nresearch in animal 3D pose and shape estimation, and is publicly available.\n","authors":["Jiacong Xu","Yi Zhang","Jiawei Peng","Wufei Ma","Artur Jesslen","Pengliang Ji","Qixin Hu","Jiehua Zhang","Qihao Liu","Jiahao Wang","Wei Ji","Chen Wang","Xiaoding Yuan","Prakhar Kaushik","Guofeng Zhang","Jie Liu","Yushan Xie","Yawen Cui","Alan Yuille","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2308.11737v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.11732v1","updated":"2023-08-22T18:43:57Z","published":"2023-08-22T18:43:57Z","title":"(Un)fair Exposure in Deep Face Rankings at a Distance","summary":" Law enforcement regularly faces the challenge of ranking suspects from their\nfacial images. Deep face models aid this process but frequently introduce\nbiases that disproportionately affect certain demographic segments. While bias\ninvestigation is common in domains like job candidate ranking, the field of\nforensic face rankings remains underexplored. In this paper, we propose a novel\nexperimental framework, encompassing six state-of-the-art face encoders and two\npublic data sets, designed to scrutinize the extent to which demographic groups\nsuffer from biases in exposure in the context of forensic face rankings.\nThrough comprehensive experiments that cover both re-identification and\nidentification tasks, we show that exposure biases within this domain are far\nfrom being countered, demanding attention towards establishing ad-hoc policies\nand corrective measures. The source code is available at\nhttps://github.com/atzoriandrea/ijcb2023-unfair-face-rankings\n","authors":["Andrea Atzori","Gianni Fenu","Mirko Marras"],"pdf_url":"https://arxiv.org/pdf/2308.11732v1.pdf","comment":"Accepted as a full paper at IJCB 2023 Special Session \"Long-Range\n Biometrics Challenges\": 2023 International Joint Conference on Biometrics"},{"id":"http://arxiv.org/abs/2303.17155v2","updated":"2023-08-22T18:13:14Z","published":"2023-03-30T05:25:20Z","title":"Discriminative Class Tokens for Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image diffusion models have enabled the generation\nof diverse and high-quality images. While impressive, the images often fall\nshort of depicting subtle details and are susceptible to errors due to\nambiguity in the input text. One way of alleviating these issues is to train\ndiffusion models on class-labeled datasets. 
This approach has two\ndisadvantages: (i) supervised datasets are generally small compared to\nlarge-scale scraped text-image datasets on which text-to-image models are\ntrained, affecting the quality and diversity of the generated images, or (ii)\nthe input is a hard-coded label, as opposed to free-form text, limiting the\ncontrol over the generated images.\n In this work, we propose a non-invasive fine-tuning technique that\ncapitalizes on the expressive potential of free-form text while achieving high\naccuracy through discriminative signals from a pretrained classifier. This is\ndone by iteratively modifying the embedding of an added input token of a\ntext-to-image diffusion model, by steering generated images toward a given\ntarget class according to a classifier. Our method is fast compared to prior\nfine-tuning methods and does not require a collection of in-class images or\nretraining of a noise-tolerant classifier. We evaluate our method extensively,\nshowing that the generated images are: (i) more accurate and of higher quality\nthan standard diffusion models, (ii) can be used to augment training data in a\nlow-resource setting, and (iii) reveal information about the data used to train\nthe guiding classifier. The code is available at\n\\url{https://github.com/idansc/discriminative_class_tokens}.\n","authors":["Idan Schwartz","Vésteinn Snæbjarnarson","Hila Chefer","Ryan Cotterell","Serge Belongie","Lior Wolf","Sagie Benaim"],"pdf_url":"https://arxiv.org/pdf/2303.17155v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11696v1","updated":"2023-08-22T17:59:30Z","published":"2023-08-22T17:59:30Z","title":"Efficient Benchmarking (of Language Models)","summary":" The increasing versatility of language models LMs has given rise to a new\nclass of benchmarks that comprehensively assess a broad range of capabilities.\nSuch benchmarks are associated with massive computational costs reaching\nthousands of GPU hours per model. However the efficiency aspect of these\nevaluation efforts had raised little discussion in the literature. In this work\nwe present the problem of Efficient Benchmarking namely intelligently reducing\nthe computation costs of LM evaluation without compromising reliability. Using\nthe HELM benchmark as a test case we investigate how different benchmark design\nchoices affect the computation-reliability tradeoff. We propose to evaluate the\nreliability of such decisions by using a new measure Decision Impact on\nReliability DIoR for short. 
We find for example that the current leader on HELM\nmay change by merely removing a low-ranked model from the benchmark and observe\nthat a handful of examples suffice to obtain the correct benchmark ranking.\nConversely a slightly different choice of HELM scenarios varies ranking widely.\nBased on our findings we outline a set of concrete recommendations for more\nefficient benchmark design and utilization practices leading to dramatic cost\nsavings with minimal loss of benchmark reliability often reducing computation\nby x100 or more.\n","authors":["Yotam Perlitz","Elron Bandel","Ariel Gera","Ofir Arviv","Liat Ein-Dor","Eyal Shnarch","Noam Slonim","Michal Shmueli-Scheuer","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2308.11696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.11342v5","updated":"2023-08-22T17:02:42Z","published":"2021-06-21T18:19:46Z","title":"Dive into Deep Learning","summary":" This open-source book represents our attempt to make deep learning\napproachable, teaching readers the concepts, the context, and the code. The\nentire book is drafted in Jupyter notebooks, seamlessly integrating exposition\nfigures, math, and interactive examples with self-contained code. Our goal is\nto offer a resource that could (i) be freely available for everyone; (ii) offer\nsufficient technical depth to provide a starting point on the path to actually\nbecoming an applied machine learning scientist; (iii) include runnable code,\nshowing readers how to solve problems in practice; (iv) allow for rapid\nupdates, both by us and also by the community at large; (v) be complemented by\na forum for interactive discussion of technical details and to answer\nquestions.\n","authors":["Aston Zhang","Zachary C. Lipton","Mu Li","Alexander J. Smola"],"pdf_url":"https://arxiv.org/pdf/2106.11342v5.pdf","comment":"(HTML) https://D2L.ai (GitHub) https://github.com/d2l-ai/d2l-en/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.11551v1","updated":"2023-08-22T16:32:46Z","published":"2023-08-22T16:32:46Z","title":"Multi-event Video-Text Retrieval","summary":" Video-Text Retrieval (VTR) is a crucial multi-modal task in an era of massive\nvideo-text data on the Internet. A plethora of work characterized by using a\ntwo-stream Vision-Language model architecture that learns a joint\nrepresentation of video-text pairs has become a prominent approach for the VTR\ntask. However, these models operate under the assumption of bijective\nvideo-text correspondences and neglect a more practical scenario where video\ncontent usually encompasses multiple events, while texts like user queries or\nwebpage metadata tend to be specific and correspond to single events. This\nestablishes a gap between the previous training objective and real-world\napplications, leading to the potential performance degradation of earlier\nmodels during inference. In this study, we introduce the Multi-event Video-Text\nRetrieval (MeVTR) task, addressing scenarios in which each video contains\nmultiple different events, as a niche scenario of the conventional Video-Text\nRetrieval Task. We present a simple model, Me-Retriever, which incorporates key\nevent video representation and a new MeVTR loss for the MeVTR task.\nComprehensive experiments show that this straightforward framework outperforms\nother models in the Video-to-Text and Text-to-Video tasks, effectively\nestablishing a robust baseline for the MeVTR task. We believe this work serves\nas a strong foundation for future studies. 
Code is available at\nhttps://github.com/gengyuanmax/MeVTR.\n","authors":["Gengyuan Zhang","Jisen Ren","Jindong Gu","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2308.11551v1.pdf","comment":"accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2305.03881v2","updated":"2023-08-22T16:09:59Z","published":"2023-05-06T00:24:44Z","title":"Fairness in Image Search: A Study of Occupational Stereotyping in Image\n Retrieval and its Debiasing","summary":" Multi-modal search engines have experienced significant growth and widespread\nuse in recent years, making them the second most common internet use. While\nsearch engine systems offer a range of services, the image search field has\nrecently become a focal point in the information retrieval community, as the\nadage goes, \"a picture is worth a thousand words\". Although popular search\nengines like Google excel at image search accuracy and agility, there is an\nongoing debate over whether their search results can be biased in terms of\ngender, language, demographics, socio-cultural aspects, and stereotypes. This\npotential for bias can have a significant impact on individuals' perceptions\nand influence their perspectives.\n In this paper, we present our study on bias and fairness in web search, with\na focus on keyword-based image search. We first discuss several kinds of biases\nthat exist in search systems and why it is important to mitigate them. We\nnarrow down our study to assessing and mitigating occupational stereotypes in\nimage search, which is a prevalent fairness issue in image retrieval. For the\nassessment of stereotypes, we take gender as an indicator. We explore various\nopen-source and proprietary APIs for gender identification from images. With\nthese, we examine the extent of gender bias in top-ranked image search results\nobtained for several occupational keywords. To mitigate the bias, we then\npropose a fairness-aware re-ranking algorithm that optimizes (a) relevance of\nthe search result with the keyword and (b) fairness w.r.t genders identified.\nWe experiment on 100 top-ranked images obtained for 10 occupational keywords\nand consider random re-ranking and re-ranking based on relevance as baselines.\nOur experimental results show that the fairness-aware re-ranking algorithm\nproduces rankings with better fairness scores and competitive relevance scores\nthan the baselines.\n","authors":["Swagatika Dash"],"pdf_url":"https://arxiv.org/pdf/2305.03881v2.pdf","comment":"20 Pages, Work uses Proprietary Search Systems from the year 2021"},{"id":"http://arxiv.org/abs/2308.09765v2","updated":"2023-08-22T15:53:18Z","published":"2023-08-18T18:18:55Z","title":"Taken by Surprise: Contrast effect for Similarity Scores","summary":" Accurately evaluating the similarity of object vector embeddings is of\ncritical importance for natural language processing, information retrieval and\nclassification tasks. Popular similarity scores (e.g. cosine similarity) are\nbased on pairs of embedding vectors and disregard the distribution of the\nensemble from which objects are drawn. Human perception of object similarity\nsignificantly depends on the context in which the objects appear. In this work\nwe propose the $\\textit{surprise score}$, an ensemble-normalized similarity\nmetric that encapsulates the contrast effect of human perception and\nsignificantly improves the classification performance on zero- and few-shot\ndocument classification tasks. 
This score quantifies the surprise to find a\ngiven similarity between two elements relative to the pairwise ensemble\nsimilarities. We evaluate this metric on zero/few shot classification and\nclustering tasks and typically find 10-15 % better performance compared to raw\ncosine similarity. Our code is available at\nhttps://github.com/MeetElise/surprise-similarity.\n","authors":["Thomas C. Bachlechner","Mario Martone","Marjorie Schillo"],"pdf_url":"https://arxiv.org/pdf/2308.09765v2.pdf","comment":"9 pages, 2 figures and 4 tables"},{"id":"http://arxiv.org/abs/2308.11512v1","updated":"2023-08-22T15:39:47Z","published":"2023-08-22T15:39:47Z","title":"L^2R: Lifelong Learning for First-stage Retrieval with\n Backward-Compatible Representations","summary":" First-stage retrieval is a critical task that aims to retrieve relevant\ndocument candidates from a large-scale collection. While existing retrieval\nmodels have achieved impressive performance, they are mostly studied on static\ndata sets, ignoring that in the real-world, the data on the Web is continuously\ngrowing with potential distribution drift. Consequently, retrievers trained on\nstatic old data may not suit new-coming data well and inevitably produce\nsub-optimal results. In this work, we study lifelong learning for first-stage\nretrieval, especially focusing on the setting where the emerging documents are\nunlabeled since relevance annotation is expensive and may not keep up with data\nemergence. Under this setting, we aim to develop model updating with two goals:\n(1) to effectively adapt to the evolving distribution with the unlabeled\nnew-coming data, and (2) to avoid re-inferring all embeddings of old documents\nto efficiently update the index each time the model is updated.\n We first formalize the task and then propose a novel Lifelong Learning method\nfor the first-stage Retrieval, namely L^2R. L^2R adopts the typical memory\nmechanism for lifelong learning, and incorporates two crucial components: (1)\nselecting diverse support negatives for model training and memory updating for\neffective model adaptation, and (2) a ranking alignment objective to ensure the\nbackward-compatibility of representations to save the cost of index rebuilding\nwithout hurting the model performance. For evaluation, we construct two new\nbenchmarks from LoTTE and Multi-CPR datasets to simulate the document\ndistribution drift in realistic retrieval scenarios. Extensive experiments show\nthat L^2R significantly outperforms competitive lifelong learning baselines.\n","authors":["Yinqiong Cai","Keping Bi","Yixing Fan","Jiafeng Guo","Wei Chen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2308.11512v1.pdf","comment":"accepted by CIKM2023"},{"id":"http://arxiv.org/abs/2308.11474v1","updated":"2023-08-22T14:42:27Z","published":"2023-08-22T14:42:27Z","title":"Pre-training with Aspect-Content Text Mutual Prediction for Multi-Aspect\n Dense Retrieval","summary":" Grounded on pre-trained language models (PLMs), dense retrieval has been\nstudied extensively on plain text. In contrast, there has been little research\non retrieving data with multiple aspects using dense models. In the scenarios\nsuch as product search, the aspect information plays an essential role in\nrelevance matching, e.g., category: Electronics, Computers, and Pet Supplies. A\ncommon way of leveraging aspect information for multi-aspect retrieval is to\nintroduce an auxiliary classification objective, i.e., using item contents to\npredict the annotated value IDs of item aspects. 
However, by learning the value\nembeddings from scratch, this approach may not capture the various semantic\nsimilarities between the values sufficiently. To address this limitation, we\nleverage the aspect information as text strings rather than class IDs during\npre-training so that their semantic similarities can be naturally captured in\nthe PLMs. To facilitate effective retrieval with the aspect strings, we propose\nmutual prediction objectives between the text of the item aspect and content.\nIn this way, our model makes more sufficient use of aspect information than\nconducting undifferentiated masked language modeling (MLM) on the concatenated\ntext of aspects and content. Extensive experiments on two real-world datasets\n(product and mini-program search) show that our approach can outperform\ncompetitive baselines both treating aspect values as classes and conducting the\nsame MLM for aspect and content strings. Code and related dataset will be\navailable at the URL \\footnote{https://github.com/sunxiaojie99/ATTEMPT}.\n","authors":["Xiaojie Sun","Keping Bi","Jiafeng Guo","Xinyu Ma","Fan Yixing","Hongyu Shan","Qishen Zhang","Zhongyi Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11474v1.pdf","comment":"accepted by cikm2023"},{"id":"http://arxiv.org/abs/2308.11336v1","updated":"2023-08-22T10:28:02Z","published":"2023-08-22T10:28:02Z","title":"On the Opportunities and Challenges of Offline Reinforcement Learning\n for Recommender Systems","summary":" Reinforcement learning serves as a potent tool for modeling dynamic user\ninterests within recommender systems, garnering increasing research attention\nof late. However, a significant drawback persists: its poor data efficiency,\nstemming from its interactive nature. The training of reinforcement\nlearning-based recommender systems demands expensive online interactions to\namass adequate trajectories, essential for agents to learn user preferences.\nThis inefficiency renders reinforcement learning-based recommender systems a\nformidable undertaking, necessitating the exploration of potential solutions.\nRecent strides in offline reinforcement learning present a new perspective.\nOffline reinforcement learning empowers agents to glean insights from offline\ndatasets and deploy learned policies in online settings. Given that recommender\nsystems possess extensive offline datasets, the framework of offline\nreinforcement learning aligns seamlessly. Despite being a burgeoning field,\nworks centered on recommender systems utilizing offline reinforcement learning\nremain limited. This survey aims to introduce and delve into offline\nreinforcement learning within recommender systems, offering an inclusive review\nof existing literature in this domain. Furthermore, we strive to underscore\nprevalent challenges, opportunities, and future pathways, poised to propel\nresearch in this evolving field.\n","authors":["Xiaocong Chen","Siyu Wang","Julian McAuley","Dietmar Jannach","Lina Yao"],"pdf_url":"https://arxiv.org/pdf/2308.11336v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2308.11288v1","updated":"2023-08-22T08:57:44Z","published":"2023-08-22T08:57:44Z","title":"Test Time Embedding Normalization for Popularity Bias Mitigation","summary":" Popularity bias is a widespread problem in the field of recommender systems,\nwhere popular items tend to dominate recommendation results. 
In this work, we\npropose 'Test Time Embedding Normalization' as a simple yet effective strategy\nfor mitigating popularity bias, which surpasses the performance of the previous\nmitigation approaches by a significant margin. Our approach utilizes the\nnormalized item embedding during the inference stage to control the influence\nof embedding magnitude, which is highly correlated with item popularity.\nThrough extensive experiments, we show that our method combined with the\nsampled softmax loss effectively reduces popularity bias compared to previous\napproaches for bias mitigation. We further investigate the relationship between\nuser and item embeddings and find that the angular similarity between\nembeddings distinguishes preferable and non-preferable items regardless of\ntheir popularity. The analysis explains the mechanism behind the success of our\napproach in eliminating the impact of popularity bias. Our code is available at\nhttps://github.com/ml-postech/TTEN.\n","authors":["Dain Kim","Jinhyeok Park","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11288v1.pdf","comment":"5 pages, CIKM 2023"},{"id":"http://arxiv.org/abs/2308.11175v1","updated":"2023-08-22T04:06:56Z","published":"2023-08-22T04:06:56Z","title":"MISSRec: Pre-training and Transferring Multi-modal Interest-aware\n Sequence Representation for Recommendation","summary":" The goal of sequential recommendation (SR) is to predict a user's potential\ninterested items based on her/his historical interaction sequences. Most\nexisting sequential recommenders are developed based on ID features, which,\ndespite their widespread use, often underperform with sparse IDs and struggle\nwith the cold-start problem. Besides, inconsistent ID mappings hinder the\nmodel's transferability, isolating similar recommendation domains that could\nhave been co-optimized. This paper aims to address these issues by exploring\nthe potential of multi-modal information in learning robust and generalizable\nsequence representations. We propose MISSRec, a multi-modal pre-training and\ntransfer learning framework for SR. On the user side, we design a\nTransformer-based encoder-decoder model, where the contextual encoder learns to\ncapture the sequence-level multi-modal synergy while a novel interest-aware\ndecoder is developed to grasp item-modality-interest relations for better\nsequence representation. On the candidate item side, we adopt a dynamic fusion\nmodule to produce user-adaptive item representation, providing more precise\nmatching between users and items. We pre-train the model with contrastive\nlearning objectives and fine-tune it in an efficient manner. Extensive\nexperiments demonstrate the effectiveness and flexibility of MISSRec, promising\na practical solution for real-world recommendation scenarios.\n","authors":["Jinpeng Wang","Ziyun Zeng","Yunxiao Wang","Yuting Wang","Xingyu Lu","Tianxiang Li","Jun Yuan","Rui Zhang","Hai-Tao Zheng","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2308.11175v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11137v1","updated":"2023-08-22T02:34:47Z","published":"2023-08-22T02:34:47Z","title":"Towards Validating Long-Term User Feedbacks in Interactive\n Recommendation Systems","summary":" Interactive Recommender Systems (IRSs) have attracted a lot of attention, due\nto their ability to model interactive processes between users and recommender\nsystems. 
Numerous approaches have adopted Reinforcement Learning (RL)\nalgorithms, as these can directly maximize users' cumulative rewards. In IRS,\nresearchers commonly utilize publicly available review datasets to compare and\nevaluate algorithms. However, user feedback provided in public datasets merely\nincludes instant responses (e.g., a rating), with no inclusion of delayed\nresponses (e.g., the dwell time and the lifetime value). Thus, the question\nremains whether these review datasets are an appropriate choice to evaluate the\nlong-term effects of the IRS. In this work, we revisited experiments on IRS\nwith review datasets and compared RL-based models with a simple reward model\nthat greedily recommends the item with the highest one-step reward. Following\nextensive analysis, we can reveal three main findings: First, a simple greedy\nreward model consistently outperforms RL-based models in maximizing cumulative\nrewards. Second, applying higher weighting to long-term rewards leads to a\ndegradation of recommendation performance. Third, user feedbacks have mere\nlong-term effects on the benchmark datasets. Based on our findings, we conclude\nthat a dataset has to be carefully verified and that a simple greedy baseline\nshould be included for a proper evaluation of RL-based IRS approaches.\n","authors":["Hojoon Lee","Dongyoon Hwang","Kyushik Min","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.11137v1.pdf","comment":"Accepted to SIGIR'22"},{"id":"http://arxiv.org/abs/2308.11131v1","updated":"2023-08-22T02:25:04Z","published":"2023-08-22T02:25:04Z","title":"ReLLa: Retrieval-enhanced Large Language Models for Lifelong Sequential\n Behavior Comprehension in Recommendation","summary":" With large language models (LLMs) achieving remarkable breakthroughs in\nnatural language processing (NLP) domains, LLM-enhanced recommender systems\nhave received much attention and have been actively explored currently. In this\npaper, we focus on adapting and empowering a pure large language model for\nzero-shot and few-shot recommendation tasks. First and foremost, we identify\nand formulate the lifelong sequential behavior incomprehension problem for LLMs\nin recommendation domains, i.e., LLMs fail to extract useful information from a\ntextual context of long user behavior sequence, even if the length of context\nis far from reaching the context limitation of LLMs. To address such an issue\nand improve the recommendation performance of LLMs, we propose a novel\nframework, namely Retrieval-enhanced Large Language models (ReLLa) for\nrecommendation tasks in both zero-shot and few-shot settings. For zero-shot\nrecommendation, we perform semantic user behavior retrieval (SUBR) to improve\nthe data quality of testing samples, which greatly reduces the difficulty for\nLLMs to extract the essential knowledge from user behavior sequences. As for\nfew-shot recommendation, we further design retrieval-enhanced instruction\ntuning (ReiT) by adopting SUBR as a data augmentation technique for training\nsamples. Specifically, we develop a mixed training dataset consisting of both\nthe original data samples and their retrieval-enhanced counterparts. 
We conduct\nextensive experiments on a real-world public dataset (i.e., MovieLens-1M) to\ndemonstrate the superiority of ReLLa compared with existing baseline models, as\nwell as its capability for lifelong sequential behavior comprehension.\n","authors":["Jianghao Lin","Rong Shan","Chenxu Zhu","Kounianhua Du","Bo Chen","Shigang Quan","Ruiming Tang","Yong Yu","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11131v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2308.11127v1","updated":"2023-08-22T02:17:34Z","published":"2023-08-22T02:17:34Z","title":"How Expressive are Graph Neural Networks in Recommendation?","summary":" Graph Neural Networks (GNNs) have demonstrated superior performance on\nvarious graph learning tasks, including recommendation, where they leverage\nuser-item collaborative filtering signals in graphs. However, theoretical\nformulations of their capability are scarce, despite their empirical\neffectiveness in state-of-the-art recommender models. Recently, research has\nexplored the expressiveness of GNNs in general, demonstrating that message\npassing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that\nGNNs combined with random node initialization are universal. Nevertheless, the\nconcept of \"expressiveness\" for GNNs remains vaguely defined. Most existing\nworks adopt the graph isomorphism test as the metric of expressiveness, but\nthis graph-level task may not effectively assess a model's ability in\nrecommendation, where the objective is to distinguish nodes of different\ncloseness. In this paper, we provide a comprehensive theoretical analysis of\nthe expressiveness of GNNs in recommendation, considering three levels of\nexpressiveness metrics: graph isomorphism (graph-level), node automorphism\n(node-level), and topological closeness (link-level). We propose the\ntopological closeness metric to evaluate GNNs' ability to capture the\nstructural distance between nodes, which aligns closely with the objective of\nrecommendation. To validate the effectiveness of this new metric in evaluating\nrecommendation performance, we introduce a learning-less GNN algorithm that is\noptimal on the new metric and can be optimal on the node-level metric with\nsuitable modification. We conduct extensive experiments comparing the proposed\nalgorithm against various types of state-of-the-art GNN models to explore the\nexplainability of the new metric in the recommendation task. For\nreproducibility, implementation codes are available at\nhttps://github.com/HKUDS/GTE.\n","authors":["Xuheng Cai","Lianghao Xia","Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11127v1.pdf","comment":"32nd ACM International Conference on Information and Knowledge\n Management (CIKM) 2023"},{"id":"http://arxiv.org/abs/2308.11103v1","updated":"2023-08-22T00:57:36Z","published":"2023-08-22T00:57:36Z","title":"Anonymity at Risk? Assessing Re-Identification Capabilities of Large\n Language Models","summary":" Anonymity of both natural and legal persons in court rulings is a critical\naspect of privacy protection in the European Union and Switzerland. With the\nadvent of LLMs, concerns about large-scale re-identification of anonymized\npersons are growing. In accordance with the Federal Supreme Court of\nSwitzerland, we explore the potential of LLMs to re-identify individuals in\ncourt rulings by constructing a proof-of-concept using actual legal data from\nthe Swiss federal supreme court. 
Following the initial experiment, we\nconstructed an anonymized Wikipedia dataset as a more rigorous testing ground\nto further investigate the findings. With the introduction and application of\nthe new task of re-identifying people in texts, we also introduce new metrics\nto measure performance. We systematically analyze the factors that influence\nsuccessful re-identifications, identifying model size, input length, and\ninstruction tuning among the most critical determinants. Despite high\nre-identification rates on Wikipedia, even the best LLMs struggled with court\ndecisions. The complexity is attributed to the lack of test datasets, the\nnecessity for substantial training resources, and data sparsity in the\ninformation used for re-identification. In conclusion, this study demonstrates\nthat re-identification using LLMs may not be feasible for now, but as the\nproof-of-concept on Wikipedia showed, it might become possible in the future.\nWe hope that our system can help enhance the confidence in the security of\nanonymized decisions, thus leading to the courts being more confident to\npublish decisions.\n","authors":["Alex Nyffenegger","Matthias Stürmer","Joel Niklaus"],"pdf_url":"https://arxiv.org/pdf/2308.11103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11797v1","updated":"2023-08-22T21:29:55Z","published":"2023-08-22T21:29:55Z","title":"CLIP Multi-modal Hashing: A new baseline CLIPMH","summary":" The multi-modal hashing method is widely used in multimedia retrieval. It can\nfuse multi-source data to generate binary hash code. However, the current\nmulti-modal methods have the problem of low retrieval accuracy. The reason is\nthat the individual backbone networks have limited feature expression\ncapabilities and are not jointly pre-trained on large-scale unsupervised\nmulti-modal data. To solve this problem, we propose a new baseline CLIP\nMulti-modal Hashing (CLIPMH) method. It uses CLIP model to extract text and\nimage features, and then fuse to generate hash code. CLIP improves the\nexpressiveness of each modal feature. In this way, it can greatly improve the\nretrieval performance of multi-modal hashing methods. In comparison to\nstate-of-the-art unsupervised and supervised multi-modal hashing methods,\nexperiments reveal that the proposed CLIPMH can significantly enhance\nperformance (Maximum increase of 8.38%). CLIP also has great advantages over\nthe text and visual backbone networks commonly used before.\n","authors":["Jian Zhu","Mingkai Sheng","Mingda Ke","Zhangmin Huang","Jingfei Chang"],"pdf_url":"https://arxiv.org/pdf/2308.11797v1.pdf","comment":"submit to ICASSP2024"},{"id":"http://arxiv.org/abs/2308.11730v1","updated":"2023-08-22T18:41:31Z","published":"2023-08-22T18:41:31Z","title":"Knowledge Graph Prompting for Multi-Document Question Answering","summary":" The 'pre-train, prompt, predict' paradigm of large language models (LLMs) has\nachieved remarkable success in open-domain question answering (OD-QA). However,\nfew works explore this paradigm in the scenario of multi-document question\nanswering (MD-QA), a task demanding a thorough understanding of the logical\nassociations among the contents and structures of different documents. To fill\nthis crucial gap, we propose a Knowledge Graph Prompting (KGP) method to\nformulate the right context in prompting LLMs for MD-QA, which consists of a\ngraph construction module and a graph traversal module. 
For graph construction,\nwe create a knowledge graph (KG) over multiple documents with nodes symbolizing\npassages or document structures (e.g., pages/tables), and edges denoting the\nsemantic/lexical similarity between passages or intra-document structural\nrelations. For graph traversal, we design an LM-guided graph traverser that\nnavigates across nodes and gathers supporting passages assisting LLMs in MD-QA.\nThe constructed graph serves as the global ruler that regulates the\ntransitional space among passages and reduces retrieval latency. Concurrently,\nthe LM-guided traverser acts as a local navigator that gathers pertinent\ncontext to progressively approach the question and guarantee retrieval quality.\nExtensive experiments underscore the efficacy of KGP for MD-QA, signifying the\npotential of leveraging graphs in enhancing the prompt design for LLMs. Our\ncode is at https://github.com/YuWVandy/KG-LLM-MDQA.\n","authors":["Yu Wang","Nedim Lipka","Ryan A. Rossi","Alexa Siu","Ruiyi Zhang","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.11730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11728v1","updated":"2023-08-22T18:39:39Z","published":"2023-08-22T18:39:39Z","title":"Invariant representation learning for sequential recommendation","summary":" Sequential recommendation involves automatically recommending the next item\nto users based on their historical item sequence. While most prior research\nemploys RNN or transformer methods to glean information from the item\nsequence-generating probabilities for each user-item pair and recommending the\ntop items, these approaches often overlook the challenge posed by spurious\nrelationships. This paper specifically addresses these spurious relations. We\nintroduce a novel sequential recommendation framework named Irl4Rec. This\nframework harnesses invariant learning and employs a new objective that factors\nin the relationship between spurious variables and adjustment variables during\nmodel training. This approach aids in identifying spurious relations.\nComparative analyses reveal that our framework outperforms three typical\nmethods, underscoring the effectiveness of our model. Moreover, an ablation\nstudy further demonstrates the critical role our model plays in detecting\nspurious relations.\n","authors":["Xiaofan Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11728v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2202.12040v3","updated":"2023-08-22T17:57:36Z","published":"2022-02-24T11:40:44Z","title":"Self-Training: A Survey","summary":" Semi-supervised algorithms aim to learn prediction functions from a small set\nof labeled observations and a large set of unlabeled observations. Because this\nframework is relevant in many applications, they have received a lot of\ninterest in both academia and industry. Among the existing techniques,\nself-training methods have undoubtedly attracted greater attention in recent\nyears. These models are designed to find the decision boundary on low density\nregions without making additional assumptions about the data distribution, and\nuse the unsigned output score of a learned classifier, or its margin, as an\nindicator of confidence. The working principle of self-training algorithms is\nto learn a classifier iteratively by assigning pseudo-labels to the set of\nunlabeled training samples with a margin greater than a certain threshold. 
The\npseudo-labeled examples are then used to enrich the labeled training data and\nto train a new classifier in conjunction with the labeled training set. In this\npaper, we present self-training methods for binary and multi-class\nclassification; as well as their variants and two related approaches, namely\nconsistency-based approaches and transductive learning. We examine the impact\nof significant self-training features on various methods, using different\ngeneral and image classification benchmarks, and we discuss our ideas for\nfuture research in self-training. To the best of our knowledge, this is the\nfirst thorough and complete survey on this subject.\n","authors":["Massih-Reza Amini","Vasilii Feofanov","Loic Pauletto","Emilie Devijver","Yury Maximov"],"pdf_url":"https://arxiv.org/pdf/2202.12040v3.pdf","comment":"27 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.11604v1","updated":"2023-08-22T17:52:44Z","published":"2023-08-22T17:52:44Z","title":"Semantic Multi-Resolution Communications","summary":" Deep learning based joint source-channel coding (JSCC) has demonstrated\nsignificant advancements in data reconstruction compared to separate\nsource-channel coding (SSCC). This superiority arises from the suboptimality of\nSSCC when dealing with finite block-length data. Moreover, SSCC falls short in\nreconstructing data in a multi-user and/or multi-resolution fashion, as it only\ntries to satisfy the worst channel and/or the highest quality data. To overcome\nthese limitations, we propose a novel deep learning multi-resolution JSCC\nframework inspired by the concept of multi-task learning (MTL). This proposed\nframework excels at encoding data for different resolutions through\nhierarchical layers and effectively decodes it by leveraging both current and\npast layers of encoded data. Moreover, this framework holds great potential for\nsemantic communication, where the objective extends beyond data reconstruction\nto preserving specific semantic attributes throughout the communication\nprocess. These semantic features could be crucial elements such as class\nlabels, essential for classification tasks, or other key attributes that\nrequire preservation. Within this framework, each level of encoded data can be\ncarefully designed to retain specific data semantics. As a result, the\nprecision of a semantic classifier can be progressively enhanced across\nsuccessive layers, emphasizing the preservation of targeted semantics\nthroughout the encoding and decoding stages. We conduct experiments on MNIST\nand CIFAR10 dataset. The experiment with both datasets illustrates that our\nproposed method is capable of surpassing the SSCC method in reconstructing data\nwith different resolutions, enabling the extraction of semantic features with\nheightened confidence in successive layers. This capability is particularly\nadvantageous for prioritizing and preserving more crucial semantic features\nwithin the datasets.\n","authors":["Matin Mortaheb","Mohammad A. Amir Khojastepour","Srimat T. Chakradhar","Sennur Ulukus"],"pdf_url":"https://arxiv.org/pdf/2308.11604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11601v1","updated":"2023-08-22T17:48:24Z","published":"2023-08-22T17:48:24Z","title":"Tryage: Real-time, intelligent Routing of User Prompts to Large Language\n Model","summary":" The introduction of the transformer architecture and the self-attention\nmechanism has led to an explosive production of language models trained on\nspecific downstream tasks and data domains. 
With over 200,000 models in the\nHugging Face ecosystem, users grapple with selecting and optimizing models to\nsuit multifaceted workflows and data domains while addressing computational,\nsecurity, and recency concerns. There is an urgent need for machine learning\nframeworks that can eliminate the burden of model selection and customization\nand unleash the incredible power of the vast emerging model library for end\nusers. Here, we propose a context-aware routing system, Tryage, that leverages\na language model router for optimal selection of expert models from a model\nlibrary based on analysis of individual input prompts. Inspired by the thalamic\nrouter in the brain, Tryage employs a perceptive router to predict down-stream\nmodel performance on prompts and, then, makes a routing decision using an\nobjective function that integrates performance predictions with user goals and\nconstraints that are incorporated through flags (e.g., model size, model\nrecency). Tryage allows users to explore a Pareto front and automatically\ntrade-off between task accuracy and secondary goals including minimization of\nmodel size, recency, security, verbosity, and readability. Across heterogeneous\ndata sets that include code, text, clinical data, and patents, the Tryage\nframework surpasses Gorilla and GPT3.5 turbo in dynamic model selection\nidentifying the optimal model with an accuracy of 50.9%, compared to 23.6% by\nGPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how\nrouting models can be applied to program and control the behavior of\nmulti-model LLM systems to maximize efficient use of the expanding and evolving\nlanguage model ecosystem.\n","authors":["Surya Narayanan Hari","Matt Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.11601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08708v3","updated":"2023-08-22T17:33:15Z","published":"2023-08-17T00:10:16Z","title":"Consciousness in Artificial Intelligence: Insights from the Science of\n Consciousness","summary":" Whether current or near-term AI systems could be conscious is a topic of\nscientific interest and increasing public concern. This report argues for, and\nexemplifies, a rigorous and empirically grounded approach to AI consciousness:\nassessing existing AI systems in detail, in light of our best-supported\nneuroscientific theories of consciousness. We survey several prominent\nscientific theories of consciousness, including recurrent processing theory,\nglobal workspace theory, higher-order theories, predictive processing, and\nattention schema theory. From these theories we derive \"indicator properties\"\nof consciousness, elucidated in computational terms that allow us to assess AI\nsystems for these properties. We use these indicator properties to assess\nseveral recent AI systems, and we discuss how future systems might implement\nthem. Our analysis suggests that no current AI systems are conscious, but also\nsuggests that there are no obvious technical barriers to building AI systems\nwhich satisfy these indicators.\n","authors":["Patrick Butlin","Robert Long","Eric Elmoznino","Yoshua Bengio","Jonathan Birch","Axel Constant","George Deane","Stephen M. Fleming","Chris Frith","Xu Ji","Ryota Kanai","Colin Klein","Grace Lindsay","Matthias Michel","Liad Mudrik","Megan A. K. 
Peters","Eric Schwitzgebel","Jonathan Simon","Rufin VanRullen"],"pdf_url":"https://arxiv.org/pdf/2308.08708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09729v2","updated":"2023-08-22T17:32:16Z","published":"2023-08-17T16:59:50Z","title":"MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large\n Language Models","summary":" LLMs usually exhibit limitations in their ability to incorporate new\nknowledge, the generation of hallucinations, and the transparency of their\ndecision-making process. In this paper, we explore how to prompt LLMs with\nknowledge graphs (KG), working as a remedy to engage LLMs with up-to-date\nknowledge and elicit the reasoning pathways from LLMs. Specifically, we build a\nprompting pipeline that endows LLMs with the capability of comprehending KG\ninputs and inferring with a combined implicit knowledge and the retrieved\nexternal knowledge. In addition, we investigate eliciting the mind map on which\nLLMs perform the reasoning and generate the answers. It is identified that the\nproduced mind map exhibits the reasoning pathways of LLMs grounded on the\nontology of knowledge, hence bringing the prospects of probing and gauging LLM\ninference in production. The experiments on three question & answering datasets\nalso show that MindMap prompting leads to a striking empirical gain. For\ninstance, prompting a GPT-3.5 with MindMap yields an overwhelming performance\nover GPT-4 consistently. We also demonstrate that with structured facts\nretrieved from KG, MindMap can outperform a series of\nprompting-with-document-retrieval methods, benefiting from more accurate,\nconcise, and comprehensive knowledge from KGs.\n","authors":["Yilin Wen","Zifeng Wang","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09729v2.pdf","comment":"7 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2011.05001v7","updated":"2023-08-22T17:20:10Z","published":"2020-11-10T09:32:50Z","title":"MMD-Regularized Unbalanced Optimal Transport","summary":" We study the unbalanced optimal transport (UOT) problem, where the marginal\nconstraints are enforced using Maximum Mean Discrepancy (MMD) regularization.\nOur work is motivated by the observation that the literature on UOT is focused\non regularization based on $\\phi$-divergence (e.g., KL divergence). Despite the\npopularity of MMD, its role as a regularizer in the context of UOT seems less\nunderstood. We begin by deriving the dual of MMD-regularized UOT (MMD-UOT),\nwhich helps us prove other useful properties. One interesting outcome of this\nduality result is that MMD-UOT induces novel metrics, which not only lift the\nground metric like the Wasserstein but are also efficient to estimate like the\nMMD. Further, we present finite-dimensional convex programs for estimating\nMMD-UOT and the corresponding barycenter solely based on the samples from the\nmeasures being transported. Under mild conditions, we prove that our\nconvex-program-based estimators are consistent and the estimation error decays\nat a rate $\\mathcal{O}\\left(m^{-\\frac{1}{2}}\\right)$, where $m$ is the number\nof samples. As far as we know, such error bounds that are free from the curse\nof dimensionality are not known for $\\phi$-divergence regularized UOT. Finally,\nwe discuss how the proposed convex programs can be solved efficiently using\naccelerated projected gradient descent. 
Our experiments show that MMD-UOT\nconsistently outperforms popular baselines, including KL-regularized UOT and\nMMD, in diverse machine learning applications.\n","authors":["Piyushi Manupriya","J. Saketha Nath","Pratik Jawanpuria"],"pdf_url":"https://arxiv.org/pdf/2011.05001v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.14995v2","updated":"2023-08-22T17:16:52Z","published":"2022-08-31T17:50:04Z","title":"Discovering Conservation Laws using Optimal Transport and Manifold\n Learning","summary":" Conservation laws are key theoretical and practical tools for understanding,\ncharacterizing, and modeling nonlinear dynamical systems. However, for many\ncomplex systems, the corresponding conserved quantities are difficult to\nidentify, making it hard to analyze their dynamics and build stable predictive\nmodels. Current approaches for discovering conservation laws often depend on\ndetailed dynamical information or rely on black box parametric deep learning\nmethods. We instead reformulate this task as a manifold learning problem and\npropose a non-parametric approach for discovering conserved quantities. We test\nthis new approach on a variety of physical systems and demonstrate that our\nmethod is able to both identify the number of conserved quantities and extract\ntheir values. Using tools from optimal transport theory and manifold learning,\nour proposed method provides a direct geometric approach to identifying\nconservation laws that is both robust and interpretable without requiring an\nexplicit model of the system nor accurate time information.\n","authors":["Peter Y. Lu","Rumen Dangovski","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2208.14995v2.pdf","comment":"30 pages, 15 figures (7 main text, 8 supplemental), 3 tables\n (supplemental)"},{"id":"http://arxiv.org/abs/2308.11567v1","updated":"2023-08-22T17:08:47Z","published":"2023-08-22T17:08:47Z","title":"Low Tensor Rank Learning of Neural Dynamics","summary":" Learning relies on coordinated synaptic changes in recurrently connected\npopulations of neurons. Therefore, understanding the collective evolution of\nsynaptic connectivity over learning is a key challenge in neuroscience and\nmachine learning. In particular, recent work has shown that the weight matrices\nof task-trained RNNs are typically low rank, but how this low rank structure\nunfolds over learning is unknown. To address this, we investigate the rank of\nthe 3-tensor formed by the weight matrices throughout learning. By fitting RNNs\nof varying rank to large-scale neural recordings during a motor learning task,\nwe find that the inferred weights are low-tensor-rank and therefore evolve over\na fixed low-dimensional subspace throughout the entire course of learning. We\nnext validate the observation of low-tensor-rank learning on an RNN trained to\nsolve the same task by performing a low-tensor-rank decomposition directly on\nthe ground truth weights, and by showing that the method we applied to the data\nfaithfully recovers this low rank structure. Finally, we present a set of\nmathematical results bounding the matrix and tensor ranks of gradient descent\nlearning dynamics which show that low-tensor-rank weights emerge naturally in\nRNNs trained to solve low-dimensional tasks. 
Taken together, our findings\nprovide novel constraints on the evolution of population connectivity over\nlearning in both biological and artificial neural networks, and enable reverse\nengineering of learning-induced changes in recurrent network dynamics from\nlarge-scale neural recordings.\n","authors":["Arthur Pellegrino","N Alex Cayco-Gajic","Angus Chadwick"],"pdf_url":"https://arxiv.org/pdf/2308.11567v1.pdf","comment":"The last two authors contributed equally"},{"id":"http://arxiv.org/abs/2304.13169v2","updated":"2023-08-22T16:42:25Z","published":"2023-04-25T22:02:09Z","title":"SAFE: Machine Unlearning With Shard Graphs","summary":" We present Synergy Aware Forgetting Ensemble (SAFE), a method to adapt large\nmodels on a diverse collection of data while minimizing the expected cost to\nremove the influence of training samples from the trained model. This process,\nalso known as selective forgetting or unlearning, is often conducted by\npartitioning a dataset into shards, training fully independent models on each,\nthen ensembling the resulting models. Increasing the number of shards reduces\nthe expected cost to forget but at the same time it increases inference cost\nand reduces the final accuracy of the model since synergistic information\nbetween samples is lost during the independent model training. Rather than\ntreating each shard as independent, SAFE introduces the notion of a shard\ngraph, which allows incorporating limited information from other shards during\ntraining, trading off a modest increase in expected forgetting cost with a\nsignificant increase in accuracy, all while still attaining complete removal of\nresidual influence after forgetting. SAFE uses a lightweight system of adapters\nwhich can be trained while reusing most of the computations. This allows SAFE\nto be trained on shards an order-of-magnitude smaller than current\nstate-of-the-art methods (thus reducing the forgetting costs) while also\nmaintaining high accuracy, as we demonstrate empirically on fine-grained\ncomputer vision datasets.\n","authors":["Yonatan Dukler","Benjamin Bowman","Alessandro Achille","Aditya Golatkar","Ashwin Swaminathan","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2304.13169v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10425v2","updated":"2023-08-22T16:36:44Z","published":"2023-08-21T02:27:13Z","title":"Spatio-Temporal Adaptive Embedding Makes Vanilla Transformer SOTA for\n Traffic Forecasting","summary":" With the rapid development of the Intelligent Transportation System (ITS),\naccurate traffic forecasting has emerged as a critical challenge. The key\nbottleneck lies in capturing the intricate spatio-temporal traffic patterns. In\nrecent years, numerous neural networks with complicated architectures have been\nproposed to address this issue. However, the advancements in network\narchitectures have encountered diminishing performance gains. In this study, we\npresent a novel component called spatio-temporal adaptive embedding that can\nyield outstanding results with vanilla transformers. 
Our proposed\nSpatio-Temporal Adaptive Embedding transformer (STAEformer) achieves\nstate-of-the-art performance on five real-world traffic forecasting datasets.\nFurther experiments demonstrate that spatio-temporal adaptive embedding plays a\ncrucial role in traffic forecasting by effectively capturing intrinsic\nspatio-temporal relations and chronological information in traffic time series.\n","authors":["Hangchen Liu","Zheng Dong","Renhe Jiang","Jiewen Deng","Jinliang Deng","Quanjun Chen","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2308.10425v2.pdf","comment":"Accepted as CIKM2023 Short Paper"},{"id":"http://arxiv.org/abs/2308.11551v1","updated":"2023-08-22T16:32:46Z","published":"2023-08-22T16:32:46Z","title":"Multi-event Video-Text Retrieval","summary":" Video-Text Retrieval (VTR) is a crucial multi-modal task in an era of massive\nvideo-text data on the Internet. A plethora of work characterized by using a\ntwo-stream Vision-Language model architecture that learns a joint\nrepresentation of video-text pairs has become a prominent approach for the VTR\ntask. However, these models operate under the assumption of bijective\nvideo-text correspondences and neglect a more practical scenario where video\ncontent usually encompasses multiple events, while texts like user queries or\nwebpage metadata tend to be specific and correspond to single events. This\nestablishes a gap between the previous training objective and real-world\napplications, leading to the potential performance degradation of earlier\nmodels during inference. In this study, we introduce the Multi-event Video-Text\nRetrieval (MeVTR) task, addressing scenarios in which each video contains\nmultiple different events, as a niche scenario of the conventional Video-Text\nRetrieval Task. We present a simple model, Me-Retriever, which incorporates key\nevent video representation and a new MeVTR loss for the MeVTR task.\nComprehensive experiments show that this straightforward framework outperforms\nother models in the Video-to-Text and Text-to-Video tasks, effectively\nestablishing a robust baseline for the MeVTR task. We believe this work serves\nas a strong foundation for future studies. Code is available at\nhttps://github.com/gengyuanmax/MeVTR.\n","authors":["Gengyuan Zhang","Jisen Ren","Jindong Gu","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2308.11551v1.pdf","comment":"accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2308.10236v2","updated":"2023-08-22T16:09:09Z","published":"2023-08-20T11:49:12Z","title":"FedSIS: Federated Split Learning with Intermediate Representation\n Sampling for Privacy-preserving Generalized Face Presentation Attack\n Detection","summary":" Lack of generalization to unseen domains/attacks is the Achilles heel of most\nface presentation attack detection (FacePAD) algorithms. Existing attempts to\nenhance the generalizability of FacePAD solutions assume that data from\nmultiple source domains are available with a single entity to enable\ncentralized training. In practice, data from different source domains may be\ncollected by diverse entities, who are often unable to share their data due to\nlegal and privacy constraints. While collaborative learning paradigms such as\nfederated learning (FL) can overcome this problem, standard FL methods are\nill-suited for domain generalization because they struggle to surmount the twin\nchallenges of handling non-iid client data distributions during training and\ngeneralizing to unseen domains during inference. 
In this work, a novel\nframework called Federated Split learning with Intermediate representation\nSampling (FedSIS) is introduced for privacy-preserving domain generalization.\nIn FedSIS, a hybrid Vision Transformer (ViT) architecture is learned using a\ncombination of FL and split learning to achieve robustness against statistical\nheterogeneity in the client data distributions without any sharing of raw data\n(thereby preserving privacy). To further improve generalization to unseen\ndomains, a novel feature augmentation strategy called intermediate\nrepresentation sampling is employed, and discriminative information from\nintermediate blocks of a ViT is distilled using a shared adapter network. The\nFedSIS approach has been evaluated on two well-known benchmarks for\ncross-domain FacePAD to demonstrate that it is possible to achieve\nstate-of-the-art generalization performance without data sharing. Code:\nhttps://github.com/Naiftt/FedSIS\n","authors":["Naif Alkhunaizi","Koushik Srivatsan","Faris Almalik","Ibrahim Almakky","Karthik Nandakumar"],"pdf_url":"https://arxiv.org/pdf/2308.10236v2.pdf","comment":"Accepted to the IEEE International Joint Conference on Biometrics\n (IJCB), 2023"},{"id":"http://arxiv.org/abs/2308.11532v1","updated":"2023-08-22T15:59:25Z","published":"2023-08-22T15:59:25Z","title":"A free from local minima algorithm for training regressive MLP neural\n networks","summary":" In this article an innovative method for training regressive MLP networks is\npresented, which is not subject to local minima. The Error-Back-Propagation\nalgorithm, proposed by William-Hinton-Rummelhart, has had the merit of\nfavouring the development of machine learning techniques, which has permeated\nevery branch of research and technology since the mid-1980s. This extraordinary\nsuccess is largely due to the black-box approach, but this same factor was also\nseen as a limitation, as soon more challenging problems were approached. One of\nthe most critical aspects of the training algorithms was that of local minima\nof the loss function, typically the mean squared error of the output on the\ntraining set. In fact, as the most popular training algorithms are driven by\nthe derivatives of the loss function, there is no possibility to evaluate if a\nreached minimum is local or global. The algorithm presented in this paper\navoids the problem of local minima, as the training is based on the properties\nof the distribution of the training set, or better on its image internal to the\nneural network. The performance of the algorithm is shown for a well-known\nbenchmark.\n","authors":["Augusto Montisci"],"pdf_url":"https://arxiv.org/pdf/2308.11532v1.pdf","comment":"9 pages, 4 figures, theoretical work"},{"id":"http://arxiv.org/abs/2306.10168v2","updated":"2023-08-22T15:57:35Z","published":"2023-06-16T20:11:38Z","title":"Beyond Geometry: Comparing the Temporal Structure of Computation in\n Neural Circuits with Dynamical Similarity Analysis","summary":" How can we tell whether two neural networks are utilizing the same internal\nprocesses for a particular computation? This question is pertinent for multiple\nsubfields of both neuroscience and machine learning, including neuroAI,\nmechanistic interpretability, and brain-machine interfaces. Standard approaches\nfor comparing neural networks focus on the spatial geometry of latent states.\nYet in recurrent networks, computations are implemented at the level of neural\ndynamics, which do not have a simple one-to-one mapping with geometry. 
To\nbridge this gap, we introduce a novel similarity metric that compares two\nsystems at the level of their dynamics. Our method incorporates two components:\nUsing recent advances in data-driven dynamical systems theory, we learn a\nhigh-dimensional linear system that accurately captures core features of the\noriginal nonlinear dynamics. Next, we compare these linear approximations via a\nnovel extension of Procrustes Analysis that accounts for how vector fields\nchange under orthogonal transformation. Via four case studies, we demonstrate\nthat our method effectively identifies and distinguishes dynamic structure in\nrecurrent neural networks (RNNs), whereas geometric methods fall short. We\nadditionally show that our method can distinguish learning rules in an\nunsupervised manner. Our method therefore opens the door to novel data-driven\nanalyses of the temporal structure of neural computation, and to more rigorous\ntesting of RNNs as models of the brain.\n","authors":["Mitchell Ostrow","Adam Eisen","Leo Kozachkov","Ila Fiete"],"pdf_url":"https://arxiv.org/pdf/2306.10168v2.pdf","comment":"21 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.09765v2","updated":"2023-08-22T15:53:18Z","published":"2023-08-18T18:18:55Z","title":"Taken by Surprise: Contrast effect for Similarity Scores","summary":" Accurately evaluating the similarity of object vector embeddings is of\ncritical importance for natural language processing, information retrieval and\nclassification tasks. Popular similarity scores (e.g cosine similarity) are\nbased on pairs of embedding vectors and disregard the distribution of the\nensemble from which objects are drawn. Human perception of object similarity\nsignificantly depends on the context in which the objects appear. In this work\nwe propose the $\\textit{surprise score}$, an ensemble-normalized similarity\nmetric that encapsulates the contrast effect of human perception and\nsignificantly improves the classification performance on zero- and few-shot\ndocument classification tasks. This score quantifies the surprise to find a\ngiven similarity between two elements relative to the pairwise ensemble\nsimilarities. We evaluate this metric on zero/few shot classification and\nclustering tasks and typically find 10-15 % better performance compared to raw\ncosine similarity. Our code is available at\nhttps://github.com/MeetElise/surprise-similarity.\n","authors":["Thomas C. Bachlechner","Mario Martone","Marjorie Schillo"],"pdf_url":"https://arxiv.org/pdf/2308.09765v2.pdf","comment":"9 pages, 2 figures and 4 tables"},{"id":"http://arxiv.org/abs/2308.11522v1","updated":"2023-08-22T15:52:37Z","published":"2023-08-22T15:52:37Z","title":"ReLiCADA -- Reservoir Computing using Linear Cellular Automata Design\n Algorithm","summary":" In this paper, we present a novel algorithm to optimize the design of\nReservoir Computing using Cellular Automata models for time series\napplications. Besides selecting the models' hyperparameters, the proposed\nalgorithm particularly solves the open problem of linear Cellular Automaton\nrule selection. The selection method pre-selects only a few promising candidate\nrules out of an exponentially growing rule space. When applied to relevant\nbenchmark datasets, the selected rules achieve low errors, with the best rules\nbeing among the top 5% of the overall rule space. 
The algorithm was developed\nbased on mathematical analysis of linear Cellular Automaton properties and is\nbacked by almost one million experiments, adding up to a computational runtime\nof nearly one year. Comparisons to other state-of-the-art time series models\nshow that the proposed Reservoir Computing using Cellular Automata models have\nlower computational complexity, at the same time, achieve lower errors. Hence,\nour approach reduces the time needed for training and hyperparameter\noptimization by up to several orders of magnitude.\n","authors":["Jonas Kantic","Fabian C. Legl","Walter Stechele","Jakob Hermann"],"pdf_url":"https://arxiv.org/pdf/2308.11522v1.pdf","comment":"19 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.11518v1","updated":"2023-08-22T15:47:58Z","published":"2023-08-22T15:47:58Z","title":"EM for Mixture of Linear Regression with Clustered Data","summary":" Modern data-driven and distributed learning frameworks deal with diverse\nmassive data generated by clients spread across heterogeneous environments.\nIndeed, data heterogeneity is a major bottleneck in scaling up many distributed\nlearning paradigms. In many settings however, heterogeneous data may be\ngenerated in clusters with shared structures, as is the case in several\napplications such as federated learning where a common latent variable governs\nthe distribution of all the samples generated by a client. It is therefore\nnatural to ask how the underlying clustered structures in distributed data can\nbe exploited to improve learning schemes. In this paper, we tackle this\nquestion in the special case of estimating $d$-dimensional parameters of a\ntwo-component mixture of linear regressions problem where each of $m$ nodes\ngenerates $n$ samples with a shared latent variable. We employ the well-known\nExpectation-Maximization (EM) method to estimate the maximum likelihood\nparameters from $m$ batches of dependent samples each containing $n$\nmeasurements. Discarding the clustered structure in the mixture model, EM is\nknown to require $O(\\log(mn/d))$ iterations to reach the statistical accuracy\nof $O(\\sqrt{d/(mn)})$. In contrast, we show that if initialized properly, EM on\nthe structured data requires only $O(1)$ iterations to reach the same\nstatistical accuracy, as long as $m$ grows up as $e^{o(n)}$. Our analysis\nestablishes and combines novel asymptotic optimization and generalization\nguarantees for population and empirical EM with dependent samples, which may be\nof independent interest.\n","authors":["Amirhossein Reisizadeh","Khashayar Gatmiry","Asuman Ozdaglar"],"pdf_url":"https://arxiv.org/pdf/2308.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11513v1","updated":"2023-08-22T15:40:03Z","published":"2023-08-22T15:40:03Z","title":"TrackFlow: Multi-Object Tracking with Normalizing Flows","summary":" The field of multi-object tracking has recently seen a renewed interest in\nthe good old schema of tracking-by-detection, as its simplicity and strong\npriors spare it from the complex design and painful babysitting of\ntracking-by-attention approaches. In view of this, we aim at extending\ntracking-by-detection to multi-modal settings, where a comprehensive cost has\nto be computed from heterogeneous information e.g., 2D motion cues, visual\nappearance, and pose estimates. More precisely, we follow a case study where a\nrough estimate of 3D information is also available and must be merged with\nother traditional metrics (e.g., the IoU). 
To achieve that, recent approaches\nresort to either simple rules or complex heuristics to balance the contribution\nof each cost. However, i) they require careful tuning of tailored\nhyperparameters on a hold-out set, and ii) they imply these costs to be\nindependent, which does not hold in reality. We address these issues by\nbuilding upon an elegant probabilistic formulation, which considers the cost of\na candidate association as the negative log-likelihood yielded by a deep\ndensity estimator, trained to model the conditional joint probability\ndistribution of correct associations. Our experiments, conducted on both\nsimulated and real benchmarks, show that our approach consistently enhances the\nperformance of several tracking-by-detection algorithms.\n","authors":["Gianluca Mancusi","Aniello Panariello","Angelo Porrello","Matteo Fabbri","Simone Calderara","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2308.11513v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11511v1","updated":"2023-08-22T15:39:29Z","published":"2023-08-22T15:39:29Z","title":"Mode Combinability: Exploring Convex Combinations of Permutation Aligned\n Models","summary":" We explore element-wise convex combinations of two permutation-aligned neural\nnetwork parameter vectors $\\Theta_A$ and $\\Theta_B$ of size $d$. We conduct\nextensive experiments by examining various distributions of such model\ncombinations parametrized by elements of the hypercube $[0,1]^{d}$ and its\nvicinity. Our findings reveal that broad regions of the hypercube form surfaces\nof low loss values, indicating that the notion of linear mode connectivity\nextends to a more general phenomenon which we call mode combinability. We also\nmake several novel observations regarding linear mode connectivity and model\nre-basin. We demonstrate a transitivity property: two models re-based to a\ncommon third model are also linear mode connected, and a robustness property:\neven with significant perturbations of the neuron matchings the resulting\ncombinations continue to form a working model. Moreover, we analyze the\nfunctional and weight similarity of model combinations and show that such\ncombinations are non-vacuous in the sense that there are significant functional\ndifferences between the resulting models.\n","authors":["Adrián Csiszárik","Melinda F. Kiss","Péter Kőrösi-Szabó","Márton Muntag","Gergely Papp","Dániel Varga"],"pdf_url":"https://arxiv.org/pdf/2308.11511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05557v2","updated":"2023-08-22T15:37:34Z","published":"2023-06-08T21:01:24Z","title":"On Performance Discrepancies Across Local Homophily Levels in Graph\n Neural Networks","summary":" Graph Neural Network (GNN) research has highlighted a relationship between\nhigh homophily (i.e., the tendency of nodes of the same class to connect) and\nstrong predictive performance in node classification. However, recent work has\nfound the relationship to be more nuanced, demonstrating that simple GNNs can\nlearn in certain heterophilous settings. To resolve these conflicting findings\nand align closer to real-world datasets, we go beyond the assumption of a\nglobal graph homophily level and study the performance of GNNs when the local\nhomophily level of a node deviates from the global homophily level. Through\ntheoretical and empirical analysis, we systematically demonstrate how shifts in\nlocal homophily can introduce performance degradation, leading to performance\ndiscrepancies across local homophily levels. 
We ground the practical\nimplications of this work through granular analysis on five real-world datasets\nwith varying global homophily levels, demonstrating that (a) GNNs can fail to\ngeneralize to test nodes that deviate from the global homophily of a graph, and\n(b) high local homophily does not necessarily confer high performance for a\nnode. We further show that GNNs designed for globally heterophilous graphs can\nalleviate performance discrepancy by improving performance across local\nhomophily levels, offering a new perspective on how these GNNs achieve stronger\nglobal performance.\n","authors":["Donald Loveland","Jiong Zhu","Mark Heimann","Benjamin Fish","Michael T. Shaub","Danai Koutra"],"pdf_url":"https://arxiv.org/pdf/2306.05557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11195v2","updated":"2023-08-22T15:35:25Z","published":"2023-05-18T14:03:47Z","title":"DClEVerNet: Deep Combinatorial Learning for Efficient EV Charging\n Scheduling in Large-scale Networked Facilities","summary":" With the electrification of transportation, the rising uptake of electric\nvehicles (EVs) might stress distribution networks significantly, leaving their\nperformance degraded and stability jeopardized. To accommodate these new loads\ncost-effectively, modern power grids require coordinated or ``smart'' charging\nstrategies capable of optimizing EV charging scheduling in a scalable and\nefficient fashion. With this in view, the present work focuses on reservation\nmanagement programs for large-scale, networked EV charging stations. We\nformulate a time-coupled binary optimization problem that maximizes EV users'\ntotal welfare gain while accounting for the network's available power capacity\nand stations' occupancy limits. To tackle the problem at scale while retaining\nhigh solution quality, a data-driven optimization framework combining\ntechniques from the fields of Deep Learning and Approximation Algorithms is\nintroduced. The framework's key ingredient is a novel input-output processing\nscheme for neural networks that allows direct extrapolation to problem sizes\nsubstantially larger than those included in the training set. Extensive\nnumerical simulations based on synthetic and real-world data traces verify the\neffectiveness and superiority of the presented approach over two representative\nscheduling algorithms. Lastly, we round up the contributions by listing several\nimmediate extensions to the proposed framework and outlining the prospects for\nfurther exploration.\n","authors":["Bushra Alshehhi","Areg Karapetyan","Khaled Elbassioni","Sid Chi-Kin Chau","Majid Khonji"],"pdf_url":"https://arxiv.org/pdf/2305.11195v2.pdf","comment":"Published in the proceedings of the 14th ACM International Conference\n on Future Energy Systems (Best paper award nominee).\n https://dl.acm.org/doi/abs/10.1145/3575813.3595205"},{"id":"http://arxiv.org/abs/2304.03193v2","updated":"2023-08-22T15:23:07Z","published":"2023-04-06T16:17:28Z","title":"Improving automatic endoscopic stone recognition using a multi-view\n fusion approach enhanced with two-step transfer learning","summary":" This contribution presents a deep-learning method for extracting and fusing\nimage information acquired from different viewpoints, with the aim to produce\nmore discriminant object features for the identification of the type of kidney\nstones seen in endoscopic images. The model was further improved with a\ntwo-step transfer learning approach and by attention blocks to refine the\nlearned feature maps. 
Deep feature fusion strategies improved the results of\nsingle view extraction backbone models by more than 6% in terms of accuracy of\nthe kidney stones classification.\n","authors":["Francisco Lopez-Tiro","Elias Villalvazo-Avila","Juan Pablo Betancur-Rengifo","Ivan Reyes-Amezcua","Jacques Hubert","Gilberto Ochoa-Ruiz","Christian Daul"],"pdf_url":"https://arxiv.org/pdf/2304.03193v2.pdf","comment":"This paper has been accepted at the LatinX in Computer Vision (LXCV)\n Research workshop at ICCV 2023 (Paris, France)"},{"id":"http://arxiv.org/abs/2308.11490v1","updated":"2023-08-22T15:10:45Z","published":"2023-08-22T15:10:45Z","title":"Can Authorship Representation Learning Capture Stylistic Features?","summary":" Automatically disentangling an author's style from the content of their\nwriting is a longstanding and possibly insurmountable problem in computational\nlinguistics. At the same time, the availability of large text corpora furnished\nwith author labels has recently enabled learning authorship representations in\na purely data-driven manner for authorship attribution, a task that ostensibly\ndepends to a greater extent on encoding writing style than encoding content.\nHowever, success on this surrogate task does not ensure that such\nrepresentations capture writing style since authorship could also be correlated\nwith other latent variables, such as topic. In an effort to better understand\nthe nature of the information these representations convey, and specifically to\nvalidate the hypothesis that they chiefly encode writing style, we\nsystematically probe these representations through a series of targeted\nexperiments. The results of these experiments suggest that representations\nlearned for the surrogate authorship prediction task are indeed sensitive to\nwriting style. As a consequence, authorship representations may be expected to\nbe robust to certain kinds of data shift, such as topic drift over time.\nAdditionally, our findings may open the door to downstream applications that\nrequire stylistic representations, such as style transfer.\n","authors":["Andrew Wang","Cristina Aggazzotti","Rebecca Kotula","Rafael Rivera Soto","Marcus Bishop","Nicholas Andrews"],"pdf_url":"https://arxiv.org/pdf/2308.11490v1.pdf","comment":"appearing at TACL 2023"},{"id":"http://arxiv.org/abs/2305.16376v2","updated":"2023-08-22T14:55:55Z","published":"2023-05-25T14:42:04Z","title":"Constrained Probabilistic Mask Learning for Task-specific Undersampled\n MRI Reconstruction","summary":" Undersampling is a common method in Magnetic Resonance Imaging (MRI) to\nsubsample the number of data points in k-space, reducing acquisition times at\nthe cost of decreased image quality. A popular approach is to employ\nundersampling patterns following various strategies, e.g., variable density\nsampling or radial trajectories. In this work, we propose a method that\ndirectly learns the undersampling masks from data points, thereby also\nproviding task- and domain-specific patterns. To solve the resulting discrete\noptimization problem, we propose a general optimization routine called ProM: A\nfully probabilistic, differentiable, versatile, and model-free framework for\nmask optimization that enforces acceleration factors through a convex\nconstraint. Analyzing knee, brain, and cardiac MRI datasets with our method, we\ndiscover that different anatomic regions reveal distinct optimal undersampling\nmasks, demonstrating the benefits of using custom masks, tailored for a\ndownstream task. 
For example, ProM can create undersampling masks that maximize\nperformance in downstream tasks like segmentation with networks trained on\nfully-sampled MRIs. Even with extreme acceleration factors, ProM yields\nreasonable performance while being more versatile than existing methods, paving\nthe way for data-driven all-purpose mask generation.\n","authors":["Tobias Weber","Michael Ingrisch","Bernd Bischl","David Rügamer"],"pdf_url":"https://arxiv.org/pdf/2305.16376v2.pdf","comment":"accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2308.11483v1","updated":"2023-08-22T14:54:59Z","published":"2023-08-22T14:54:59Z","title":"Large Language Models Sensitivity to The Order of Options in\n Multiple-Choice Questions","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nvarious NLP tasks. However, previous works have shown these models are\nsensitive towards prompt wording, and few-shot demonstrations and their order,\nposing challenges to fair assessment of these models. As these models become\nmore powerful, it becomes imperative to understand and address these\nlimitations. In this paper, we focus on LLMs robustness on the task of\nmultiple-choice questions -- commonly adopted task to study reasoning and\nfact-retrieving capability of LLMs. Investigating the sensitivity of LLMs\ntowards the order of options in multiple-choice questions, we demonstrate a\nconsiderable performance gap of approximately 13% to 75% in LLMs on different\nbenchmarks, when answer options are reordered, even when using demonstrations\nin a few-shot setting. Through a detailed analysis, we conjecture that this\nsensitivity arises when LLMs are uncertain about the prediction between the\ntop-2/3 choices, and specific options placements may favor certain prediction\nbetween those top choices depending on the question caused by positional bias.\nWe also identify patterns in top-2 choices that amplify or mitigate the model's\nbias toward option placement. We found that for amplifying bias, the optimal\nstrategy involves positioning the top two choices as the first and last\noptions. Conversely, to mitigate bias, we recommend placing these choices among\nthe adjacent options. To validate our conjecture, we conduct various\nexperiments and adopt two approaches to calibrate LLMs' predictions, leading to\nup to 8 percentage points improvement across different models and benchmarks.\n","authors":["Pouya Pezeshkpour","Estevam Hruschka"],"pdf_url":"https://arxiv.org/pdf/2308.11483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11480v1","updated":"2023-08-22T14:52:44Z","published":"2023-08-22T14:52:44Z","title":"Expecting The Unexpected: Towards Broad Out-Of-Distribution Detection","summary":" Improving the reliability of deployed machine learning systems often involves\ndeveloping methods to detect out-of-distribution (OOD) inputs. However,\nexisting research often narrowly focuses on samples from classes that are\nabsent from the training set, neglecting other types of plausible distribution\nshifts. This limitation reduces the applicability of these methods in\nreal-world scenarios, where systems encounter a wide variety of anomalous\ninputs. In this study, we categorize five distinct types of distribution shifts\nand critically evaluate the performance of recent OOD detection methods on each\nof them. We publicly release our benchmark under the name BROAD (Benchmarking\nResilience Over Anomaly Diversity). 
Our findings reveal that while these\nmethods excel in detecting unknown classes, their performance is inconsistent\nwhen encountering other types of distribution shifts. In other words, they only\nreliably detect unexpected inputs that they have been specifically designed to\nexpect. As a first step toward broad OOD detection, we learn a generative model\nof existing detection scores with a Gaussian mixture. By doing so, we present\nan ensemble approach that offers a more consistent and comprehensive solution\nfor broad OOD detection, demonstrating superior performance compared to\nexisting methods. Our code to download BROAD and reproduce our experiments is\npublicly available.\n","authors":["Charles Guille-Escuret","Pierre-André Noël","Ioannis Mitliagkas","David Vazquez","Joao Monteiro"],"pdf_url":"https://arxiv.org/pdf/2308.11480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11477v1","updated":"2023-08-22T14:43:36Z","published":"2023-08-22T14:43:36Z","title":"Revisiting column-generation-based matheuristic for learning\n classification trees","summary":" Decision trees are highly interpretable models for solving classification\nproblems in machine learning (ML). The standard ML algorithms for training\ndecision trees are fast but generate suboptimal trees in terms of accuracy.\nOther discrete optimization models in the literature address the optimality\nproblem but only work well on relatively small datasets. \\cite{firat2020column}\nproposed a column-generation-based heuristic approach for learning decision\ntrees. This approach improves scalability and can work with large datasets. In\nthis paper, we describe improvements to this column generation approach. First,\nwe modify the subproblem model to significantly reduce the number of\nsubproblems in multiclass classification instances. Next, we show that the\ndata-dependent constraints in the master problem are implied, and use them as\ncutting planes. Furthermore, we describe a separation model to generate data\npoints for which the linear programming relaxation solution violates their\ncorresponding constraints. We conclude by presenting computational results that\nshow that these modifications result in better scalability.\n","authors":["Krunal Kishor Patel","Guy Desaulniers","Andrea Lodi"],"pdf_url":"https://arxiv.org/pdf/2308.11477v1.pdf","comment":"Submitted to Computers and Operations Research journal"},{"id":"http://arxiv.org/abs/2308.10797v2","updated":"2023-08-22T14:38:43Z","published":"2023-08-21T15:42:56Z","title":"Stabilizing Unsupervised Environment Design with a Learned Adversary","summary":" A key challenge in training generally-capable agents is the design of\ntraining tasks that facilitate broad generalization and robustness to\nenvironment variations. This challenge motivates the problem setting of\nUnsupervised Environment Design (UED), whereby a student agent trains on an\nadaptive distribution of tasks proposed by a teacher agent. A pioneering\napproach for UED is PAIRED, which uses reinforcement learning (RL) to train a\nteacher policy to design tasks from scratch, making it possible to directly\ngenerate tasks that are adapted to the agent's current capabilities. Despite\nits strong theoretical backing, PAIRED suffers from a variety of challenges\nthat hinder its practical performance. Thus, state-of-the-art methods currently\nrely on curation and mutation rather than generation of new tasks. In this\nwork, we investigate several key shortcomings of PAIRED and propose solutions\nfor each shortcoming. 
As a result, we make it possible for PAIRED to match or\nexceed state-of-the-art methods, producing robust agents in several established\nchallenging procedurally-generated environments, including a partially-observed\nmaze navigation task and a continuous-control car racing environment. We\nbelieve this work motivates a renewed emphasis on UED methods based on learned\nmodels that directly generate challenging environments, potentially unlocking\nmore open-ended RL training and, as a result, more general agents.\n","authors":["Ishita Mediratta","Minqi Jiang","Jack Parker-Holder","Michael Dennis","Eugene Vinitsky","Tim Rocktäschel"],"pdf_url":"https://arxiv.org/pdf/2308.10797v2.pdf","comment":"CoLLAs 2023 - Oral; Second and third authors contributed equally"},{"id":"http://arxiv.org/abs/2308.10482v2","updated":"2023-08-22T14:33:43Z","published":"2023-08-21T05:46:40Z","title":"An Effective Method using Phrase Mechanism in Neural Machine Translation","summary":" Machine Translation is one of the essential tasks in Natural Language\nProcessing (NLP), which has massive applications in real life as well as\ncontributing to other tasks in the NLP research community. Recently,\nTransformer -based methods have attracted numerous researchers in this domain\nand achieved state-of-the-art results in most of the pair languages. In this\npaper, we report an effective method using a phrase mechanism,\nPhraseTransformer, to improve the strong baseline model Transformer in\nconstructing a Neural Machine Translation (NMT) system for parallel corpora\nVietnamese-Chinese. Our experiments on the MT dataset of the VLSP 2022\ncompetition achieved the BLEU score of 35.3 on Vietnamese to Chinese and 33.2\nBLEU scores on Chinese to Vietnamese data. Our code is available at\nhttps://github.com/phuongnm94/PhraseTransformer.\n","authors":["Phuong Minh Nguyen","Le Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2308.10482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03807v3","updated":"2023-08-22T14:26:18Z","published":"2023-05-05T19:20:29Z","title":"Evading Watermark based Detection of AI-Generated Content","summary":" A generative AI model can generate extremely realistic-looking content,\nposing growing challenges to the authenticity of information. To address the\nchallenges, watermark has been leveraged to detect AI-generated content.\nSpecifically, a watermark is embedded into an AI-generated content before it is\nreleased. A content is detected as AI-generated if a similar watermark can be\ndecoded from it. In this work, we perform a systematic study on the robustness\nof such watermark-based AI-generated content detection. We focus on\nAI-generated images. Our work shows that an attacker can post-process a\nwatermarked image via adding a small, human-imperceptible perturbation to it,\nsuch that the post-processed image evades detection while maintaining its\nvisual quality. We show the effectiveness of our attack both theoretically and\nempirically. Moreover, to evade detection, our adversarial post-processing\nmethod adds much smaller perturbations to AI-generated images and thus better\nmaintain their visual quality than existing popular post-processing methods\nsuch as JPEG compression, Gaussian blur, and Brightness/Contrast. Our work\nshows the insufficiency of existing watermark-based detection of AI-generated\ncontent, highlighting the urgent needs of new methods. 
Our code is publicly\navailable: https://github.com/zhengyuan-jiang/WEvade.\n","authors":["Zhengyuan Jiang","Jinghuai Zhang","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2305.03807v3.pdf","comment":"To appear in ACM Conference on Computer and Communications Security\n (CCS), 2023"},{"id":"http://arxiv.org/abs/2308.11464v1","updated":"2023-08-22T14:23:21Z","published":"2023-08-22T14:23:21Z","title":"Internal Cross-layer Gradients for Extending Homogeneity to\n Heterogeneity in Federated Learning","summary":" Federated learning (FL) inevitably confronts the challenge of system\nheterogeneity in practical scenarios. To enhance the capabilities of most\nmodel-homogeneous FL methods in handling system heterogeneity, we propose a\ntraining scheme that can extend their capabilities to cope with this challenge.\nIn this paper, we commence our study with a detailed exploration of homogeneous\nand heterogeneous FL settings and discover three key observations: (1) a\npositive correlation between client performance and layer similarities, (2)\nhigher similarities in the shallow layers in contrast to the deep layers, and\n(3) smoother gradient distributions indicate higher layer\nsimilarities. Building upon these observations, we propose InCo Aggregation\nthat leverages internal cross-layer gradients, a mixture of gradients from\nshallow and deep layers within a server model, to augment the similarity in the\ndeep layers without requiring additional communication between clients.\nFurthermore, our methods can be tailored to accommodate model-homogeneous FL\nmethods such as FedAvg, FedProx, FedNova, Scaffold, and MOON, to expand their\ncapabilities to handle system heterogeneity. Copious experimental results\nvalidate the effectiveness of InCo Aggregation, spotlighting internal\ncross-layer gradients as a promising avenue to enhance the performance in\nheterogeneous FL.\n","authors":["Yun-Hin Chan","Rui Zhou","Running Zhao","Zhihan Jiang","Edith C. -H. Ngai"],"pdf_url":"https://arxiv.org/pdf/2308.11464v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2305.09659v2","updated":"2023-08-22T14:23:10Z","published":"2023-05-16T17:58:05Z","title":"Double Pessimism is Provably Efficient for Distributionally Robust\n Offline Reinforcement Learning: Generic Algorithm and Robust Partial Coverage","summary":" In this paper, we study distributionally robust offline reinforcement\nlearning (robust offline RL), which seeks to find an optimal policy purely from\nan offline dataset that can perform well in perturbed environments. Specifically, we\npropose a generic algorithm framework called Doubly Pessimistic\nModel-based Policy Optimization ($P^2MPO$), which features a novel combination\nof a flexible model estimation subroutine and a doubly pessimistic policy\noptimization step. Notably, the double pessimism principle is crucial to\novercome the distributional shifts incurred by (i) the mismatch between the\nbehavior policy and the target policies; and (ii) the perturbation of the\nnominal model. 
Under certain accuracy conditions on the model estimation\nsubroutine, we prove that $P^2MPO$ is sample-efficient with robust partial\ncoverage data, which only requires the offline data to have good coverage of\nthe distributions induced by the optimal robust policy and the perturbed models\naround the nominal model.\n By tailoring specific model estimation subroutines for concrete examples of\nRMDPs, including tabular RMDPs, factored RMDPs, kernel and neural RMDPs, we\nprove that $P^2MPO$ enjoys a $\\tilde{\\mathcal{O}}(n^{-1/2})$ convergence rate,\nwhere $n$ is the dataset size. We highlight that all these examples, except\ntabular RMDPs, are first identified and proven tractable by this work.\nFurthermore, we continue our study of robust offline RL in the robust Markov\ngames (RMGs). By extending the double pessimism principle identified for\nsingle-agent RMDPs, we propose another algorithm framework that can efficiently\nfind the robust Nash equilibria among players using only robust unilateral\n(partial) coverage data. To our best knowledge, this work proposes the first\ngeneral learning principle -- double pessimism -- for robust offline RL and\nshows that it is provably efficient with general function approximation.\n","authors":["Jose Blanchet","Miao Lu","Tong Zhang","Han Zhong"],"pdf_url":"https://arxiv.org/pdf/2305.09659v2.pdf","comment":"V2 adds results on robust offline Markov games"},{"id":"http://arxiv.org/abs/2207.08645v4","updated":"2023-08-22T14:11:07Z","published":"2022-07-18T14:45:55Z","title":"Active Exploration for Inverse Reinforcement Learning","summary":" Inverse Reinforcement Learning (IRL) is a powerful paradigm for inferring a\nreward function from expert demonstrations. Many IRL algorithms require a known\ntransition model and sometimes even a known expert policy, or they at least\nrequire access to a generative model. However, these assumptions are too strong\nfor many real-world applications, where the environment can be accessed only\nthrough sequential interaction. We propose a novel IRL algorithm: Active\nexploration for Inverse Reinforcement Learning (AceIRL), which actively\nexplores an unknown environment and expert policy to quickly learn the expert's\nreward function and identify a good policy. AceIRL uses previous observations\nto construct confidence intervals that capture plausible reward functions and\nfind exploration policies that focus on the most informative regions of the\nenvironment. AceIRL is the first approach to active IRL with sample-complexity\nbounds that does not require a generative model of the environment. AceIRL\nmatches the sample complexity of active IRL with a generative model in the\nworst case. Additionally, we establish a problem-dependent bound that relates\nthe sample complexity of AceIRL to the suboptimality gap of a given IRL\nproblem. 
We empirically evaluate AceIRL in simulations and find that it\nsignificantly outperforms more naive exploration strategies.\n","authors":["David Lindner","Andreas Krause","Giorgia Ramponi"],"pdf_url":"https://arxiv.org/pdf/2207.08645v4.pdf","comment":"Presented at Conference on Neural Information Processing Systems\n (NeurIPS), 2022"},{"id":"http://arxiv.org/abs/2305.04106v2","updated":"2023-08-22T14:10:06Z","published":"2023-05-06T18:00:21Z","title":"On the Usage of Continual Learning for Out-of-Distribution\n Generalization in Pre-trained Language Models of Code","summary":" Pre-trained language models (PLMs) have become a prevalent technique in deep\nlearning for code, utilizing a two-stage pre-training and fine-tuning procedure\nto acquire general knowledge about code and specialize in a variety of\ndownstream tasks. However, the dynamic nature of software codebases poses a\nchallenge to the effectiveness and robustness of PLMs. In particular,\nworld-realistic scenarios potentially lead to significant differences between\nthe distribution of the pre-training and test data, i.e., distribution shift,\nresulting in a degradation of the PLM's performance on downstream tasks. In\nthis paper, we stress the need for adapting PLMs of code to software data whose\ndistribution changes over time, a crucial problem that has been overlooked in\nprevious works. The motivation of this work is to consider the PLM in a\nnon-stationary environment, where fine-tuning data evolves over time according\nto a software evolution scenario. Specifically, we design a scenario where the\nmodel needs to learn from a stream of programs containing new, unseen APIs over\ntime. We study two widely used PLM architectures, i.e., a GPT2 decoder and a\nRoBERTa encoder, on two downstream tasks, API call and API usage prediction. We\ndemonstrate that the most commonly used fine-tuning technique from prior work\nis not robust enough to handle the dynamic nature of APIs, leading to the loss\nof previously acquired knowledge i.e., catastrophic forgetting. To address\nthese issues, we implement five continual learning approaches, including\nreplay-based and regularization-based methods. Our findings demonstrate that\nutilizing these straightforward methods effectively mitigates catastrophic\nforgetting in PLMs across both downstream tasks while achieving comparable or\nsuperior performance.\n","authors":["Martin Weyssow","Xin Zhou","Kisub Kim","David Lo","Houari Sahraoui"],"pdf_url":"https://arxiv.org/pdf/2305.04106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10511v2","updated":"2023-08-22T14:08:20Z","published":"2023-08-21T06:51:58Z","title":"Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout\n Analysis","summary":" Understanding digital documents is like solving a puzzle, especially\nhistorical ones. Document Layout Analysis (DLA) helps with this puzzle by\ndividing documents into sections like paragraphs, images, and tables. This is\ncrucial for machines to read and understand these documents. In the DL Sprint\n2.0 competition, we worked on understanding Bangla documents. We used a dataset\ncalled BaDLAD with lots of examples. We trained a special model called Mask\nR-CNN to help with this understanding. We made this model better by\nstep-by-step hyperparameter tuning, and we achieved a good dice score of 0.889.\nHowever, not everything went perfectly. We tried using a model trained for\nEnglish documents, but it didn't fit well with Bangla. 
This showed us that each\nlanguage has its own challenges. Our solution for the DL Sprint 2.0 is publicly\navailable at https://www.kaggle.com/competitions/dlsprint2/discussion/432201\nalong with notebooks, weights, and inference notebook.\n","authors":["Shrestha Datta","Md Adith Mollah","Raisa Fairooz","Tariful Islam Fahim"],"pdf_url":"https://arxiv.org/pdf/2308.10511v2.pdf","comment":"Contest paper, Conest: DL sprint 2.0 (Link:\n https://www.kaggle.com/competitions/dlsprint2), Solution link:\n https://www.kaggle.com/competitions/dlsprint2/discussion/432201"},{"id":"http://arxiv.org/abs/2308.11455v1","updated":"2023-08-22T14:05:37Z","published":"2023-08-22T14:05:37Z","title":"A Survey on Self-Supervised Representation Learning","summary":" Learning meaningful representations is at the heart of many tasks in the\nfield of modern machine learning. Recently, a lot of methods were introduced\nthat allow learning of image representations without supervision. These\nrepresentations can then be used in downstream tasks like classification or\nobject detection. The quality of these representations is close to supervised\nlearning, while no labeled images are needed. This survey paper provides a\ncomprehensive review of these methods in a unified notation, points out\nsimilarities and differences of these methods, and proposes a taxonomy which\nsets these methods in relation to each other. Furthermore, our survey\nsummarizes the most-recent experimental results reported in the literature in\nform of a meta-study. Our survey is intended as a starting point for\nresearchers and practitioners who want to dive into the field of representation\nlearning.\n","authors":["Tobias Uelwer","Jan Robine","Stefan Sylvius Wagner","Marc Höftmann","Eric Upschulte","Sebastian Konietzny","Maike Behrendt","Stefan Harmeling"],"pdf_url":"https://arxiv.org/pdf/2308.11455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11930v3","updated":"2023-08-22T14:00:25Z","published":"2023-04-24T09:16:31Z","title":"Label-free timing analysis of SiPM-based modularized detectors with\n physics-constrained deep learning","summary":" Pulse timing is an important topic in nuclear instrumentation, with\nfar-reaching applications from high energy physics to radiation imaging. While\nhigh-speed analog-to-digital converters become more and more developed and\naccessible, their potential uses and merits in nuclear detector signal\nprocessing are still uncertain, partially due to associated timing algorithms\nwhich are not fully understood and utilized. In this paper, we propose a novel\nmethod based on deep learning for timing analysis of modularized detectors\nwithout explicit needs of labelling event data. By taking advantage of the\nintrinsic time correlations, a label-free loss function with a specially\ndesigned regularizer is formed to supervise the training of neural networks\ntowards a meaningful and accurate mapping function. We mathematically\ndemonstrate the existence of the optimal function desired by the method, and\ngive a systematic algorithm for training and calibration of the model. The\nproposed method is validated on two experimental datasets based on silicon\nphotomultipliers (SiPM) as main transducers. In the toy experiment, the neural\nnetwork model achieves the single-channel time resolution of 8.8 ps and\nexhibits robustness against concept drift in the dataset. 
In the\nelectromagnetic calorimeter experiment, several neural network models (FC, CNN\nand LSTM) are tested to show their conformance to the underlying physical\nconstraint and to judge their performance against traditional methods. In\ntotal, the proposed method works well in either ideal or noisy experimental\ncondition and recovers the time information from waveform samples successfully\nand precisely.\n","authors":["Pengcheng Ai","Le Xiao","Zhi Deng","Yi Wang","Xiangming Sun","Guangming Huang","Dong Wang","Yulei Li","Xinchi Ran"],"pdf_url":"https://arxiv.org/pdf/2304.11930v3.pdf","comment":"26 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.11448v1","updated":"2023-08-22T13:55:57Z","published":"2023-08-22T13:55:57Z","title":"Masked Momentum Contrastive Learning for Zero-shot Semantic\n Understanding","summary":" Self-supervised pretraining (SSP) has emerged as a popular technique in\nmachine learning, enabling the extraction of meaningful feature representations\nwithout labelled data. In the realm of computer vision, pretrained vision\ntransformers (ViTs) have played a pivotal role in advancing transfer learning.\nNonetheless, the escalating cost of finetuning these large models has posed a\nchallenge due to the explosion of model size. This study endeavours to evaluate\nthe effectiveness of pure self-supervised learning (SSL) techniques in computer\nvision tasks, obviating the need for finetuning, with the intention of\nemulating human-like capabilities in generalisation and recognition of unseen\nobjects. To this end, we propose an evaluation protocol for zero-shot\nsegmentation based on a prompting patch. Given a point on the target object as\na prompt, the algorithm calculates the similarity map between the selected\npatch and other patches, upon that, a simple thresholding is applied to segment\nthe target. Another evaluation is intra-object and inter-object similarity to\ngauge discriminatory ability of SSP ViTs. Insights from zero-shot segmentation\nfrom prompting and discriminatory abilities of SSP led to the design of a\nsimple SSP approach, termed MMC. This approaches combines Masked image\nmodelling for encouraging similarity of local features, Momentum based\nself-distillation for transferring semantics from global to local features, and\nglobal Contrast for promoting semantics of global features, to enhance\ndiscriminative representations of SSP ViTs. Consequently, our proposed method\nsignificantly reduces the overlap of intra-object and inter-object\nsimilarities, thereby facilitating effective object segmentation within an\nimage. Our experiments reveal that MMC delivers top-tier results in zero-shot\nsemantic segmentation across various datasets.\n","authors":["Jiantao Wu","Shentong Mo","Muhammad Awais","Sara Atito","Zhenhua Feng","Josef Kittler"],"pdf_url":"https://arxiv.org/pdf/2308.11448v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11446v1","updated":"2023-08-22T13:53:43Z","published":"2023-08-22T13:53:43Z","title":"Exploration of Rashomon Set Assists Explanations for Medical Data","summary":" The machine learning modeling process conventionally culminates in selecting\na single model that maximizes a selected performance metric. However, this\napproach leads to abandoning a more profound analysis of slightly inferior\nmodels. Particularly in medical and healthcare studies, where the objective\nextends beyond predictions to valuable insight generation, relying solely on\nperformance metrics can result in misleading or incomplete conclusions. 
This\nproblem is particularly pertinent when dealing with a set of models with\nperformance close to maximum one, known as $\\textit{Rashomon set}$. Such a set\ncan be numerous and may contain models describing the data in a different way,\nwhich calls for comprehensive analysis. This paper introduces a novel process\nto explore Rashomon set models, extending the conventional modeling approach.\nThe cornerstone is the identification of the most different models within the\nRashomon set, facilitated by the introduced $\\texttt{Rashomon_DETECT}$\nalgorithm. This algorithm compares profiles illustrating prediction\ndependencies on variable values generated by eXplainable Artificial\nIntelligence (XAI) techniques. To quantify differences in variable effects\namong models, we introduce the Profile Disparity Index (PDI) based on measures\nfrom functional data analysis. To illustrate the effectiveness of our approach,\nwe showcase its application in predicting survival among hemophagocytic\nlymphohistiocytosis (HLH) patients - a foundational case study. Additionally,\nwe benchmark our approach on other medical data sets, demonstrating its\nversatility and utility in various contexts.\n","authors":["Katarzyna Kobylińska","Mateusz Krzyziński","Rafał Machowicz","Mariusz Adamek","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.11446v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2305.13998v2","updated":"2023-08-22T13:34:27Z","published":"2023-05-23T12:27:56Z","title":"SMT 2.0: A Surrogate Modeling Toolbox with a focus on Hierarchical and\n Mixed Variables Gaussian Processes","summary":" The Surrogate Modeling Toolbox (SMT) is an open-source Python package that\noffers a collection of surrogate modeling methods, sampling techniques, and a\nset of sample problems. This paper presents SMT 2.0, a major new release of SMT\nthat introduces significant upgrades and new features to the toolbox. This\nrelease adds the capability to handle mixed-variable surrogate models and\nhierarchical variables. These types of variables are becoming increasingly\nimportant in several surrogate modeling applications. SMT 2.0 also improves SMT\nby extending sampling methods, adding new surrogate models, and computing\nvariance and kernel derivatives for Kriging. This release also includes new\nfunctions to handle noisy and use multifidelity data. To the best of our\nknowledge, SMT 2.0 is the first open-source surrogate library to propose\nsurrogate models for hierarchical and mixed inputs. This open-source software\nis distributed under the New BSD license.\n","authors":["Paul Saves","Remi Lafage","Nathalie Bartoli","Youssef Diouane","Jasper Bussemaker","Thierry Lefebvre","John T. Hwang","Joseph Morlier","Joaquim R. R. A. Martins"],"pdf_url":"https://arxiv.org/pdf/2305.13998v2.pdf","comment":"version 2"},{"id":"http://arxiv.org/abs/2308.11421v1","updated":"2023-08-22T13:08:29Z","published":"2023-08-22T13:08:29Z","title":"TurboViT: Generating Fast Vision Transformers via Generative\n Architecture Search","summary":" Vision transformers have shown unprecedented levels of performance in\ntackling various visual perception tasks in recent years. However, the\narchitectural and computational complexity of such network architectures have\nmade them challenging to deploy in real-world applications with\nhigh-throughput, low-memory requirements. 
As such, there has been significant\nresearch recently on the design of efficient vision transformer architectures.\nIn this study, we explore the generation of fast vision transformer\narchitecture designs via generative architecture search (GAS) to achieve a\nstrong balance between accuracy and architectural and computational efficiency.\nThrough this generative architecture search process, we create TurboViT, a\nhighly efficient hierarchical vision transformer architecture design that is\ngenerated around mask unit attention and Q-pooling design patterns. The\nresulting TurboViT architecture design achieves significantly lower\narchitectural computational complexity (>2.47$\\times$ smaller than FasterViT-0\nwhile achieving the same accuracy) and computational complexity (>3.4$\\times$ fewer\nFLOPs and 0.9% higher accuracy than MobileViT2-2.0) when compared to 10 other\nstate-of-the-art efficient vision transformer network architecture designs\nwithin a similar range of accuracy on the ImageNet-1K dataset. Furthermore,\nTurboViT demonstrated strong inference latency and throughput in both\nlow-latency and batch processing scenarios (>3.21$\\times$ lower latency and\n>3.18$\\times$ higher throughput compared to FasterViT-0 for the low-latency\nscenario). These promising results demonstrate the efficacy of leveraging\ngenerative architecture search for generating efficient transformer\narchitecture designs for high-throughput scenarios.\n","authors":["Alexander Wong","Saad Abbasi","Saeejith Nair"],"pdf_url":"https://arxiv.org/pdf/2308.11421v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.11406v1","updated":"2023-08-22T12:53:09Z","published":"2023-08-22T12:53:09Z","title":"Designing an attack-defense game: how to increase robustness of\n financial transaction models via a competition","summary":" Given the escalating risks of malicious attacks in the finance sector and the\nconsequential severe damage, a thorough understanding of adversarial strategies\nand robust defense mechanisms for machine learning models is critical. The\nthreat becomes even more severe with the increased adoption in banks of more\naccurate, but potentially fragile, neural networks. We aim to investigate the\ncurrent state and dynamics of adversarial attacks and defenses for neural\nnetwork models that use sequential financial data as the input.\n To achieve this goal, we have designed a competition that allows realistic\nand detailed investigation of problems in modern financial transaction data.\nThe participants compete directly against each other, so possible attacks and\ndefenses are examined in close-to-real-life conditions. Our main contributions\nare the analysis of the competition dynamics that answers the questions of how\nimportant it is to conceal a model from malicious users, how long it takes\nto break it, and what techniques one should use to make it more robust, as well\nas the introduction of additional ways to attack models or increase their robustness.\n Our analysis continues with a meta-study of the approaches used, covering their\npower, numerical experiments, and accompanying ablation studies. 
We show that\nthe developed attacks and defenses outperform existing alternatives from the\nliterature while being practical in terms of execution, proving the validity of\nthe competition as a tool for uncovering vulnerabilities of machine learning\nmodels and mitigating them in various domains.\n","authors":["Alexey Zaytsev","Alex Natekin","Evgeni Vorsin","Valerii Smirnov","Oleg Sidorshin","Alexander Senin","Alexander Dudin","Dmitry Berestnev"],"pdf_url":"https://arxiv.org/pdf/2308.11406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01628v2","updated":"2023-08-22T12:42:47Z","published":"2023-04-04T08:33:13Z","title":"Equivariant Networks for Porous Crystalline Materials","summary":" Porous crystalline materials have the potential to play a key role in\ndeveloping solutions for molecular storage, gas separation and carbon\nadsorption. For these solutions, we need to develop new materials with specific\nproperties. Estimating the properties of such porous materials involves first\nprinciple simulation using classical molecular simulations. The computational\ncomplexity of these methods can be a barrier to high throughput screening of\nthe potential materials as the space of possible materials is vast. Data-driven\nmethods, specifically machine learning methods based on deep neural networks\noffer a significant opportunity to significantly scale the simulation of the\nbehavior of these materials. However, to effectively achieve this the Deep\nLearning models need to utilize the symmetries present in the crystals.\nCrystals pose specific symmetries that are present in their space group.\nExisting methods for crystal property prediction either have symmetry\nconstraints that are too restrictive or only incorporate symmetries between\nunit cells. In addition, these models do not explicitly model the porous\nstructure of the crystal. In this paper, we develop a model which incorporates\nthe symmetries of the unit cell of a crystal in its architecture and explicitly\nmodels the porous structure. We evaluate our model by predicting the heat of\nadsorption of CO$_2$ for different configurations of the Mordenite and ZSM-5\nzeolites. Our results confirm that our method performs better than existing\nmethods for crystal property prediction and that the inclusion of pores results\nin a more efficient model.\n","authors":["Marko Petković","Pablo Romero-Marimon","Vlado Menkovski","Sofia Calero"],"pdf_url":"https://arxiv.org/pdf/2304.01628v2.pdf","comment":"Added additional figures as well as additional experiments for MFI"},{"id":"http://arxiv.org/abs/2308.11389v1","updated":"2023-08-22T12:28:09Z","published":"2023-08-22T12:28:09Z","title":"Non-Redundant Combination of Hand-Crafted and Deep Learning Radiomics:\n Application to the Early Detection of Pancreatic Cancer","summary":" We address the problem of learning Deep Learning Radiomics (DLR) that are not\nredundant with Hand-Crafted Radiomics (HCR). To do so, we extract DLR features\nusing a VAE while enforcing their independence with HCR features by minimizing\ntheir mutual information. The resulting DLR features can be combined with\nhand-crafted ones and leveraged by a classifier to predict early markers of\ncancer. We illustrate our method on four early markers of pancreatic cancer and\nvalidate it on a large independent test set. 
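The pancreatic-cancer abstract above extracts VAE-based deep features while minimizing their mutual information with hand-crafted radiomics. A minimal sketch, assuming a simple cross-correlation penalty as a crude stand-in for the mutual-information term (the paper's actual estimator and architecture are not reproduced here):

```python
import torch

def decorrelation_penalty(z_dlr, x_hcr):
    """Crude stand-in for the mutual-information term described above:
    penalise the cross-correlation between learned (DLR) and hand-crafted (HCR) features."""
    z = (z_dlr - z_dlr.mean(0)) / (z_dlr.std(0) + 1e-8)   # standardise each DLR dimension
    h = (x_hcr - x_hcr.mean(0)) / (x_hcr.std(0) + 1e-8)   # standardise each HCR dimension
    cross_corr = (z.T @ h) / z.shape[0]                   # (d_dlr, d_hcr) correlation matrix
    return (cross_corr ** 2).mean()

# Hypothetical total loss (sketch only):
# loss = recon_loss + kl_loss + lam * decorrelation_penalty(z, hcr_features)
```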
Our results highlight the value of\ncombining non-redundant DLR and HCR features, as evidenced by an improvement in\nthe Area Under the Curve compared to baseline methods that do not address\nredundancy or solely rely on HCR features.\n","authors":["Rebeca Vétil","Clément Abi-Nader","Alexandre Bône","Marie-Pierre Vullierme","Marc-Michel Rohé","Pietro Gori","Isabelle Bloch"],"pdf_url":"https://arxiv.org/pdf/2308.11389v1.pdf","comment":"CaPTion workshop MICCAI 2023"},{"id":"http://arxiv.org/abs/2308.11386v1","updated":"2023-08-22T12:25:49Z","published":"2023-08-22T12:25:49Z","title":"Targeted Data Augmentation for bias mitigation","summary":" The development of fair and ethical AI systems requires careful consideration\nof bias mitigation, an area often overlooked or ignored. In this study, we\nintroduce a novel and efficient approach for addressing biases called Targeted\nData Augmentation (TDA), which leverages classical data augmentation techniques\nto tackle the pressing issue of bias in data and models. Unlike the laborious\ntask of removing biases, our method proposes to insert biases instead,\nresulting in improved performance. To identify biases, we annotated two diverse\ndatasets: a dataset of clinical skin lesions and a dataset of male and female\nfaces. These bias annotations are published for the first time in this study,\nproviding a valuable resource for future research. Through Counterfactual Bias\nInsertion, we discovered that biases associated with the frame, ruler, and\nglasses had a significant impact on models. By randomly introducing biases\nduring training, we mitigated these biases and achieved a substantial decrease\nin bias measures, ranging from two-fold to more than 50-fold, while maintaining\na negligible increase in the error rate.\n","authors":["Agnieszka Mikołajczyk-Bareła","Maria Ferlin","Michał Grochowski"],"pdf_url":"https://arxiv.org/pdf/2308.11386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11375v1","updated":"2023-08-22T12:01:49Z","published":"2023-08-22T12:01:49Z","title":"Interpretable Distribution-Invariant Fairness Measures for Continuous\n Scores","summary":" Measures of algorithmic fairness are usually discussed in the context of\nbinary decisions. We extend the approach to continuous scores. So far,\nROC-based measures have mainly been suggested for this purpose. Other existing\nmethods depend heavily on the distribution of scores, are unsuitable for\nranking tasks, or their effect sizes are not interpretable. Here, we propose a\ndistributionally invariant version of fairness measures for continuous scores\nwith a reasonable interpretation based on the Wasserstein distance. Our\nmeasures are easily computable and well suited for quantifying and interpreting\nthe strength of group disparities as well as for comparing biases across\ndifferent models, datasets, or time points. We derive a link between the\ndifferent families of existing fairness measures for scores and show that the\nproposed distributionally invariant fairness measures outperform ROC-based\nfairness measures because they are more explicit and can quantify significant\nbiases that ROC-based fairness measures miss. 
Finally, we demonstrate their\neffectiveness through experiments on the most commonly used fairness benchmark\ndatasets.\n","authors":["Ann-Kristin Becker","Oana Dumitrasc","Klaus Broelemann"],"pdf_url":"https://arxiv.org/pdf/2308.11375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13233v2","updated":"2023-08-22T11:31:11Z","published":"2023-05-22T17:05:34Z","title":"Estimating Gibbs free energies via isobaric-isothermal flows","summary":" We present a machine-learning model based on normalizing flows that is\ntrained to sample from the isobaric-isothermal ensemble. In our approach, we\napproximate the joint distribution of a fully-flexible triclinic simulation box\nand particle coordinates to achieve a desired internal pressure. This novel\nextension of flow-based sampling to the isobaric-isothermal ensemble yields\ndirect estimates of Gibbs free energies. We test our NPT-flow on monatomic\nwater in the cubic and hexagonal ice phases and find excellent agreement of\nGibbs free energies and other observables compared with established baselines.\n","authors":["Peter Wirnsberger","Borja Ibarz","George Papamakarios"],"pdf_url":"https://arxiv.org/pdf/2305.13233v2.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.11358v1","updated":"2023-08-22T11:20:40Z","published":"2023-08-22T11:20:40Z","title":"How Much Temporal Long-Term Context is Needed for Action Segmentation?","summary":" Modeling long-term context in videos is crucial for many fine-grained tasks\nincluding temporal action segmentation. An interesting question that is still\nopen is how much long-term temporal context is needed for optimal performance.\nWhile transformers can model the long-term context of a video, this becomes\ncomputationally prohibitive for long videos. Recent works on temporal action\nsegmentation thus combine temporal convolutional networks with self-attentions\nthat are computed only for a local temporal window. While these approaches show\ngood results, their performance is limited by their inability to capture the\nfull context of a video. In this work, we try to answer how much long-term\ntemporal context is required for temporal action segmentation by introducing a\ntransformer-based model that leverages sparse attention to capture the full\ncontext of a video. We compare our model with the current state of the art on\nthree datasets for temporal action segmentation, namely 50Salads, Breakfast,\nand Assembly101. Our experiments show that modeling the full context of a video\nis necessary to obtain the best performance for temporal action segmentation.\n","authors":["Emad Bahrami","Gianpiero Francesca","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2308.11358v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.03945v2","updated":"2023-08-22T11:17:37Z","published":"2023-08-07T23:27:20Z","title":"The Prospect of Enhancing Large-Scale Heterogeneous Federated Learning\n with Transformers","summary":" Federated learning (FL) addresses data privacy concerns by enabling\ncollaborative training of AI models across distributed data owners. Wide\nadoption of FL faces the fundamental challenges of data heterogeneity and the\nlarge scale of data owners involved. In this paper, we investigate the prospect\nof Transformer-based FL models for achieving generalization and personalization\nin this setting. We conduct extensive comparative experiments involving FL with\nTransformers, ResNet, and personalized ResNet-based FL approaches under various\nscenarios. 
These experiments consider varying numbers of data owners to\ndemonstrate Transformers' advantages over deep neural networks in large-scale\nheterogeneous FL tasks. In addition, we analyze the superior performance of\nTransformers by comparing the Centered Kernel Alignment (CKA) representation\nsimilarity across different layers and FL models to gain insight into the\nreasons behind their promising capabilities.\n","authors":["Yulan Gao","Zhaoxiang Hou","Chengyi Yang","Zengxiang Li","Han Yu"],"pdf_url":"https://arxiv.org/pdf/2308.03945v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11355v1","updated":"2023-08-22T11:12:53Z","published":"2023-08-22T11:12:53Z","title":"Machine learning assisted exploration for affine Deligne-Lusztig\n varieties","summary":" This paper presents a novel, interdisciplinary study that leverages a Machine\nLearning (ML) assisted framework to explore the geometry of affine\nDeligne-Lusztig varieties (ADLV). The primary objective is to investigate the\nnonemptiness pattern, dimension and enumeration of irreducible components of\nADLV. Our proposed framework demonstrates a recursive pipeline of data\ngeneration, model training, pattern analysis, and human examination, presenting\nan intricate interplay between ML and pure mathematical research. Notably, our\ndata-generation process is nuanced, emphasizing the selection of meaningful\nsubsets and appropriate feature sets. We demonstrate that this framework has a\npotential to accelerate pure mathematical research, leading to the discovery of\nnew conjectures and promising research directions that could otherwise take\nsignificant time to uncover. We rediscover the virtual dimension formula and\nprovide a full mathematical proof of a newly identified problem concerning a\ncertain lower bound of dimension. Furthermore, we extend an open invitation to\nthe readers by providing the source code for computing ADLV and the ML models,\npromoting further explorations. This paper concludes by sharing valuable\nexperiences and highlighting lessons learned from this collaboration.\n","authors":["Bin Dong","Xuhua He","Pengfei Jin","Felix Schremmer","Qingchao Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11355v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2308.11348v1","updated":"2023-08-22T10:52:46Z","published":"2023-08-22T10:52:46Z","title":"Careful at Estimation and Bold at Exploration","summary":" Exploration strategies in continuous action space are often heuristic due to\nthe infinite actions, and these kinds of methods cannot derive a general\nconclusion. In prior work, it has been shown that policy-based exploration is\nbeneficial for continuous action space in deterministic policy reinforcement\nlearning(DPRL). However, policy-based exploration in DPRL has two prominent\nissues: aimless exploration and policy divergence, and the policy gradient for\nexploration is only sometimes helpful due to inaccurate estimation. Based on\nthe double-Q function framework, we introduce a novel exploration strategy to\nmitigate these issues, separate from the policy gradient. We first propose the\ngreedy Q softmax update schema for Q value update. The expected Q value is\nderived by weighted summing the conservative Q value over actions, and the\nweight is the corresponding greedy Q value. Greedy Q takes the maximum value of\nthe two Q functions, and conservative Q takes the minimum value of the two\ndifferent Q functions. 
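The exploration abstract above describes an expected Q-value built by weighting conservative Q-values with the corresponding greedy Q-values. A small sketch of one plausible reading of that update (the softmax weighting and temperature are assumptions, not taken from the paper):

```python
import torch

def greedy_q_weighted_target(q1, q2, temperature=1.0):
    """Sketch of the greedy-Q-weighted expectation described above.
    q1, q2: Q-values of the two critics for a batch of sampled actions, shape (batch, n_actions)."""
    greedy_q = torch.maximum(q1, q2)        # greedy Q: element-wise max of the two critics
    conservative_q = torch.minimum(q1, q2)  # conservative Q: element-wise min of the two critics
    weights = torch.softmax(greedy_q / temperature, dim=-1)   # assumed softmax weighting
    return (weights * conservative_q).sum(dim=-1)             # expected Q per state
```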
For practicality, this theoretical basis is then\nextended to allow us to combine action exploration with the Q value update,\nexcept for the premise that we have a surrogate policy that behaves like this\nexploration policy. In practice, we construct such an exploration policy with a\nfew sampled actions, and to meet the premise, we learn such a surrogate policy\nby minimizing the KL divergence between the target policy and the exploration\npolicy constructed by the conservative Q. We evaluate our method on the Mujoco\nbenchmark and demonstrate superior performance compared to previous\nstate-of-the-art methods across various environments, particularly in the most\ncomplex Humanoid environment.\n","authors":["Xing Chen","Yijun Liu","Zhaogeng Liu","Hechang Chen","Hengshuai Yao","Yi Chang"],"pdf_url":"https://arxiv.org/pdf/2308.11348v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2303.16464v3","updated":"2023-08-22T10:50:51Z","published":"2023-03-29T05:33:53Z","title":"Lipschitzness Effect of a Loss Function on Generalization Performance of\n Deep Neural Networks Trained by Adam and AdamW Optimizers","summary":" The generalization performance of deep neural networks with regard to the\noptimization algorithm is one of the major concerns in machine learning. This\nperformance can be affected by various factors. In this paper, we theoretically\nprove that the Lipschitz constant of a loss function is an important factor to\ndiminish the generalization error of the output model obtained by Adam or\nAdamW. The results can be used as a guideline for choosing the loss function\nwhen the optimization algorithm is Adam or AdamW. In addition, to evaluate the\ntheoretical bound in a practical setting, we choose the human age estimation\nproblem in computer vision. For assessing the generalization better, the\ntraining and test datasets are drawn from different distributions. Our\nexperimental evaluation shows that the loss function with a lower Lipschitz\nconstant and maximum value improves the generalization of the model trained by\nAdam or AdamW.\n","authors":["Mohammad Lashkari","Amin Gheibi"],"pdf_url":"https://arxiv.org/pdf/2303.16464v3.pdf","comment":"Accepted to be published in AUT Journal of Mathematics and Computing\n (AJMC, 2023)"},{"id":"http://arxiv.org/abs/2308.11339v1","updated":"2023-08-22T10:36:56Z","published":"2023-08-22T10:36:56Z","title":"ProAgent: Building Proactive Cooperative AI with Large Language Models","summary":" Building AIs with adaptive behaviors in human-AI cooperation stands as a\npivotal focus in AGI research. Current methods for developing cooperative\nagents predominantly rely on learning-based methods, where policy\ngeneralization heavily hinges on past interactions with specific teammates.\nThese approaches constrain the agent's capacity to recalibrate its strategy\nwhen confronted with novel teammates. We propose \\textbf{ProAgent}, a novel\nframework that harnesses large language models (LLMs) to fashion a\n\\textit{pro}active \\textit{agent} empowered with the ability to anticipate\nteammates' forthcoming decisions and formulate enhanced plans for itself.\nProAgent excels at cooperative reasoning with the capacity to dynamically adapt\nits behavior to enhance collaborative efforts with teammates. Moreover, the\nProAgent framework exhibits a high degree of modularity and interpretability,\nfacilitating seamless integration to address a wide array of coordination\nscenarios. 
Experimental evaluations conducted within the framework of\n\\textit{Overcook-AI} unveil the remarkable performance superiority of ProAgent,\noutperforming five methods based on self-play and population-based training in\ncooperation with AI agents. Further, when cooperating with human proxy models,\nits performance exhibits an average improvement exceeding 10\\% compared to the\ncurrent state-of-the-art, COLE. The advancement was consistently observed\nacross diverse scenarios involving interactions with both AI agents of varying\ncharacteristics and human counterparts. These findings inspire future research\nfor human-robot collaborations. For a hands-on demonstration, please visit\n\\url{https://pku-proagent.github.io}.\n","authors":["Ceyao Zhang","Kaijie Yang","Siyi Hu","Zihao Wang","Guanghe Li","Yihang Sun","Cheng Zhang","Zhaowei Zhang","Anji Liu","Song-Chun Zhu","Xiaojun Chang","Junge Zhang","Feng Yin","Yitao Liang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07714v3","updated":"2023-08-22T10:26:00Z","published":"2022-10-14T11:27:49Z","title":"CrowdGuard: Federated Backdoor Detection in Federated Learning","summary":" Federated Learning (FL) is a promising approach enabling multiple clients to\ntrain Deep Neural Networks (DNNs) collaboratively without sharing their local\ntraining data. However, FL is susceptible to backdoor (or targeted poisoning)\nattacks. These attacks are initiated by malicious clients who seek to\ncompromise the learning process by introducing specific behaviors into the\nlearned model that can be triggered by carefully crafted inputs. Existing FL\nsafeguards have various limitations: They are restricted to specific data\ndistributions or reduce the global model accuracy due to excluding benign\nmodels or adding noise, are vulnerable to adaptive defense-aware adversaries,\nor require the server to access local models, allowing data inference attacks.\n This paper presents a novel defense mechanism, CrowdGuard, that effectively\nmitigates backdoor attacks in FL and overcomes the deficiencies of existing\ntechniques. It leverages clients' feedback on individual models, analyzes the\nbehavior of neurons in hidden layers, and eliminates poisoned models through an\niterative pruning scheme. CrowdGuard employs a server-located stacked\nclustering scheme to enhance its resilience to rogue client feedback. The\nevaluation results demonstrate that CrowdGuard achieves a 100%\nTrue-Positive-Rate and True-Negative-Rate across various scenarios, including\nIID and non-IID data distributions. Additionally, CrowdGuard withstands\nadaptive adversaries while preserving the original performance of protected\nmodels. To ensure confidentiality, CrowdGuard uses a secure and\nprivacy-preserving architecture leveraging Trusted Execution Environments\n(TEEs) on both client and server sides.\n","authors":["Phillip Rieger","Torsten Krauß","Markus Miettinen","Alexandra Dmitrienko","Ahmad-Reza Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2210.07714v3.pdf","comment":"To appear in the Network and Distributed System Security (NDSS)\n Symposium 2024. Phillip Rieger and Torsten Krau{\\ss} contributed equally to\n this contribution. 
19 pages, 8 figures, 5 tables, 4 algorithms, 5 equations"},{"id":"http://arxiv.org/abs/2308.11333v1","updated":"2023-08-22T10:16:12Z","published":"2023-08-22T10:16:12Z","title":"Protect Federated Learning Against Backdoor Attacks via Data-Free\n Trigger Generation","summary":" As a distributed machine learning paradigm, Federated Learning (FL) enables\nlarge-scale clients to collaboratively train a model without sharing their raw\ndata. However, due to the lack of data auditing for untrusted clients, FL is\nvulnerable to poisoning attacks, especially backdoor attacks. By using poisoned\ndata for local training or directly changing the model parameters, attackers\ncan easily inject backdoors into the model, which can trigger the model to make\nmisclassification of targeted patterns in images. To address these issues, we\npropose a novel data-free trigger-generation-based defense approach based on\nthe two characteristics of backdoor attacks: i) triggers are learned faster\nthan normal knowledge, and ii) trigger patterns have a greater effect on image\nclassification than normal class patterns. Our approach generates the images\nwith newly learned knowledge by identifying the differences between the old and\nnew global models, and filters trigger images by evaluating the effect of these\ngenerated images. By using these trigger images, our approach eliminates\npoisoned models to ensure the updated global model is benign. Comprehensive\nexperiments demonstrate that our approach can defend against almost all the\nexisting types of backdoor attacks and outperform all the seven\nstate-of-the-art defense methods with both IID and non-IID scenarios.\nEspecially, our approach can successfully defend against the backdoor attack\neven when 80\\% of the clients are malicious.\n","authors":["Yanxin Yang","Ming Hu","Yue Cao","Jun Xia","Yihao Huang","Yang Liu","Mingsong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2101.11525v2","updated":"2023-08-22T09:55:34Z","published":"2021-01-27T16:31:33Z","title":"Calibrating and Improving Graph Contrastive Learning","summary":" Graph contrastive learning algorithms have demonstrated remarkable success in\nvarious applications such as node classification, link prediction, and graph\nclustering. However, in unsupervised graph contrastive learning, some\ncontrastive pairs may contradict the truths in downstream tasks and thus the\ndecrease of losses on these pairs undesirably harms the performance in the\ndownstream tasks. To assess the discrepancy between the prediction and the\nground-truth in the downstream tasks for these contrastive pairs, we adapt the\nexpected calibration error (ECE) to graph contrastive learning. The analysis of\nECE motivates us to propose a novel regularization method, Contrast-Reg, to\nensure that decreasing the contrastive loss leads to better performance in the\ndownstream tasks. As a plug-in regularizer, Contrast-Reg effectively improves\nthe performance of existing graph contrastive learning algorithms. 
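The graph-contrastive abstract above adapts the expected calibration error (ECE) to contrastive pairs. For reference, a minimal sketch of the standard ECE computation it builds on (the graph-specific adaptation itself is not shown):

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    """Standard ECE: size-weighted gap between accuracy and mean confidence per bin."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    n, ece = len(confidences), 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            acc = correct[mask].mean()          # empirical accuracy in the bin
            conf = confidences[mask].mean()     # mean predicted confidence in the bin
            ece += mask.sum() / n * abs(acc - conf)
    return ece
```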
We provide\nboth theoretical and empirical results to demonstrate the effectiveness of\nContrast-Reg in enhancing the generalizability of the Graph Neural Network(GNN)\nmodel and improving the performance of graph contrastive algorithms with\ndifferent similarity definitions and encoder backbones across various\ndownstream tasks.\n","authors":["Kaili Ma","Haochen Yang","Han Yang","Yongqiang Chen","James Cheng"],"pdf_url":"https://arxiv.org/pdf/2101.11525v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11966v2","updated":"2023-08-22T09:45:11Z","published":"2023-02-23T12:29:20Z","title":"Machine Learning for QoS Prediction in Vehicular Communication:\n Challenges and Solution Approaches","summary":" As cellular networks evolve towards the 6th generation, machine learning is\nseen as a key enabling technology to improve the capabilities of the network.\nMachine learning provides a methodology for predictive systems, which can make\nnetworks become proactive. This proactive behavior of the network can be\nleveraged to sustain, for example, a specific quality of service requirement.\nWith predictive quality of service, a wide variety of new use cases, both\nsafety- and entertainment-related, are emerging, especially in the automotive\nsector. Therefore, in this work, we consider maximum throughput prediction\nenhancing, for example, streaming or high-definition mapping applications. We\ndiscuss the entire machine learning workflow highlighting less regarded aspects\nsuch as the detailed sampling procedures, the in-depth analysis of the dataset\ncharacteristics, the effects of splits in the provided results, and the data\navailability. Reliable machine learning models need to face a lot of challenges\nduring their lifecycle. We highlight how confidence can be built on machine\nlearning technologies by better understanding the underlying characteristics of\nthe collected data. We discuss feature engineering and the effects of different\nsplits for the training processes, showcasing that random splits might\noverestimate performance by more than twofold. Moreover, we investigate diverse\nsets of input features, where network information proved to be most effective,\ncutting the error by half. Part of our contribution is the validation of\nmultiple machine learning models within diverse scenarios. We also use\nexplainable AI to show that machine learning can learn underlying principles of\nwireless networks without being explicitly programmed. Our data is collected\nfrom a deployed network that was under full control of the measurement team and\ncovered different vehicular scenarios and radio environments.\n","authors":["Alexandros Palaios","Christian L. Vielhaus","Daniel F. Külzer","Cara Watermann","Rodrigo Hernangomez","Sanket Partani","Philipp Geuer","Anton Krause","Raja Sattiraju","Martin Kasparick","Gerhard Fettweis","Frank H. P. Fitzek","Hans D. Schotten","Slawomir Stanczak"],"pdf_url":"https://arxiv.org/pdf/2302.11966v2.pdf","comment":"18 pages, 12 Figures. Accepted on IEEE Access"},{"id":"http://arxiv.org/abs/2207.09944v4","updated":"2023-08-22T09:31:35Z","published":"2022-07-20T14:41:09Z","title":"Probable Domain Generalization via Quantile Risk Minimization","summary":" Domain generalization (DG) seeks predictors which perform well on unseen test\ndistributions by leveraging data drawn from multiple related training\ndistributions or domains. To achieve this, DG is commonly formulated as an\naverage- or worst-case problem over the set of possible domains. 
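The domain-generalization abstract above contrasts average-case and worst-case formulations over training domains before introducing its probabilistic alternative. A tiny sketch of those two baseline objectives (illustrative only; model, loss, and per-domain batches are placeholders):

```python
import torch

def domain_risks(model, domain_batches, loss_fn):
    """Per-domain empirical risks for a list of (x, y) batches, one batch per training domain."""
    return torch.stack([loss_fn(model(x), y) for x, y in domain_batches])

def dg_objective(risks, mode="average"):
    # average-case DG: minimise the mean risk across domains
    # worst-case DG:   minimise the maximum (worst) domain risk
    return risks.mean() if mode == "average" else risks.max()
```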
However,\npredictors that perform well on average lack robustness while predictors that\nperform well in the worst case tend to be overly-conservative. To address this,\nwe propose a new probabilistic framework for DG where the goal is to learn\npredictors that perform well with high probability. Our key idea is that\ndistribution shifts seen during training should inform us of probable shifts at\ntest time, which we realize by explicitly relating training and test domains as\ndraws from the same underlying meta-distribution. To achieve probable DG, we\npropose a new optimization problem called Quantile Risk Minimization (QRM). By\nminimizing the $\\alpha$-quantile of predictor's risk distribution over domains,\nQRM seeks predictors that perform well with probability $\\alpha$. To solve QRM\nin practice, we propose the Empirical QRM (EQRM) algorithm and provide: (i) a\ngeneralization bound for EQRM; and (ii) the conditions under which EQRM\nrecovers the causal predictor as $\\alpha \\to 1$. In our experiments, we\nintroduce a more holistic quantile-focused evaluation protocol for DG and\ndemonstrate that EQRM outperforms state-of-the-art baselines on datasets from\nWILDS and DomainBed.\n","authors":["Cian Eastwood","Alexander Robey","Shashank Singh","Julius von Kügelgen","Hamed Hassani","George J. Pappas","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2207.09944v4.pdf","comment":"NeurIPS 2022 camera-ready (+ minor corrections)"},{"id":"http://arxiv.org/abs/2308.11295v1","updated":"2023-08-22T09:17:45Z","published":"2023-08-22T09:17:45Z","title":"Uncertainty Estimation of Transformers' Predictions via Topological\n Analysis of the Attention Matrices","summary":" Determining the degree of confidence of deep learning model in its prediction\nis an open problem in the field of natural language processing. Most of the\nclassical methods for uncertainty estimation are quite weak for text\nclassification models. We set the task of obtaining an uncertainty estimate for\nneural networks based on the Transformer architecture. A key feature of such\nmo-dels is the attention mechanism, which supports the information flow between\nthe hidden representations of tokens in the neural network. We explore the\nformed relationships between internal representations using Topological Data\nAnalysis methods and utilize them to predict model's confidence. In this paper,\nwe propose a method for uncertainty estimation based on the topological\nproperties of the attention mechanism and compare it with classical methods. As\na result, the proposed algorithm surpasses the existing methods in quality and\nopens up a new area of application of the attention mechanism, but requires the\nselection of topological features.\n","authors":["Elizaveta Kostenok","Daniil Cherniavskii","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2308.11295v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11294v1","updated":"2023-08-22T09:15:43Z","published":"2023-08-22T09:15:43Z","title":"Network Momentum across Asset Classes","summary":" We investigate the concept of network momentum, a novel trading signal\nderived from momentum spillover across assets. Initially observed within the\nconfines of pairwise economic and fundamental ties, such as the stock-bond\nconnection of the same company and stocks linked through supply-demand chains,\nmomentum spillover implies a propagation of momentum risk premium from one\nasset to another. 
The similarity of momentum risk premium, exemplified by\nco-movement patterns, has been spotted across multiple asset classes including\ncommodities, equities, bonds and currencies. However, studying the network\neffect of momentum spillover across these classes has been challenging due to a\nlack of readily available common characteristics or economic ties beyond the\ncompany level. In this paper, we explore the interconnections of momentum\nfeatures across a diverse range of 64 continuous future contracts spanning\nthese four classes. We utilise a linear and interpretable graph learning model\nwith minimal assumptions to reveal the intricacies of the momentum spillover\nnetwork. By leveraging the learned networks, we construct a network momentum\nstrategy that exhibits a Sharpe ratio of 1.5 and an annual return of 22%, after\nvolatility scaling, from 2000 to 2022. This paper pioneers the examination of\nmomentum spillover across multiple asset classes using only pricing data,\npresents a multi-asset investment strategy based on network momentum, and\nunderscores the effectiveness of this strategy through robust empirical\nanalysis.\n","authors":[" Xingyue"," Pu","Stephen Roberts","Xiaowen Dong","Stefan Zohren"],"pdf_url":"https://arxiv.org/pdf/2308.11294v1.pdf","comment":"27 pages"},{"id":"http://arxiv.org/abs/2205.14900v3","updated":"2023-08-22T09:15:15Z","published":"2022-05-30T07:43:42Z","title":"FRAug: Tackling Federated Learning with Non-IID Features via\n Representation Augmentation","summary":" Federated Learning (FL) is a decentralized learning paradigm, in which\nmultiple clients collaboratively train deep learning models without\ncentralizing their local data, and hence preserve data privacy. Real-world\napplications usually involve a distribution shift across the datasets of the\ndifferent clients, which hurts the generalization ability of the clients to\nunseen samples from their respective data distributions. In this work, we\naddress the recently proposed feature shift problem where the clients have\ndifferent feature distributions, while the label distribution is the same. We\npropose Federated Representation Augmentation (FRAug) to tackle this practical\nand challenging problem. Our approach generates synthetic client-specific\nsamples in the embedding space to augment the usually small client datasets.\nFor that, we train a shared generative model to fuse the clients knowledge\nlearned from their different feature distributions. This generator synthesizes\nclient-agnostic embeddings, which are then locally transformed into\nclient-specific embeddings by Representation Transformation Networks (RTNets).\nBy transferring knowledge across the clients, the generated embeddings act as a\nregularizer for the client models and reduce overfitting to the local original\ndatasets, hence improving generalization. 
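The FRAug abstract above pairs a shared generator of client-agnostic embeddings with per-client Representation Transformation Networks (RTNets). A heavily simplified sketch under assumed dimensions and MLP architectures (the paper's actual networks and training losses may differ):

```python
import torch
import torch.nn as nn

EMB_DIM, NOISE_DIM = 128, 32   # hypothetical dimensions, not from the paper

# shared generator, trained collaboratively to produce client-agnostic embeddings
shared_generator = nn.Sequential(nn.Linear(NOISE_DIM, 64), nn.ReLU(), nn.Linear(64, EMB_DIM))

class RTNet(nn.Module):
    """Client-local transformation of client-agnostic embeddings into client-specific ones."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(EMB_DIM, EMB_DIM), nn.ReLU(),
                                 nn.Linear(EMB_DIM, EMB_DIM))
    def forward(self, z):
        return self.net(z)

rtnet = RTNet()                                        # one RTNet per client
z_agnostic = shared_generator(torch.randn(16, NOISE_DIM))
z_client = rtnet(z_agnostic)                           # synthetic client-specific embeddings
# z_client would then augment the client's real embeddings when training its local head
```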
Our empirical evaluation on public\nbenchmarks and a real-world medical dataset demonstrates the effectiveness of\nthe proposed method, which substantially outperforms the current\nstate-of-the-art FL methods for non-IID features, including PartialFed and\nFedBN.\n","authors":["Haokun Chen","Ahmed Frikha","Denis Krompass","Jindong Gu","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2205.14900v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11291v1","updated":"2023-08-22T09:12:11Z","published":"2023-08-22T09:12:11Z","title":"Improving Knot Prediction in Wood Logs with Longitudinal Feature\n Propagation","summary":" The quality of a wood log in the wood industry depends heavily on the\npresence of both outer and inner defects, including inner knots that are a\nresult of the growth of tree branches. Today, locating the inner knots require\nthe use of expensive equipment such as X-ray scanners. In this paper, we\naddress the task of predicting the location of inner defects from the outer\nshape of the logs. The dataset is built by extracting both the contours and the\nknots with X-ray measurements. We propose to solve this binary segmentation\ntask by leveraging convolutional recurrent neural networks. Once the neural\nnetwork is trained, inference can be performed from the outer shape measured\nwith cheap devices such as laser profilers. We demonstrate the effectiveness of\nour approach on fir and spruce tree species and perform ablation on the\nrecurrence to demonstrate its importance.\n","authors":["Salim Khazem","Jeremy Fix","Cédric Pradalier"],"pdf_url":"https://arxiv.org/pdf/2308.11291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11290v1","updated":"2023-08-22T09:11:53Z","published":"2023-08-22T09:11:53Z","title":"ShadowNet for Data-Centric Quantum System Learning","summary":" Understanding the dynamics of large quantum systems is hindered by the curse\nof dimensionality. Statistical learning offers new possibilities in this regime\nby neural-network protocols and classical shadows, while both methods have\nlimitations: the former is plagued by the predictive uncertainty and the latter\nlacks the generalization ability. Here we propose a data-centric learning\nparadigm combining the strength of these two approaches to facilitate diverse\nquantum system learning (QSL) tasks. Particularly, our paradigm utilizes\nclassical shadows along with other easily obtainable information of quantum\nsystems to create the training dataset, which is then learnt by neural networks\nto unveil the underlying mapping rule of the explored QSL problem. Capitalizing\non the generalization power of neural networks, this paradigm can be trained\noffline and excel at predicting previously unseen systems at the inference\nstage, even with few state copies. Besides, it inherits the characteristic of\nclassical shadows, enabling memory-efficient storage and faithful prediction.\nThese features underscore the immense potential of the proposed data-centric\napproach in discovering novel and large-scale quantum systems. For\nconcreteness, we present the instantiation of our paradigm in quantum state\ntomography and direct fidelity estimation tasks and conduct numerical analysis\nup to 60 qubits. 
Our work showcases the profound prospects of data-centric\nartificial intelligence to advance QSL in a faithful and generalizable manner.\n","authors":["Yuxuan Du","Yibo Yang","Tongliang Liu","Zhouchen Lin","Bernard Ghanem","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2308.11290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14729v2","updated":"2023-08-22T09:06:01Z","published":"2023-07-27T09:35:56Z","title":"Understanding Silent Failures in Medical Image Classification","summary":" To ensure the reliable use of classification systems in medical applications,\nit is crucial to prevent silent failures. This can be achieved by either\ndesigning classifiers that are robust enough to avoid failures in the first\nplace, or by detecting remaining failures using confidence scoring functions\n(CSFs). A predominant source of failures in image classification is\ndistribution shifts between training data and deployment data. To understand\nthe current state of silent failure prevention in medical imaging, we conduct\nthe first comprehensive analysis comparing various CSFs in four biomedical\ntasks and a diverse range of distribution shifts. Based on the result that none\nof the benchmarked CSFs can reliably prevent silent failures, we conclude that\na deeper understanding of the root causes of failures in the data is required.\nTo facilitate this, we introduce SF-Visuals, an interactive analysis tool that\nuses latent space clustering to visualize shifts and failures. On the basis of\nvarious examples, we demonstrate how this tool can help researchers gain\ninsight into the requirements for safe application of classification systems in\nthe medical domain. The open-source benchmark and tool are at:\nhttps://github.com/IML-DKFZ/sf-visuals.\n","authors":["Till J. Bungert","Levin Kobelke","Paul F. Jaeger"],"pdf_url":"https://arxiv.org/pdf/2307.14729v2.pdf","comment":"Accepted at MICCAI 23"},{"id":"http://arxiv.org/abs/2307.15438v2","updated":"2023-08-22T09:05:08Z","published":"2023-07-28T09:40:19Z","title":"Autonomous Payload Thermal Control","summary":" In small satellites there is less room for heat control equipment, scientific\ninstruments, and electronic components. Furthermore, the near proximity of the\nelectronics makes power dissipation difficult, with the risk of not being able\nto control the temperature appropriately, reducing component lifetime and\nmission performance. To address this challenge, taking advantage of the advent\nof increasing intelligence on board satellites, a deep reinforcement learning\nbased framework that uses Soft Actor-Critic algorithm is proposed for learning\nthe thermal control policy onboard. The framework is evaluated both in a naive\nsimulated environment and in a real space edge processing computer that will be\nshipped in the future IMAGIN-e mission and hosted in the ISS. The experiment\nresults show that the proposed framework is able to learn to control the\npayload processing power to maintain the temperature under operational ranges,\ncomplementing traditional thermal control systems.\n","authors":["Alejandro D. Mousist"],"pdf_url":"https://arxiv.org/pdf/2307.15438v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11288v1","updated":"2023-08-22T08:57:44Z","published":"2023-08-22T08:57:44Z","title":"Test Time Embedding Normalization for Popularity Bias Mitigation","summary":" Popularity bias is a widespread problem in the field of recommender systems,\nwhere popular items tend to dominate recommendation results. 
In this work, we\npropose 'Test Time Embedding Normalization' as a simple yet effective strategy\nfor mitigating popularity bias, which surpasses the performance of the previous\nmitigation approaches by a significant margin. Our approach utilizes the\nnormalized item embedding during the inference stage to control the influence\nof embedding magnitude, which is highly correlated with item popularity.\nThrough extensive experiments, we show that our method combined with the\nsampled softmax loss effectively reduces popularity bias compare to previous\napproaches for bias mitigation. We further investigate the relationship between\nuser and item embeddings and find that the angular similarity between\nembeddings distinguishes preferable and non-preferable items regardless of\ntheir popularity. The analysis explains the mechanism behind the success of our\napproach in eliminating the impact of popularity bias. Our code is available at\nhttps://github.com/ml-postech/TTEN.\n","authors":["Dain Kim","Jinhyeok Park","Dongwoo Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11288v1.pdf","comment":"5 pages, CIKM 2023"},{"id":"http://arxiv.org/abs/2308.11277v1","updated":"2023-08-22T08:46:30Z","published":"2023-08-22T08:46:30Z","title":"CNN based Cuneiform Sign Detection Learned from Annotated 3D Renderings\n and Mapped Photographs with Illumination Augmentation","summary":" Motivated by the challenges of the Digital Ancient Near Eastern Studies\n(DANES) community, we develop digital tools for processing cuneiform script\nbeing a 3D script imprinted into clay tablets used for more than three\nmillennia and at least eight major languages. It consists of thousands of\ncharacters that have changed over time and space. Photographs are the most\ncommon representations usable for machine learning, while ink drawings are\nprone to interpretation. Best suited 3D datasets that are becoming available.\nWe created and used the HeiCuBeDa and MaiCuBeDa datasets, which consist of\naround 500 annotated tablets. For our novel OCR-like approach to mixed image\ndata, we provide an additional mapping tool for transferring annotations\nbetween 3D renderings and photographs. Our sign localization uses a RepPoints\ndetector to predict the locations of characters as bounding boxes. We use image\ndata from GigaMesh's MSII (curvature, see https://gigamesh.eu) based rendering,\nPhong-shaded 3D models, and photographs as well as illumination augmentation.\nThe results show that using rendered 3D images for sign detection performs\nbetter than other work on photographs. In addition, our approach gives\nreasonably good results for photographs only, while it is best used for mixed\ndatasets. More importantly, the Phong renderings, and especially the MSII\nrenderings, improve the results on photographs, which is the largest dataset on\na global scale.\n","authors":["Ernst Stötzner","Timo Homburg","Hubert Mara"],"pdf_url":"https://arxiv.org/pdf/2308.11277v1.pdf","comment":"This paper was accepted to ICCV23 and includes the DOI for an Open\n Access Dataset with annotated cuneiform script"},{"id":"http://arxiv.org/abs/2308.11272v1","updated":"2023-08-22T08:39:44Z","published":"2023-08-22T08:39:44Z","title":"FoX: Formation-aware exploration in multi-agent reinforcement learning","summary":" Recently, deep multi-agent reinforcement learning (MARL) has gained\nsignificant popularity due to its success in various cooperative multi-agent\ntasks. 
However, exploration still remains a challenging problem in MARL due to\nthe partial observability of the agents and the exploration space that can grow\nexponentially as the number of agents increases. Firstly, in order to address\nthe scalability issue of the exploration space, we define a formation-based\nequivalence relation on the exploration space and aim to reduce the search\nspace by exploring only meaningful states in different formations. Then, we\npropose a novel formation-aware exploration (FoX) framework that encourages\npartially observable agents to visit the states in diverse formations by\nguiding them to be well aware of their current formation solely based on their\nown observations. Numerical results show that the proposed FoX framework\nsignificantly outperforms the state-of-the-art MARL algorithms on Google\nResearch Football (GRF) and sparse Starcraft II multi-agent challenge (SMAC)\ntasks.\n","authors":["Yonghyeon Jo","Sunwoo Lee","Junghyuk Yum","Seungyul Han"],"pdf_url":"https://arxiv.org/pdf/2308.11272v1.pdf","comment":"7 pages main, 5 pages appendix with reference. 10 figures, submitted\n for AAAI"},{"id":"http://arxiv.org/abs/2308.11269v1","updated":"2023-08-22T08:29:09Z","published":"2023-08-22T08:29:09Z","title":"Quantum-Inspired Machine Learning: a Survey","summary":" Quantum-inspired Machine Learning (QiML) is a burgeoning field, receiving\nglobal attention from researchers for its potential to leverage principles of\nquantum mechanics within classical computational frameworks. However, current\nreview literature often presents a superficial exploration of QiML, focusing\ninstead on the broader Quantum Machine Learning (QML) field. In response to\nthis gap, this survey provides an integrated and comprehensive examination of\nQiML, exploring QiML's diverse research domains including tensor network\nsimulations, dequantized algorithms, and others, showcasing recent\nadvancements, practical applications, and illuminating potential future\nresearch avenues. Further, a concrete definition of QiML is established by\nanalyzing various prior interpretations of the term and their inherent\nambiguities. As QiML continues to evolve, we anticipate a wealth of future\ndevelopments drawing from quantum mechanics, quantum computing, and classical\nmachine learning, enriching the field further. This survey serves as a guide\nfor researchers and practitioners alike, providing a holistic understanding of\nQiML's current landscape and future directions.\n","authors":["Larry Huynh","Jin Hong","Ajmal Mian","Hajime Suzuki","Yanqiu Wu","Seyit Camtepe"],"pdf_url":"https://arxiv.org/pdf/2308.11269v1.pdf","comment":"56 pages, 13 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.11267v1","updated":"2023-08-22T08:24:45Z","published":"2023-08-22T08:24:45Z","title":"Robust Lagrangian and Adversarial Policy Gradient for Robust Constrained\n Markov Decision Processes","summary":" The robust constrained Markov decision process (RCMDP) is a recent\ntask-modelling framework for reinforcement learning that incorporates\nbehavioural constraints and that provides robustness to errors in the\ntransition dynamics model through the use of an uncertainty set. Simulating\nRCMDPs requires computing the worst-case dynamics based on value estimates for\neach state, an approach which has previously been used in the Robust\nConstrained Policy Gradient (RCPG). 
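The RCMDP abstract above mentions computing worst-case dynamics from value estimates, as used in RCPG. A small sketch assuming a finite, scenario-based uncertainty set per state-action pair (the actual set structure and selection rule in the paper may differ):

```python
import numpy as np

def worst_case_dynamics(nominal_P, value, uncertainty_sets):
    """For each (state, action), pick the candidate transition vector in its uncertainty
    set that minimises the expected value of the next state (pessimistic dynamics)."""
    n_states, n_actions, _ = nominal_P.shape
    worst_P = nominal_P.copy()
    for s in range(n_states):
        for a in range(n_actions):
            candidates = uncertainty_sets[(s, a)]   # assumed: list of candidate transition vectors
            worst_P[s, a] = min(candidates, key=lambda p: float(p @ value))
    return worst_P
```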
Highlighting potential downsides of RCPG\nsuch as not robustifying the full constrained objective and the lack of\nincremental learning, this paper introduces two algorithms, called RCPG with\nRobust Lagrangian and Adversarial RCPG. RCPG with Robust Lagrangian modifies\nRCPG by taking the worst-case dynamics based on the Lagrangian rather than\neither the value or the constraint. Adversarial RCPG also formulates the\nworst-case dynamics based on the Lagrangian but learns this directly and\nincrementally as an adversarial policy through gradient descent rather than\nindirectly and abruptly through constrained optimisation on a sorted value\nlist. A theoretical analysis first derives the Lagrangian policy gradient for\nthe policy optimisation of both proposed algorithms and then the adversarial\npolicy gradient to learn the adversary for Adversarial RCPG. Empirical\nexperiments injecting perturbations in inventory management and safe navigation\ntasks demonstrate the competitive performance of both algorithms compared to\ntraditional RCPG variants as well as non-robust and non-constrained ablations.\nIn particular, Adversarial RCPG ranks among the top two performing algorithms\non all tests.\n","authors":["David M. Bossens"],"pdf_url":"https://arxiv.org/pdf/2308.11267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13057v2","updated":"2023-08-22T08:18:04Z","published":"2023-05-22T14:14:43Z","title":"Causality-Aided Trade-off Analysis for Machine Learning Fairness","summary":" There has been an increasing interest in enhancing the fairness of machine\nlearning (ML). Despite the growing number of fairness-improving methods, we\nlack a systematic understanding of the trade-offs among factors considered in\nthe ML pipeline when fairness-improving methods are applied. This understanding\nis essential for developers to make informed decisions regarding the provision\nof fair ML services. Nonetheless, it is extremely difficult to analyze the\ntrade-offs when there are multiple fairness parameters and other crucial\nmetrics involved, coupled, and even in conflict with one another.\n This paper uses causality analysis as a principled method for analyzing\ntrade-offs between fairness parameters and other crucial metrics in ML\npipelines. To practically and effectively conduct causality analysis, we propose\na set of domain-specific optimizations to facilitate accurate causal discovery\nand a unified, novel interface for trade-off analysis based on well-established\ncausal inference methods. We conduct a comprehensive empirical study using\nthree real-world datasets on a collection of widely used fairness-improving\ntechniques. Our study obtains actionable suggestions for users and developers\nof fair ML. 
We further demonstrate the versatile usage of our approach in\nselecting the optimal fairness-improving method, paving the way for more\nethical and socially responsible AI technologies.\n","authors":["Zhenlan Ji","Pingchuan Ma","Shuai Wang","Yanhui Li"],"pdf_url":"https://arxiv.org/pdf/2305.13057v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11256v1","updated":"2023-08-22T07:59:49Z","published":"2023-08-22T07:59:49Z","title":"Efficient Last-iterate Convergence Algorithms in Solving Games","summary":" No-regret algorithms are popular for learning Nash equilibrium (NE) in\ntwo-player zero-sum normal-form games (NFGs) and extensive-form games (EFGs).\nMany recent works consider last-iterate convergence no-regret algorithms.\nAmong them, the two most famous algorithms are Optimistic Gradient Descent\nAscent (OGDA) and Optimistic Multiplicative Weight Update (OMWU). However, OGDA\nhas high per-iteration complexity. OMWU exhibits a lower per-iteration\ncomplexity but poorer empirical performance, and its convergence holds only\nwhen NE is unique. Recent works propose a Reward Transformation (RT) framework\nfor MWU, which removes the uniqueness condition and achieves competitive\nperformance with OMWU. Unfortunately, RT-based algorithms perform worse than\nOGDA under the same number of iterations, and their convergence guarantee is\nbased on the continuous-time feedback assumption, which does not hold in most\nscenarios. To address these issues, we provide a closer analysis of the RT\nframework, which holds for both continuous and discrete-time feedback. We\ndemonstrate that the essence of the RT framework is to transform the problem of\nlearning NE in the original game into a series of strongly convex-concave\noptimization problems (SCCPs). We show that the bottleneck of RT-based\nalgorithms is the speed of solving SCCPs. To improve their empirical\nperformance, we design a novel transformation method to enable the SCCPs to be\nsolved by Regret Matching+ (RM+), a no-regret algorithm with better empirical\nperformance, resulting in Reward Transformation RM+ (RTRM+). RTRM+ enjoys\nlast-iterate convergence under the discrete-time feedback setting. Using the\ncounterfactual regret decomposition framework, we propose Reward Transformation\nCFR+ (RTCFR+) to extend RTRM+ to EFGs. Experimental results show that our\nalgorithms significantly outperform existing last-iterate convergence\nalgorithms and RM+ (CFR+).\n","authors":["Linjian Meng","Zhenxing Ge","Wenbin Li","Bo An","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2308.11256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11254v1","updated":"2023-08-22T07:56:57Z","published":"2023-08-22T07:56:57Z","title":"A survey on bias in machine learning research","summary":" Current research on bias in machine learning often focuses on fairness, while\noverlooking the roots or causes of bias. However, bias was originally defined\nas a \"systematic error,\" often caused by humans at different stages of the\nresearch process. This article aims to bridge the gap between past literature\non bias in research by providing a taxonomy of potential sources of bias and\nerrors in data and models. The paper focuses on bias in machine learning\npipelines. The survey analyses over forty potential sources of bias in the machine\nlearning (ML) pipeline, providing clear examples for each. 
By understanding the\nsources and consequences of bias in machine learning, better methods can be\ndeveloped for detecting and mitigating it, leading to fairer, more\ntransparent, and more accurate ML models.\n","authors":["Agnieszka Mikołajczyk-Bareła","Michał Grochowski"],"pdf_url":"https://arxiv.org/pdf/2308.11254v1.pdf","comment":"Submitted to journal. arXiv admin note: substantial text overlap with\n arXiv:2308.09464"},{"id":"http://arxiv.org/abs/2308.11247v1","updated":"2023-08-22T07:43:59Z","published":"2023-08-22T07:43:59Z","title":"Multi-Source Domain Adaptation for Cross-Domain Fault Diagnosis of\n Chemical Processes","summary":" Fault diagnosis is an essential component in process supervision. Indeed, it\ndetermines which kind of fault has occurred, given that it has been previously\ndetected, allowing for appropriate intervention. Automatic fault diagnosis\nsystems use machine learning for predicting the fault type from sensor\nreadings. Nonetheless, these models are sensitive to changes in the data\ndistributions, which may be caused by changes in the monitored process, such as\nchanges in the mode of operation. This scenario is known as Cross-Domain Fault\nDiagnosis (CDFD). We provide an extensive comparison of single and multi-source\nunsupervised domain adaptation (SSDA and MSDA respectively) algorithms for\nCDFD. We study these methods in the context of the Tennessee-Eastman Process,\na widely used benchmark in the chemical industry. We show that using multiple\ndomains during training has a positive effect, even when no adaptation is\nemployed. As such, the MSDA baseline improves over the SSDA baseline\nclassification accuracy by 23% on average. In addition, under the\nmultiple-sources scenario, we improve classification accuracy of the no\nadaptation setting by 8.4% on average.\n","authors":["Eduardo Fernandes Montesuma","Michela Mulas","Fred Ngolè Mboula","Francesco Corona","Antoine Souloumiac"],"pdf_url":"https://arxiv.org/pdf/2308.11247v1.pdf","comment":"18 pages,15 figures"},{"id":"http://arxiv.org/abs/2308.11241v1","updated":"2023-08-22T07:34:07Z","published":"2023-08-22T07:34:07Z","title":"An Effective Transformer-based Contextual Model and Temporal Gate\n Pooling for Speaker Identification","summary":" Wav2vec2 has achieved success in applying Transformer architecture and\nself-supervised learning to speech recognition. Recently, these have come to be\nused not only for speech recognition but also for the entire speech processing.\nThis paper introduces an effective end-to-end speaker identification model\napplying a Transformer-based contextual model. We explored the relationship\nbetween the parameters and the performance in order to discern the structure of\nan effective model. Furthermore, we propose a pooling method, Temporal Gate\nPooling, with powerful learning ability for speaker identification. We applied\nConformer as the encoder and BEST-RQ for pre-training and conducted an evaluation\nutilizing the speaker identification of VoxCeleb1. The proposed method has\nachieved an accuracy of 85.9% with 28.5M parameters, demonstrating comparable\nprecision to wav2vec2 with 317.7M parameters. 
Code is available at\nhttps://github.com/HarunoriKawano/speaker-identification-with-tgp.\n","authors":["Harunori Kawano","Sota Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.11241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14953v2","updated":"2023-08-22T07:33:53Z","published":"2023-07-27T15:46:59Z","title":"Multi-Source Domain Adaptation through Dataset Dictionary Learning in\n Wasserstein Space","summary":" This paper seeks to solve Multi-Source Domain Adaptation (MSDA), which aims\nto mitigate data distribution shifts when transferring knowledge from multiple\nlabeled source domains to an unlabeled target domain. We propose a novel MSDA\nframework based on dictionary learning and optimal transport. We interpret each\ndomain in MSDA as an empirical distribution. As such, we express each domain as\na Wasserstein barycenter of dictionary atoms, which are empirical\ndistributions. We propose a novel algorithm, DaDiL, for learning via\nmini-batches: (i) atom distributions; (ii) a matrix of barycentric coordinates.\nBased on our dictionary, we propose two novel methods for MSDA: DaDil-R, based\non the reconstruction of labeled samples in the target domain, and DaDiL-E,\nbased on the ensembling of classifiers learned on atom distributions. We\nevaluate our methods in 3 benchmarks: Caltech-Office, Office 31, and CRWU,\nwhere we improved previous state-of-the-art by 3.15%, 2.29%, and 7.71% in\nclassification performance. Finally, we show that interpolations in the\nWasserstein hull of learned atoms provide data that can generalize to the\ntarget domain.\n","authors":["Eduardo Fernandes Montesuma","Fred Ngolè Mboula","Antoine Souloumiac"],"pdf_url":"https://arxiv.org/pdf/2307.14953v2.pdf","comment":"13 pages,8 figures,Accepted as a conference paper at the 26th\n European Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2308.11240v1","updated":"2023-08-22T07:27:45Z","published":"2023-08-22T07:27:45Z","title":"Minwise-Independent Permutations with Insertion and Deletion of Features","summary":" In their seminal work, Broder \\textit{et. al.}~\\citep{BroderCFM98} introduces\nthe $\\mathrm{minHash}$ algorithm that computes a low-dimensional sketch of\nhigh-dimensional binary data that closely approximates pairwise Jaccard\nsimilarity. Since its invention, $\\mathrm{minHash}$ has been commonly used by\npractitioners in various big data applications. Further, the data is dynamic in\nmany real-life scenarios, and their feature sets evolve over time. We consider\nthe case when features are dynamically inserted and deleted in the dataset. We\nnote that a naive solution to this problem is to repeatedly recompute\n$\\mathrm{minHash}$ with respect to the updated dimension. However, this is an\nexpensive task as it requires generating fresh random permutations. To the best\nof our knowledge, no systematic study of $\\mathrm{minHash}$ is recorded in the\ncontext of dynamic insertion and deletion of features. In this work, we\ninitiate this study and suggest algorithms that make the $\\mathrm{minHash}$\nsketches adaptable to the dynamic insertion and deletion of features. We show a\nrigorous theoretical analysis of our algorithms and complement it with\nextensive experiments on several real-world datasets. 
Empirically we observe a\nsignificant speed-up in the running time while simultaneously offering\ncomparable performance with respect to running $\\mathrm{minHash}$ from scratch.\nOur proposal is efficient, accurate, and easy to implement in practice.\n","authors":["Rameshwar Pratap","Raghav Kulkarni"],"pdf_url":"https://arxiv.org/pdf/2308.11240v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00623v2","updated":"2023-08-22T07:10:19Z","published":"2023-07-02T17:29:41Z","title":"Variational Autoencoding Molecular Graphs with Denoising Diffusion\n Probabilistic Model","summary":" In data-driven drug discovery, designing molecular descriptors is a very\nimportant task. Deep generative models such as variational autoencoders (VAEs)\noffer a potential solution by designing descriptors as probabilistic latent\nvectors derived from molecular structures. These models can be trained on large\ndatasets, which have only molecular structures, and applied to transfer\nlearning. Nevertheless, the approximate posterior distribution of the latent\nvectors of the usual VAE assumes a simple multivariate Gaussian distribution\nwith zero covariance, which may limit the performance of representing the\nlatent features. To overcome this limitation, we propose a novel molecular deep\ngenerative model that incorporates a hierarchical structure into the\nprobabilistic latent vectors. We achieve this by a denoising diffusion\nprobabilistic model (DDPM). We demonstrate that our model can design effective\nmolecular latent vectors for molecular property prediction from some\nexperiments by small datasets on physical properties and activity. The results\nhighlight the superior prediction performance and robustness of our model\ncompared to existing approaches.\n","authors":["Daiki Koge","Naoaki Ono","Shigehiko Kanaya"],"pdf_url":"https://arxiv.org/pdf/2307.00623v2.pdf","comment":"2 pages. Short paper submitted to IEEE CIBCB 2023"},{"id":"http://arxiv.org/abs/2308.11220v1","updated":"2023-08-22T06:21:39Z","published":"2023-08-22T06:21:39Z","title":"Federated Learning on Patient Data for Privacy-Protecting Polycystic\n Ovary Syndrome Treatment","summary":" The field of women's endocrinology has trailed behind data-driven medical\nsolutions, largely due to concerns over the privacy of patient data. Valuable\ndatapoints about hormone levels or menstrual cycling could expose patients who\nsuffer from comorbidities or terminate a pregnancy, violating their privacy. We\nexplore the application of Federated Learning (FL) to predict the optimal drug\nfor patients with polycystic ovary syndrome (PCOS). PCOS is a serious hormonal\ndisorder impacting millions of women worldwide, yet it's poorly understood and\nits research is stunted by a lack of patient data. We demonstrate that a\nvariety of FL approaches succeed on a synthetic PCOS patient dataset. 
Our\nproposed FL models are a tool to access massive quantities of diverse data and\nidentify the most effective treatment option while providing PCOS patients with\nprivacy guarantees.\n","authors":["Lucia Morris","Tori Qiu","Nikhil Raghuraman"],"pdf_url":"https://arxiv.org/pdf/2308.11220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16889v2","updated":"2023-08-22T06:15:15Z","published":"2023-07-28T10:57:38Z","title":"Rethinking Noisy Label Learning in Real-world Annotation Scenarios from\n the Noise-type Perspective","summary":" In this paper, we investigate the problem of learning with noisy labels in\nreal-world annotation scenarios, where noise can be categorized into two types:\nfactual noise and ambiguity noise. To better distinguish these noise types and\nutilize their semantics, we propose a novel sample selection-based approach for\nnoisy label learning, called Proto-semi. Proto-semi initially divides all\nsamples into the confident and unconfident datasets via warm-up. By leveraging\nthe confident dataset, prototype vectors are constructed to capture class\ncharacteristics. Subsequently, the distances between the unconfident samples\nand the prototype vectors are calculated to facilitate noise classification.\nBased on these distances, the labels are either corrected or retained,\nresulting in the refinement of the confident and unconfident datasets. Finally,\nwe introduce a semi-supervised learning method to enhance training. Empirical\nevaluations on a real-world annotated dataset substantiate the robustness of\nProto-semi in handling the problem of learning from noisy labels. Meanwhile,\nthe prototype-based repartitioning strategy is shown to be effective in\nmitigating the adverse impact of label noise. Our code and data are available\nat https://github.com/fuxiAIlab/ProtoSemi.\n","authors":["Renyu Zhu","Haoyu Liu","Runze Wu","Minmin Lin","Tangjie Lv","Changjie Fan","Haobo Wang"],"pdf_url":"https://arxiv.org/pdf/2307.16889v2.pdf","comment":"Submitted to AAAI 2024"},{"id":"http://arxiv.org/abs/2308.11217v1","updated":"2023-08-22T06:05:11Z","published":"2023-08-22T06:05:11Z","title":"Federated Learning in Big Model Era: Domain-Specific Multimodal Large\n Models","summary":" Multimodal data, which can comprehensively perceive and recognize the\nphysical world, has become an essential path towards general artificial\nintelligence. However, multimodal large models trained on public datasets often\nunderperform in specific industrial domains. This paper proposes a multimodal\nfederated learning framework that enables multiple enterprises to utilize\nprivate domain data to collaboratively train large models for vertical domains,\nachieving intelligent services across scenarios. The authors discuss in-depth\nthe strategic transformation of federated learning in terms of intelligence\nfoundation and objectives in the era of big model, as well as the new\nchallenges faced in heterogeneous data, model aggregation, performance and cost\ntrade-off, data privacy, and incentive mechanism. The paper elaborates a case\nstudy of leading enterprises contributing multimodal data and expert knowledge\nto city safety operation management , including distributed deployment and\nefficient coordination of the federated learning platform, technical\ninnovations on data quality improvement based on large model capabilities and\nefficient joint fine-tuning approaches. 
Preliminary experiments show that\nenterprises can enhance and accumulate intelligent capabilities through\nmultimodal model federated learning, thereby jointly creating a smart city\nmodel that provides high-quality intelligent services covering energy\ninfrastructure safety, residential community security, and urban operation\nmanagement. The established federated learning cooperation ecosystem is\nexpected to further aggregate industry, academia, and research resources,\nrealize large models in multiple vertical domains, and promote the large-scale\nindustrial application of artificial intelligence and cutting-edge research on\nmultimodal federated learning.\n","authors":["Zengxiang Li","Zhaoxiang Hou","Hui Liu","Ying Wang","Tongzhi Li","Longfei Xie","Chao Shi","Chengyi Yang","Weishan Zhang","Zelei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11216v1","updated":"2023-08-22T06:03:00Z","published":"2023-08-22T06:03:00Z","title":"Hamiltonian GAN","summary":" A growing body of work leverages the Hamiltonian formalism as an inductive\nbias for physically plausible neural network based video generation. The\nstructure of the Hamiltonian ensures conservation of a learned quantity (e.g.,\nenergy) and imposes a phase-space interpretation on the low-dimensional\nmanifold underlying the input video. While this interpretation has the\npotential to facilitate the integration of learned representations in\ndownstream tasks, existing methods are limited in their applicability as they\nrequire a structural prior for the configuration space at design time. In this\nwork, we present a GAN-based video generation pipeline with a learned\nconfiguration space map and Hamiltonian neural network motion model, to learn a\nrepresentation of the configuration space from data. We train our model with a\nphysics-inspired cyclic-coordinate loss function which encourages a minimal\nrepresentation of the configuration space and improves interpretability. We\ndemonstrate the efficacy and advantages of our approach on the Hamiltonian\nDynamics Suite Toy Physics dataset.\n","authors":["Christine Allen-Blanchette"],"pdf_url":"https://arxiv.org/pdf/2308.11216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11204v1","updated":"2023-08-22T05:41:20Z","published":"2023-08-22T05:41:20Z","title":"A Simple Framework for Multi-mode Spatial-Temporal Data Modeling","summary":" Spatial-temporal data modeling aims to mine the underlying spatial\nrelationships and temporal dependencies of objects in a system. However, most\nexisting methods focus on the modeling of spatial-temporal data in a single\nmode, lacking the understanding of multiple modes. Though very few methods have\nbeen presented to learn the multi-mode relationships recently, they are built\non complicated components with higher model complexities. In this paper, we\npropose a simple framework for multi-mode spatial-temporal data modeling to\nbring both effectiveness and efficiency together. Specifically, we design a\ngeneral cross-mode spatial relationships learning component to adaptively\nestablish connections between multiple modes and propagate information along\nthe learned connections. Moreover, we employ multi-layer perceptrons to capture\nthe temporal dependencies and channel correlations, which are conceptually and\ntechnically succinct. 
Experiments on three real-world datasets show that our\nmodel can consistently outperform the baselines with lower space and time\ncomplexity, opening up a promising direction for modeling spatial-temporal\ndata. The generalizability of the cross-mode spatial relationships learning\nmodule is also validated.\n","authors":["Zihang Liu","Le Yu","Tongyu Zhu","Leiei Sun"],"pdf_url":"https://arxiv.org/pdf/2308.11204v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11200v1","updated":"2023-08-22T05:23:04Z","published":"2023-08-22T05:23:04Z","title":"SegRNN: Segment Recurrent Neural Network for Long-Term Time Series\n Forecasting","summary":" RNN-based methods have faced challenges in the Long-term Time Series\nForecasting (LTSF) domain when dealing with excessively long look-back windows\nand forecast horizons. Consequently, the dominance in this domain has shifted\ntowards Transformer, MLP, and CNN approaches. The substantial number of\nrecurrent iterations are the fundamental reasons behind the limitations of RNNs\nin LTSF. To address these issues, we propose two novel strategies to reduce the\nnumber of iterations in RNNs for LTSF tasks: Segment-wise Iterations and\nParallel Multi-step Forecasting (PMF). RNNs that combine these strategies,\nnamely SegRNN, significantly reduce the required recurrent iterations for LTSF,\nresulting in notable improvements in forecast accuracy and inference speed.\nExtensive experiments demonstrate that SegRNN not only outperforms SOTA\nTransformer-based models but also reduces runtime and memory usage by more than\n78%. These achievements provide strong evidence that RNNs continue to excel in\nLTSF tasks and encourage further exploration of this domain with more RNN-based\napproaches. The source code is coming soon.\n","authors":["Shengsheng Lin","Weiwei Lin","Wentai Wu","Feiyu Zhao","Ruichao Mo","Haotong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11199v1","updated":"2023-08-22T05:21:31Z","published":"2023-08-22T05:21:31Z","title":"ConcatPlexer: Additional Dim1 Batching for Faster ViTs","summary":" Transformers have demonstrated tremendous success not only in the natural\nlanguage processing (NLP) domain but also the field of computer vision,\nigniting various creative approaches and applications. Yet, the superior\nperformance and modeling flexibility of transformers came with a severe\nincrease in computation costs, and hence several works have proposed methods to\nreduce this burden. Inspired by a cost-cutting method originally proposed for\nlanguage models, Data Multiplexing (DataMUX), we propose a novel approach for\nefficient visual recognition that employs additional dim1 batching (i.e.,\nconcatenation) that greatly improves the throughput with little compromise in\nthe accuracy. We first introduce a naive adaptation of DataMux for vision\nmodels, Image Multiplexer, and devise novel components to overcome its\nweaknesses, rendering our final model, ConcatPlexer, at the sweet spot between\ninference speed and accuracy. 
The ConcatPlexer was trained on ImageNet1K and\nCIFAR100 dataset and it achieved 23.5% less GFLOPs than ViT-B/16 with 69.5% and\n83.4% validation accuracy, respectively.\n","authors":["Donghoon Han","Seunghyeon Seo","Donghyeon Jeon","Jiho Jang","Chaerin Kong","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2308.11199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11197v1","updated":"2023-08-22T05:14:42Z","published":"2023-08-22T05:14:42Z","title":"Toward Generalizable Machine Learning Models in Speech, Language, and\n Hearing Sciences: Power Analysis and Sample Size Estimation","summary":" This study's first purpose is to provide quantitative evidence that would\nincentivize researchers to instead use the more robust method of nested\ncross-validation. The second purpose is to present methods and MATLAB codes for\ndoing power analysis for ML-based analysis during the design of a study. Monte\nCarlo simulations were used to quantify the interactions between the employed\ncross-validation method, the discriminative power of features, the\ndimensionality of the feature space, and the dimensionality of the model. Four\ndifferent cross-validations (single holdout, 10-fold, train-validation-test,\nand nested 10-fold) were compared based on the statistical power and\nstatistical confidence of the ML models. Distributions of the null and\nalternative hypotheses were used to determine the minimum required sample size\nfor obtaining a statistically significant outcome ({\\alpha}=0.05,\n1-\\b{eta}=0.8). Statistical confidence of the model was defined as the\nprobability of correct features being selected and hence being included in the\nfinal model. Our analysis showed that the model generated based on the single\nholdout method had very low statistical power and statistical confidence and\nthat it significantly overestimated the accuracy. Conversely, the nested\n10-fold cross-validation resulted in the highest statistical confidence and the\nhighest statistical power, while providing an unbiased estimate of the\naccuracy. The required sample size with a single holdout could be 50% higher\nthan what would be needed if nested cross-validation were used. Confidence in\nthe model based on nested cross-validation was as much as four times higher\nthan the confidence in the single holdout-based model. A computational model,\nMATLAB codes, and lookup tables are provided to assist researchers with\nestimating the sample size during the design of their future studies.\n","authors":["Hamzeh Ghasemzadeh","Robert E. Hillman","Daryush D. Mehta"],"pdf_url":"https://arxiv.org/pdf/2308.11197v1.pdf","comment":"Under review at JSLHR"},{"id":"http://arxiv.org/abs/2308.11192v1","updated":"2023-08-22T04:54:30Z","published":"2023-08-22T04:54:30Z","title":"Automatic Task Parallelization of Dataflow Graphs in ML/DL models","summary":" Several methods exist today to accelerate Machine Learning(ML) or\nDeep-Learning(DL) model performance for training and inference. However, modern\ntechniques that rely on various graph and operator parallelism methodologies\nrely on search space optimizations which are costly in terms of power and\nhardware usage. Especially in the case of inference, when the batch size is 1\nand execution is on CPUs or for power-constrained edge devices, current\ntechniques can become costly, complicated or inapplicable. To ameliorate this,\nwe present a Critical-Path-based Linear Clustering approach to exploit inherent\nparallel paths in ML dataflow graphs. 
Our task parallelization approach further\noptimizes the structure of graphs via cloning and prunes them via constant\npropagation and dead-code elimination. Contrary to other work, we generate\nreadable and executable parallel Pytorch+Python code from input ML models in\nONNX format via a new tool that we have built called {\bf Ramiel}. This allows\nus to benefit from other downstream acceleration techniques like intra-op\nparallelism and potentially pipeline parallelism. Our preliminary results on\nseveral ML graphs demonstrate up to 1.9$\times$ speedup over serial execution\nand outperform some of the current mechanisms in both compile and runtimes.\nLastly, our methods are lightweight and fast enough so that they can be used\neffectively for power and resource-constrained devices, while still enabling\ndownstream optimizations.\n","authors":["Srinjoy Das","Lawrence Rauchwerger"],"pdf_url":"https://arxiv.org/pdf/2308.11192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11189v1","updated":"2023-08-22T04:49:23Z","published":"2023-08-22T04:49:23Z","title":"Diversity Measures: Domain-Independent Proxies for Failure in Language\n Model Queries","summary":" Error prediction in large language models often relies on domain-specific\ninformation. In this paper, we present measures for quantification of error in\nthe response of a large language model based on the diversity of responses to a\ngiven prompt - hence independent of the underlying application. We describe how\nthree such measures - based on entropy, Gini impurity, and centroid distance -\ncan be employed. We perform a suite of experiments on multiple datasets and\ntemperature settings to demonstrate that these measures strongly correlate with\nthe probability of failure. Additionally, we present empirical results\ndemonstrating how these measures can be applied to few-shot prompting,\nchain-of-thought reasoning, and error detection.\n","authors":["Noel Ngu","Nathaniel Lee","Paulo Shakarian"],"pdf_url":"https://arxiv.org/pdf/2308.11189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10753v2","updated":"2023-08-22T04:41:17Z","published":"2023-02-17T06:59:45Z","title":"DTAAD: Dual Tcn-Attention Networks for Anomaly Detection in Multivariate\n Time Series Data","summary":" Anomaly detection techniques enable effective anomaly detection and diagnosis\nin multi-variate time series data, which are of major significance for today's\nindustrial applications. However, establishing an anomaly detection system that\ncan rapidly and accurately locate anomalies is a challenging problem due to the lack\nof outlier tags, the high dimensional complexity of the data, memory\nbottlenecks in the actual hardware, and the need for fast reasoning. We have\nproposed an anomaly detection and diagnosis model -- DTAAD -- in this paper, based\non Transformer and Dual Temporal Convolutional Network (TCN). Our overall model\nis an integrated design in which an autoregressive model (AR) is combined with\nautoencoder (AE) structures, and scaling methods and feedback mechanisms are\nintroduced to improve prediction accuracy and expand correlation differences.\nThe Dual TCN-Attention Network (DTA) constructed by us uses only a single\nlayer of Transformer encoder in our baseline experiment, making it an\nultra-lightweight model. Our extensive experiments on six public datasets\nvalidate that DTAAD exceeds the current most advanced baseline methods in both\ndetection and diagnostic performance. 
Specifically, DTAAD improved F1 scores by\n$8.38\\%$, and reduced training time by $99\\%$ compared to baseline. The code\nand training scripts are publicly on GitHub at\nhttps://github.com/Yu-Lingrui/DTAAD.\n","authors":["Lingrui Yu"],"pdf_url":"https://arxiv.org/pdf/2302.10753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.04552v2","updated":"2023-08-22T04:36:47Z","published":"2023-02-09T10:42:11Z","title":"Optimistic Online Mirror Descent for Bridging Stochastic and Adversarial\n Online Convex Optimization","summary":" Stochastically Extended Adversarial (SEA) model is introduced by Sachs et al.\n[2022] as an interpolation between stochastic and adversarial online convex\noptimization. Under the smoothness condition, they demonstrate that the\nexpected regret of optimistic follow-the-regularized-leader (FTRL) depends on\nthe cumulative stochastic variance $\\sigma_{1:T}^2$ and the cumulative\nadversarial variation $\\Sigma_{1:T}^2$ for convex functions. They also provide\na slightly weaker bound based on the maximal stochastic variance\n$\\sigma_{\\max}^2$ and the maximal adversarial variation $\\Sigma_{\\max}^2$ for\nstrongly convex functions. Inspired by their work, we investigate the\ntheoretical guarantees of optimistic online mirror descent (OMD) for the SEA\nmodel. For convex and smooth functions, we obtain the same\n$\\mathcal{O}(\\sqrt{\\sigma_{1:T}^2}+\\sqrt{\\Sigma_{1:T}^2})$ regret bound,\nwithout the convexity requirement of individual functions. For strongly convex\nand smooth functions, we establish an $\\mathcal{O}((\\sigma_{\\max}^2 +\n\\Sigma_{\\max}^2) \\log (\\sigma_{1:T}^2+\\Sigma_{1:T}^2))$ bound, better than\ntheir $\\mathcal{O}((\\sigma_{\\max}^2 + \\Sigma_{\\max}^2) \\log T)$ result. For\nexp-concave and smooth functions, we achieve a new\n$\\mathcal{O}(d\\log(\\sigma_{1:T}^2+\\Sigma_{1:T}^2))$ bound. Owing to the OMD\nframework, we broaden our work to study dynamic regret minimization and\nscenarios where the online functions are non-smooth. We establish the first\ndynamic regret guarantee for the SEA model with convex and smooth functions,\nwhich is more favorable than static regret bounds in non-stationary scenarios.\nFurthermore, to deal with non-smooth and convex functions in the SEA model, we\npropose novel algorithms building on optimistic OMD with an implicit update,\nwhich provably attain static regret and dynamic regret guarantees without\nsmoothness conditions.\n","authors":["Sijia Chen","Yu-Jie Zhang","Wei-Wei Tu","Peng Zhao","Lijun Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.04552v2.pdf","comment":"conference version appeared at ICML 2023; this extended version\n enriches the content with improved regret bounds for strongly convex\n functions, discussions on the optimism design for dynamic regret\n minimization, and extensions to non-smooth scenarios"},{"id":"http://arxiv.org/abs/2306.10715v2","updated":"2023-08-22T04:20:03Z","published":"2023-06-19T06:22:02Z","title":"Maximum Entropy Heterogeneous-Agent Mirror Learning","summary":" Multi-agent reinforcement learning (MARL) has been shown effective for\ncooperative games in recent years. 
However, existing state-of-the-art methods\nface challenges related to sample inefficiency, brittleness regarding\nhyperparameters, and the risk of converging to a suboptimal Nash Equilibrium.\nTo resolve these issues, in this paper, we propose a novel theoretical\nframework, named Maximum Entropy Heterogeneous-Agent Mirror Learning (MEHAML),\nthat leverages the maximum entropy principle to design maximum entropy MARL\nactor-critic algorithms. We prove that algorithms derived from the MEHAML\nframework enjoy the desired properties of the monotonic improvement of the\njoint maximum entropy objective and the convergence to quantal response\nequilibrium (QRE). The practicality of MEHAML is demonstrated by developing a\nMEHAML extension of the widely used RL algorithm, HASAC (for soft\nactor-critic), which shows significant improvements in exploration and\nrobustness on three challenging benchmarks: Multi-Agent MuJoCo, StarCraftII,\nand Google Research Football. Our results show that HASAC outperforms strong\nbaseline methods such as HATD3, HAPPO, QMIX, and MAPPO, thereby establishing\nthe new state of the art. See our project page at\nhttps://sites.google.com/view/mehaml.\n","authors":["Jiarong Liu","Yifan Zhong","Siyi Hu","Haobo Fu","Qiang Fu","Xiaojun Chang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2306.10715v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11179v1","updated":"2023-08-22T04:10:14Z","published":"2023-08-22T04:10:14Z","title":"A three in one bottom-up framework for simultaneous semantic\n segmentation, instance segmentation and classification of multi-organ nuclei\n in digital cancer histology","summary":" Simultaneous segmentation and classification of nuclei in digital histology\nplay an essential role in computer-assisted cancer diagnosis; however, it\nremains challenging. The highest achieved binary and multi-class Panoptic\nQuality (PQ) remains as low as 0.68 bPQ and 0.49 mPQ, respectively. It is due\nto the higher staining variability, variability across the tissue, rough\nclinical conditions, overlapping nuclei, and nuclear class imbalance. The\ngeneric deep-learning methods usually rely on end-to-end models, which fail to\naddress these problems associated explicitly with digital histology. In our\nprevious work, DAN-NucNet, we resolved these issues for semantic segmentation\nwith an end-to-end model. This work extends our previous model to simultaneous\ninstance segmentation and classification. We introduce additional decoder heads\nwith independent weighted losses, which produce semantic segmentation, edge\nproposals, and classification maps. We use the outputs from the three-head\nmodel to apply post-processing to produce the final segmentation and\nclassification. Our multi-stage approach utilizes edge proposals and semantic\nsegmentations compared to direct segmentation and classification strategies\nfollowed by most state-of-the-art methods. Due to this, we demonstrate a\nsignificant performance improvement in producing high-quality instance\nsegmentation and nuclei classification. We have achieved a 0.841 Dice score for\nsemantic segmentation, 0.713 bPQ scores for instance segmentation, and 0.633\nmPQ for nuclei classification. Our proposed framework is generalized across 19\ntypes of tissues. 
Furthermore, the framework is less complex compared to the\nstate-of-the-art.\n","authors":["Ibtihaj Ahmad","Syed Muhammad Israr","Zain Ul Islam"],"pdf_url":"https://arxiv.org/pdf/2308.11179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10522v2","updated":"2023-08-22T04:03:19Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v2.pdf","comment":"This paper is accepted by the journal Neural Networks (Elsevier)\n in 2023. A revised manuscript of arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2109.02344v2","updated":"2023-08-22T03:55:42Z","published":"2021-09-06T10:32:24Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning captures comprehensive information from\nmultiple views of a shared context. Recent works intuitively apply contrastive\nlearning (CL) to learn representations, regarded as a pairwise manner, which is\nstill scalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; and evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works research the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the information theoretical perspective and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided heuristic Progressive Multi-view\nCoding (IPMC). 
In the distribution-tier, IPMC aligns the distribution between\nviews to reduce view-specific noise. In the set-tier, IPMC builds self-adjusted\npools for contrasting, which utilizes a view filter to adaptively modify the\npools. Lastly, in the instance-tier, we adopt a designed unified loss to learn\ndiscriminative representations and reduce the gradient interference.\nTheoretically and empirically, we demonstrate the superiority of IPMC over\nstate-of-the-art methods.\n","authors":["Jiangmeng Li","Wenwen Qiang","Hang Gao","Bing Su","Farid Razzak","Jie Hu","Changwen Zheng","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2109.02344v2.pdf","comment":"We have uploaded a new version of this paper in arXiv:2308.10522, so\n that we have to withdraw this paper"},{"id":"http://arxiv.org/abs/2308.11162v1","updated":"2023-08-22T03:40:46Z","published":"2023-08-22T03:40:46Z","title":"A Preliminary Investigation into Search and Matching for Tumour\n Discrimination in WHO Breast Taxonomy Using Deep Networks","summary":" Breast cancer is one of the most common cancers affecting women worldwide.\nIt includes a group of malignant neoplasms with a variety of biological,\nclinical, and histopathological characteristics. There are more than 35\ndifferent histological forms of breast lesions that can be classified and\ndiagnosed histologically according to cell morphology, growth, and architecture\npatterns. Recently, deep learning, in the field of artificial intelligence, has\ndrawn a lot of attention for the computerized representation of medical images.\nSearchable digital atlases can provide pathologists with patch matching tools\nallowing them to search among evidently diagnosed and treated archival cases, a\ntechnology that may be regarded as a computational second opinion. In this study,\nwe indexed and analyzed the WHO breast taxonomy (Classification of Tumours 5th\nEd.) spanning 35 tumour types. We visualized all tumour types using deep\nfeatures extracted from a state-of-the-art deep learning model, pre-trained on\nmillions of diagnostic histopathology images from the TCGA repository.\nFurthermore, we test the concept of a digital \"atlas\" as a reference for search\nand matching with rare test cases. The patch similarity search within the WHO\nbreast taxonomy data reached over 88% accuracy when validating through\n\"majority vote\" and more than 91% accuracy when validating using top-n tumour\ntypes. These results show for the first time that complex relationships among\ncommon and rare breast lesions can be investigated using an indexed digital\narchive.\n","authors":["Abubakr Shafique","Ricardo Gonzalez","Liron Pantanowitz","Puay Hoon Tan","Alberto Machado","Ian A Cree","Hamid R. Tizhoosh"],"pdf_url":"https://arxiv.org/pdf/2308.11162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11155v1","updated":"2023-08-22T03:23:36Z","published":"2023-08-22T03:23:36Z","title":"xxMD: Benchmarking Neural Force Fields Using Extended Dynamics beyond\n Equilibrium","summary":" Neural force fields (NFFs) have gained prominence in computational chemistry\nas surrogate models, superseding quantum-chemistry calculations in ab initio\nmolecular dynamics. The prevalent benchmark for NFFs has been the MD17 dataset\nand its subsequent extension. These datasets predominantly comprise geometries\nfrom the equilibrium region of the ground electronic state potential energy\nsurface, sampling from direct adiabatic dynamics. 
However, many chemical\nreactions entail significant molecular deformations, notably bond breaking. We\ndemonstrate the constrained distribution of internal coordinates and energies\nin the MD17 datasets, underscoring their inadequacy for representing systems\nundergoing chemical reactions. Addressing this sampling limitation, we\nintroduce the xxMD (Extended Excited-state Molecular Dynamics) dataset, derived\nfrom non-adiabatic dynamics. This dataset encompasses energies and forces\nascertained from both multireference wave function theory and density\nfunctional theory. Furthermore, its nuclear configuration spaces authentically\ndepict chemical reactions, making xxMD a more chemically relevant dataset. Our\nre-assessment of equivariant models on the xxMD datasets reveals notably higher\nmean absolute errors than those reported for MD17 and its variants. This\nobservation underscores the challenges faced in crafting a generalizable NFF\nmodel with extrapolation capability. Our proposed xxMD-CASSCF and xxMD-DFT\ndatasets are available at \\url{https://github.com/zpengmei/xxMD}.\n","authors":["Zihan Pengmei","Junyu Liu","Yinan Shu"],"pdf_url":"https://arxiv.org/pdf/2308.11155v1.pdf","comment":"19 pages, many figures. Data available at\n \\url{https://github.com/zpengmei/xxMD}"},{"id":"http://arxiv.org/abs/2308.10608v2","updated":"2023-08-22T03:23:35Z","published":"2023-08-21T10:16:52Z","title":"FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly","summary":" While text-3D editing has made significant strides in leveraging score\ndistillation sampling, emerging approaches still fall short in delivering\nseparable, precise and consistent outcomes that are vital to content creation.\nIn response, we introduce FocalDreamer, a framework that merges base shape with\neditable parts according to text prompts for fine-grained editing within\ndesired regions. Specifically, equipped with geometry union and dual-path\nrendering, FocalDreamer assembles independent 3D parts into a complete object,\ntailored for convenient instance reuse and part-wise control. We propose\ngeometric focal loss and style consistency regularization, which encourage\nfocal fusion and congruent overall appearance. Furthermore, FocalDreamer\ngenerates high-fidelity geometry and PBR textures which are compatible with\nwidely-used graphics engines. Extensive experiments have highlighted the\nsuperior editing capabilities of FocalDreamer in both quantitative and\nqualitative evaluations.\n","authors":["Yuhan Li","Yishun Dou","Yue Shi","Yu Lei","Xuanhong Chen","Yi Zhang","Peng Zhou","Bingbing Ni"],"pdf_url":"https://arxiv.org/pdf/2308.10608v2.pdf","comment":"Project website: https://focaldreamer.github.io"},{"id":"http://arxiv.org/abs/2308.11154v1","updated":"2023-08-22T03:20:14Z","published":"2023-08-22T03:20:14Z","title":"Mobility-Aware Computation Offloading for Swarm Robotics using Deep\n Reinforcement Learning","summary":" Swarm robotics is envisioned to automate a large number of dirty, dangerous,\nand dull tasks. Robots have limited energy, computation capability, and\ncommunication resources. Therefore, current swarm robotics have a small number\nof robots, which can only provide limited spatio-temporal information. In this\npaper, we propose to leverage the mobile edge computing to alleviate the\ncomputation burden. We develop an effective solution based on a mobility-aware\ndeep reinforcement learning model at the edge server side for computing\nscheduling and resource. 
Our results show that the proposed approach can meet\ndelay requirements and guarantee computation precision by using minimum robot\nenergy.\n","authors":["Xiucheng Wang","Hongzhi Guo"],"pdf_url":"https://arxiv.org/pdf/2308.11154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08742v2","updated":"2023-08-22T03:19:16Z","published":"2023-08-17T02:33:43Z","title":"PMET: Precise Model Editing in a Transformer","summary":" Model editing techniques modify a minor proportion of knowledge in Large\nLanguage Models (LLMs) at a relatively low cost, which have demonstrated\nnotable success. Existing methods assume Transformer Layer (TL) hidden states\nare values of key-value memories of the Feed-Forward Network (FFN). They\nusually optimize the TL hidden states to memorize target knowledge and use it\nto update the weights of the FFN in LLMs. However, the information flow of TL\nhidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN,\nand residual connections. Existing methods neglect the fact that the TL hidden\nstates contains information not specifically required for FFN. Consequently,\nthe performance of model editing decreases. To achieve more precise model\nediting, we analyze hidden states of MHSA and FFN, finding that MHSA encodes\ncertain general knowledge extraction patterns. This implies that MHSA weights\ndo not require updating when new knowledge is introduced. Based on above\nfindings, we introduce PMET, which simultaneously optimizes Transformer\nComponent (TC, namely MHSA and FFN) hidden states, while only using the\noptimized TC hidden states of FFN to precisely update FFN weights. Our\nexperiments demonstrate that PMET exhibits state-of-the-art performance on both\nthe COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the\neffectiveness of our enhancements, further reinforcing the finding that the\nMHSA encodes certain general knowledge extraction patterns and indicating its\nstorage of a small amount of factual knowledge. Our code is available at\nhttps://github.com/xpq-tech/PMET.git.\n","authors":["Xiaopeng Li","Shasha Li","Shezheng Song","Jing Yang","Jun Ma","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2308.08742v2.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2308.11152v1","updated":"2023-08-22T03:13:57Z","published":"2023-08-22T03:13:57Z","title":"Energy-Efficient On-Board Radio Resource Management for Satellite\n Communications via Neuromorphic Computing","summary":" The latest satellite communication (SatCom) missions are characterized by a\nfully reconfigurable on-board software-defined payload, capable of adapting\nradio resources to the temporal and spatial variations of the system traffic.\nAs pure optimization-based solutions have shown to be computationally tedious\nand to lack flexibility, machine learning (ML)-based methods have emerged as\npromising alternatives. We investigate the application of energy-efficient\nbrain-inspired ML models for on-board radio resource management. Apart from\nsoftware simulation, we report extensive experimental results leveraging the\nrecently released Intel Loihi 2 chip. To benchmark the performance of the\nproposed model, we implement conventional convolutional neural networks (CNN)\non a Xilinx Versal VCK5000, and provide a detailed comparison of accuracy,\nprecision, recall, and energy efficiency for different traffic demands. 
Most\nnotably, for relevant workloads, spiking neural networks (SNNs) implemented on\nLoihi 2 yield higher accuracy, while reducing power consumption by more than\n100$\\times$ as compared to the CNN-based reference platform. Our findings point\nto the significant potential of neuromorphic computing and SNNs in supporting\non-board SatCom operations, paving the way for enhanced efficiency and\nsustainability in future SatCom systems.\n","authors":["Flor Ortiz","Nicolas Skatchkovsky","Eva Lagunas","Wallace A. Martins","Geoffrey Eappen","Saed Daoud","Osvaldo Simeone","Bipin Rajendran","Symeon Chatzinotas"],"pdf_url":"https://arxiv.org/pdf/2308.11152v1.pdf","comment":"currently under review at IEEE Transactions on Machine Learning in\n Communications and Networking"},{"id":"http://arxiv.org/abs/2308.11148v1","updated":"2023-08-22T03:10:40Z","published":"2023-08-22T03:10:40Z","title":"LLaMA-Reviewer: Advancing Code Review Automation with Large Language\n Models through Parameter-Efficient Fine-Tuning (Practical Experience Report)","summary":" The automation of code review activities, a long-standing pursuit in software\nengineering, has been primarily addressed by numerous domain-specific\npre-trained models. Despite their success, these models frequently demand\nextensive resources for pre-training from scratch. In contrast, Large Language\nModels (LLMs) provide an intriguing alternative, given their remarkable\ncapabilities when supplemented with domain-specific knowledge. However, their\npotential for automating code review tasks remains largely unexplored.\n In response to this research gap, we present LLaMA-Reviewer, an innovative\nframework that leverages the capabilities of LLaMA, a popular LLM, in the realm\nof code review. Mindful of resource constraints, this framework employs\nparameter-efficient fine-tuning (PEFT) methods, delivering high performance\nwhile using less than 1% of trainable parameters.\n An extensive evaluation of LLaMA-Reviewer is conducted on two diverse,\npublicly available datasets. Notably, even with the smallest LLaMA base model\nconsisting of 6.7B parameters and a limited number of tuning epochs,\nLLaMA-Reviewer equals the performance of existing code-review-focused models.\n The ablation experiments provide insights into the influence of various\nfine-tuning process components, including input representation, instruction\ntuning, and different PEFT methods. To foster continuous progress in this\nfield, the code and all PEFT-weight plugins have been made open-source.\n","authors":["Junyi Lu","Lei Yu","Xiaojia Li","Li Yang","Chun Zuo"],"pdf_url":"https://arxiv.org/pdf/2308.11148v1.pdf","comment":"Accepted to the 34th IEEE International Symposium on Software\n Reliability Engineering (ISSRE 2023)"},{"id":"http://arxiv.org/abs/2308.11144v1","updated":"2023-08-22T02:54:42Z","published":"2023-08-22T02:54:42Z","title":"Exploring Unsupervised Cell Recognition with Prior Self-activation Maps","summary":" The success of supervised deep learning models on cell recognition tasks\nrelies on detailed annotations. Many previous works have managed to reduce the\ndependency on labels. However, considering the large number of cells contained\nin a patch, costly and inefficient labeling is still inevitable. To this end,\nwe explored label-free methods for cell recognition. Prior self-activation maps\n(PSM) are proposed to generate pseudo masks as training targets. To be\nspecific, an activation network is trained with self-supervised learning. 
The\ngradient information in the shallow layers of the network is aggregated to\ngenerate prior self-activation maps. Afterward, a semantic clustering module is\nthen introduced as a pipeline to transform PSMs to pixel-level semantic pseudo\nmasks for downstream tasks. We evaluated our method on two histological\ndatasets: MoNuSeg (cell segmentation) and BCData (multi-class cell detection).\nCompared with other fully-supervised and weakly-supervised methods, our method\ncan achieve competitive performance without any manual annotations. Our simple\nbut effective framework can also achieve multi-class cell detection which can\nnot be done by existing unsupervised methods. The results show the potential of\nPSMs that might inspire other research to deal with the hunger for labels in\nmedical area.\n","authors":["Pingyi Chen","Chenglu Zhu","Zhongyi Shui","Jiatong Cai","Sunyi Zheng","Shichuan Zhang","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11144v1.pdf","comment":"MICCAI 2023. arXiv admin note: substantial text overlap with\n arXiv:2210.07862"},{"id":"http://arxiv.org/abs/2308.11142v1","updated":"2023-08-22T02:51:42Z","published":"2023-08-22T02:51:42Z","title":"Graph Encoding and Neural Network Approaches for Volleyball Analytics:\n From Game Outcome to Individual Play Predictions","summary":" This research aims to improve the accuracy of complex volleyball predictions\nand provide more meaningful insights to coaches and players. We introduce a\nspecialized graph encoding technique to add additional contact-by-contact\nvolleyball context to an already available volleyball dataset without any\nadditional data gathering. We demonstrate the potential benefits of using graph\nneural networks (GNNs) on this enriched dataset for three different volleyball\nprediction tasks: rally outcome prediction, set location prediction, and hit\ntype prediction. We compare the performance of our graph-based models to\nbaseline models and analyze the results to better understand the underlying\nrelationships in a volleyball rally. Our results show that the use of GNNs with\nour graph encoding yields a much more advanced analysis of the data, which\nnoticeably improves prediction results overall. We also show that these\nbaseline tasks can be significantly improved with simple adjustments, such as\nremoving blocked hits. Lastly, we demonstrate the importance of choosing a\nmodel architecture that will better extract the important information for a\ncertain task. Overall, our study showcases the potential strengths and\nweaknesses of using graph encodings in sports data analytics and hopefully will\ninspire future improvements in machine learning strategies across sports and\napplications by using graphbased encodings.\n","authors":["Rhys Tracy","Haotian Xia","Alex Rasla","Yuan-Fang Wang","Ambuj Singh"],"pdf_url":"https://arxiv.org/pdf/2308.11142v1.pdf","comment":"This paper is an extended version of the one accepted at the KDD 2023\n Workshop on Data Science and AI for Sports (DSAI4Sports), entitled\n 'RallyGraph: Specialized Graph Encoding for Enhanced Volleyball'"},{"id":"http://arxiv.org/abs/2308.11137v1","updated":"2023-08-22T02:34:47Z","published":"2023-08-22T02:34:47Z","title":"Towards Validating Long-Term User Feedbacks in Interactive\n Recommendation Systems","summary":" Interactive Recommender Systems (IRSs) have attracted a lot of attention, due\nto their ability to model interactive processes between users and recommender\nsystems. 
Numerous approaches have adopted Reinforcement Learning (RL)\nalgorithms, as these can directly maximize users' cumulative rewards. In IRS,\nresearchers commonly utilize publicly available review datasets to compare and\nevaluate algorithms. However, user feedback provided in public datasets merely\nincludes instant responses (e.g., a rating), with no inclusion of delayed\nresponses (e.g., the dwell time and the lifetime value). Thus, the question\nremains whether these review datasets are an appropriate choice to evaluate the\nlong-term effects of the IRS. In this work, we revisited experiments on IRS\nwith review datasets and compared RL-based models with a simple reward model\nthat greedily recommends the item with the highest one-step reward. Following\nextensive analysis, we can reveal three main findings: First, a simple greedy\nreward model consistently outperforms RL-based models in maximizing cumulative\nrewards. Second, applying higher weighting to long-term rewards leads to a\ndegradation of recommendation performance. Third, user feedbacks have mere\nlong-term effects on the benchmark datasets. Based on our findings, we conclude\nthat a dataset has to be carefully verified and that a simple greedy baseline\nshould be included for a proper evaluation of RL-based IRS approaches.\n","authors":["Hojoon Lee","Dongyoon Hwang","Kyushik Min","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2308.11137v1.pdf","comment":"Accepted to SIGIR'22"},{"id":"http://arxiv.org/abs/2308.09878v2","updated":"2023-08-22T02:32:01Z","published":"2023-08-19T02:11:49Z","title":"DatasetEquity: Are All Samples Created Equal? In The Quest For Equity\n Within Datasets","summary":" Data imbalance is a well-known issue in the field of machine learning,\nattributable to the cost of data collection, the difficulty of labeling, and\nthe geographical distribution of the data. In computer vision, bias in data\ndistribution caused by image appearance remains highly unexplored. Compared to\ncategorical distributions using class labels, image appearance reveals complex\nrelationships between objects beyond what class labels provide. Clustering deep\nperceptual features extracted from raw pixels gives a richer representation of\nthe data. This paper presents a novel method for addressing data imbalance in\nmachine learning. The method computes sample likelihoods based on image\nappearance using deep perceptual embeddings and clustering. It then uses these\nlikelihoods to weigh samples differently during training with a proposed\n$\\textbf{Generalized Focal Loss}$ function. This loss can be easily integrated\nwith deep learning algorithms. Experiments validate the method's effectiveness\nacross autonomous driving vision datasets including KITTI and nuScenes. The\nloss function improves state-of-the-art 3D object detection methods, achieving\nover $200\\%$ AP gains on under-represented classes (Cyclist) in the KITTI\ndataset. The results demonstrate the method is generalizable, complements\nexisting techniques, and is particularly beneficial for smaller datasets and\nrare classes. 
Code is available at:\nhttps://github.com/towardsautonomy/DatasetEquity\n","authors":["Shubham Shrivastava","Xianling Zhang","Sushruth Nagesh","Armin Parchami"],"pdf_url":"https://arxiv.org/pdf/2308.09878v2.pdf","comment":"ICCV 2023 Workshop"},{"id":"http://arxiv.org/abs/2308.11129v1","updated":"2023-08-22T02:22:34Z","published":"2023-08-22T02:22:34Z","title":"Transformers for Capturing Multi-level Graph Structure using\n Hierarchical Distances","summary":" Graph transformers need strong inductive biases to derive meaningful\nattention scores. Yet, current proposals rarely address methods capturing\nlonger ranges, hierarchical structures, or community structures, as they appear\nin various graphs such as molecules, social networks, and citation networks. In\nthis paper, we propose a hierarchy-distance structural encoding (HDSE), which\nmodels a hierarchical distance between the nodes in a graph focusing on its\nmulti-level, hierarchical nature. In particular, this yields a framework which\ncan be flexibly integrated with existing graph transformers, allowing for\nsimultaneous application with other positional representations. Through\nextensive experiments on 12 real-world datasets, we demonstrate that our HDSE\nmethod successfully enhances various types of baseline transformers, achieving\nstate-of-the-art empirical performances on 10 benchmark datasets.\n","authors":["Yuankai Luo"],"pdf_url":"https://arxiv.org/pdf/2308.11129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11127v1","updated":"2023-08-22T02:17:34Z","published":"2023-08-22T02:17:34Z","title":"How Expressive are Graph Neural Networks in Recommendation?","summary":" Graph Neural Networks (GNNs) have demonstrated superior performance on\nvarious graph learning tasks, including recommendation, where they leverage\nuser-item collaborative filtering signals in graphs. However, theoretical\nformulations of their capability are scarce, despite their empirical\neffectiveness in state-of-the-art recommender models. Recently, research has\nexplored the expressiveness of GNNs in general, demonstrating that message\npassing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that\nGNNs combined with random node initialization are universal. Nevertheless, the\nconcept of \"expressiveness\" for GNNs remains vaguely defined. Most existing\nworks adopt the graph isomorphism test as the metric of expressiveness, but\nthis graph-level task may not effectively assess a model's ability in\nrecommendation, where the objective is to distinguish nodes of different\ncloseness. In this paper, we provide a comprehensive theoretical analysis of\nthe expressiveness of GNNs in recommendation, considering three levels of\nexpressiveness metrics: graph isomorphism (graph-level), node automorphism\n(node-level), and topological closeness (link-level). We propose the\ntopological closeness metric to evaluate GNNs' ability to capture the\nstructural distance between nodes, which aligns closely with the objective of\nrecommendation. To validate the effectiveness of this new metric in evaluating\nrecommendation performance, we introduce a learning-less GNN algorithm that is\noptimal on the new metric and can be optimal on the node-level metric with\nsuitable modification. We conduct extensive experiments comparing the proposed\nalgorithm against various types of state-of-the-art GNN models to explore the\nexplainability of the new metric in the recommendation task. 
For\nreproducibility, implementation codes are available at\nhttps://github.com/HKUDS/GTE.\n","authors":["Xuheng Cai","Lianghao Xia","Xubin Ren","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11127v1.pdf","comment":"32nd ACM International Conference on Information and Knowledge\n Management (CIKM) 2023"},{"id":"http://arxiv.org/abs/2308.11119v1","updated":"2023-08-22T01:55:03Z","published":"2023-08-22T01:55:03Z","title":"Random Word Data Augmentation with CLIP for Zero-Shot Anomaly Detection","summary":" This paper presents a novel method that leverages a visual-language model,\nCLIP, as a data source for zero-shot anomaly detection. Tremendous efforts have\nbeen put towards developing anomaly detectors due to their potential industrial\napplications. Considering the difficulty in acquiring various anomalous samples\nfor training, most existing methods train models with only normal samples and\nmeasure discrepancies from the distribution of normal samples during inference,\nwhich requires training a model for each object category. The problem of this\ninefficient training requirement has been tackled by designing a CLIP-based\nanomaly detector that applies prompt-guided classification to each part of an\nimage in a sliding window manner. However, the method still suffers from the\nlabor of careful prompt ensembling with known object categories. To overcome\nthe issues above, we propose leveraging CLIP as a data source for training. Our\nmethod generates text embeddings with the text encoder in CLIP with typical\nprompts that include words of normal and anomaly. In addition to these words,\nwe insert several randomly generated words into prompts, which enables the\nencoder to generate a diverse set of normal and anomalous samples. Using the\ngenerated embeddings as training data, a feed-forward neural network learns to\nextract features of normal and anomaly from CLIP's embeddings, and as a result,\na category-agnostic anomaly detector can be obtained without any training\nimages. Experimental results demonstrate that our method achieves\nstate-of-the-art performance without laborious prompt ensembling in zero-shot\nsetups.\n","authors":["Masato Tamura"],"pdf_url":"https://arxiv.org/pdf/2308.11119v1.pdf","comment":"Accepted to BMVC2023"},{"id":"http://arxiv.org/abs/2308.09895v2","updated":"2023-08-22T01:51:54Z","published":"2023-08-19T03:19:01Z","title":"Knowledge Transfer from High-Resource to Low-Resource Programming\n Languages for Code LLMs","summary":" Over the past few years, Large Language Models of Code (Code LLMs) have\nstarted to have a significant impact on programming practice. Code LLMs are\nalso emerging as a building block for research in programming languages and\nsoftware engineering. However, the quality of code produced by a Code LLM\nvaries significantly by programming languages. Code LLMs produce impressive\nresults on programming languages that are well represented in their training\ndata (e.g., Java, Python, or JavaScript), but struggle with low-resource\nlanguages, like OCaml and Racket.\n This paper presents an effective approach for boosting the performance of\nCode LLMs on low-resource languages using semi-synthetic data. Our approach\ngenerates high-quality datasets for low-resource languages, which can then be\nused to fine-tune any pretrained Code LLM. Our approach, called MultiPL-T,\ntranslates training data from high-resource languages into training data for\nlow-resource languages. 
We apply our approach to generate tens of thousands of\nnew, validated training items for Racket, OCaml, and Lua from Python. Moreover,\nwe use an open dataset (The Stack) and model (StarCoderBase), which allow us to\ndecontaminate benchmarks and train models on this data without violating the\nmodel license.\n With MultiPL-T generated data, we present fine-tuned versions of\nStarCoderBase that achieve state-of-the-art performance for Racket, OCaml, and\nLua on benchmark problems. For Lua, our fine-tuned model achieves the same\nperformance as StarCoderBase as Python -- a very high-resource language -- on\nthe MultiPL-E benchmarks. For Racket and OCaml, we double their performance on\nMultiPL-E, bringing their performance close to higher-resource languages such\nas Ruby and C#.\n","authors":["Federico Cassano","John Gouwar","Francesca Lucchetti","Claire Schlesinger","Carolyn Jane Anderson","Michael Greenberg","Abhinav Jangda","Arjun Guha"],"pdf_url":"https://arxiv.org/pdf/2308.09895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09996v3","updated":"2023-08-22T01:40:44Z","published":"2022-10-18T17:01:35Z","title":"Perceptual Grouping in Contrastive Vision-Language Models","summary":" Recent advances in zero-shot image recognition suggest that vision-language\nmodels learn generic visual representations with a high degree of semantic\ninformation that may be arbitrarily probed with natural language phrases.\nUnderstanding an image, however, is not just about understanding what content\nresides within an image, but importantly, where that content resides. In this\nwork we examine how well vision-language models are able to understand where\nobjects reside within an image and group together visually related parts of the\nimagery. We demonstrate how contemporary vision and language representation\nlearning models based on contrastive losses and large web-based data capture\nlimited object localization information. We propose a minimal set of\nmodifications that results in models that uniquely learn both semantic and\nspatial information. We measure this performance in terms of zero-shot image\nrecognition, unsupervised bottom-up and top-down semantic segmentations, as\nwell as robustness analyses. We find that the resulting model achieves\nstate-of-the-art results in terms of unsupervised segmentation, and demonstrate\nthat the learned representations are uniquely robust to spurious correlations\nin datasets designed to probe the causal behavior of vision models.\n","authors":["Kanchana Ranasinghe","Brandon McKinzie","Sachin Ravi","Yinfei Yang","Alexander Toshev","Jonathon Shlens"],"pdf_url":"https://arxiv.org/pdf/2210.09996v3.pdf","comment":"Accepted and presented at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.10856v2","updated":"2023-08-22T01:31:28Z","published":"2023-08-21T16:50:59Z","title":"Majorana Demonstrator Data Release for AI/ML Applications","summary":" The enclosed data release consists of a subset of the calibration data from\nthe Majorana Demonstrator experiment. Each Majorana event is accompanied by raw\nGermanium detector waveforms, pulse shape discrimination cuts, and calibrated\nfinal energies, all shared in an HDF5 file format along with relevant metadata.\nThis release is specifically designed to support the training and testing of\nArtificial Intelligence (AI) and Machine Learning (ML) algorithms upon our\ndata. This document is structured as follows. 
Section I provides an overview of\nthe dataset's content and format; Section II outlines the location of this\ndataset and the method for accessing it; Section III presents the NPML Machine\nLearning Challenge associated with this dataset; Section IV contains a\ndisclaimer from the Majorana collaboration regarding the use of this dataset;\nAppendix A contains technical details of this data release. Please direct\nquestions about the material provided within this release to liaobo77@ucsd.edu\n(A. Li).\n","authors":["I. J. Arnquist","F. T. Avignone III","A. S. Barabash","C. J. Barton","K. H. Bhimani","E. Blalock","B. Bos","M. Busch","M. Buuck","T. S. Caldwell","Y. -D. Chan","C. D. Christofferson","P. -H. Chu","M. L. Clark","C. Cuesta","J. A. Detwiler","Yu. Efremenko","H. Ejiri","S. R. Elliott","N. Fuad","G. K. Giovanetti","M. P. Green","J. Gruszko","I. S. Guinn","V. E. Guiseppe","C. R. Haufe","R. Henning","D. Hervas Aguilar","E. W. Hoppe","A. Hostiuc","M. F. Kidd","I. Kim","R. T. Kouzes","T. E. Lannen V","A. Li","J. M. Lopez-Castano","R. D. Martin","R. Massarczyk","S. J. Meijer","S. Mertens","T. K. Oli","L. S. Paudel","W. Pettus","A. W. P. Poon","B. Quenallata","D. C. Radford","A. L. Reine","K. Rielage","N. W. Ruof","D. C. Schaper","S. J. Schleich","D. Tedeschi","R. L. Varner","S. Vasilyev","S. L. Watkins","J. F. Wilkerson","C. Wiseman","W. Xu","C. -H. Yu","B. X. Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.10856v2.pdf","comment":"Zenodo DOI: https://doi.org/10.5281/zenodo.8257027"},{"id":"http://arxiv.org/abs/2308.11112v1","updated":"2023-08-22T01:27:04Z","published":"2023-08-22T01:27:04Z","title":"Development of a Novel Quantum Pre-processing Filter to Improve Image\n Classification Accuracy of Neural Network Models","summary":" This paper proposes a novel quantum pre-processing filter (QPF) to improve\nthe image classification accuracy of neural network (NN) models. A simple four\nqubit quantum circuit that uses Y rotation gates for encoding and two\ncontrolled NOT gates for creating correlation among the qubits is applied as a\nfeature extraction filter prior to passing data into the fully connected NN\narchitecture. By applying the QPF approach, the results show that the image\nclassification accuracy based on the MNIST (handwritten 10 digits) and the\nEMNIST (handwritten 47 class digits and letters) datasets can be improved, from\n92.5% to 95.4% and from 68.9% to 75.9%, respectively. These improvements were\nobtained without introducing extra model parameters or optimizations in the\nmachine learning process. However, tests performed on the developed QPF\napproach against a relatively complex GTSRB dataset with 43 distinct class\nreal-life traffic sign images showed a degradation in the classification\naccuracy. Considering this result, further research into the understanding and\nthe design of a more suitable quantum circuit approach for image classification\nneural networks could be explored utilizing the baseline method proposed in\nthis paper.\n","authors":["Farina Riaz","Shahab Abdulla","Hajime Suzuki","Srinjoy Ganguly","Ravinesh C. Deo","Susan Hopkins"],"pdf_url":"https://arxiv.org/pdf/2308.11112v1.pdf","comment":"13 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.11111v1","updated":"2023-08-22T01:24:14Z","published":"2023-08-22T01:24:14Z","title":"CAME: Contrastive Automated Model Evaluation","summary":" The Automated Model Evaluation (AutoEval) framework entertains the\npossibility of evaluating a trained machine learning model without resorting to\na labeled testing set. 
Despite the promise and some decent results, the\nexisting AutoEval methods heavily rely on computing distribution shifts between\nthe unlabelled testing set and the training set. We believe this reliance on\nthe training set becomes another obstacle in shipping this technology to\nreal-world ML development. In this work, we propose Contrastive Automatic Model\nEvaluation (CAME), a novel AutoEval framework that is rid of involving training\nset in the loop. The core idea of CAME bases on a theoretical analysis which\nbonds the model performance with a contrastive loss. Further, with extensive\nempirical validation, we manage to set up a predictable relationship between\nthe two, simply by deducing on the unlabeled/unseen testing set. The resulting\nframework CAME establishes a new SOTA results for AutoEval by surpassing prior\nwork significantly.\n","authors":["Ru Peng","Qiuyang Duan","Haobo Wang","Jiachen Ma","Yanbo Jiang","Yongjun Tu","Xiu Jiang","Junbo Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11111v1.pdf","comment":"ICCV2023 main conference"},{"id":"http://arxiv.org/abs/2209.09626v4","updated":"2023-08-22T01:13:51Z","published":"2022-09-14T20:01:22Z","title":"Sequence Learning Using Equilibrium Propagation","summary":" Equilibrium Propagation (EP) is a powerful and more bio-plausible alternative\nto conventional learning frameworks such as backpropagation. The effectiveness\nof EP stems from the fact that it relies only on local computations and\nrequires solely one kind of computational unit during both of its training\nphases, thereby enabling greater applicability in domains such as bio-inspired\nneuromorphic computing. The dynamics of the model in EP is governed by an\nenergy function and the internal states of the model consequently converge to a\nsteady state following the state transition rules defined by the same. However,\nby definition, EP requires the input to the model (a convergent RNN) to be\nstatic in both the phases of training. Thus it is not possible to design a\nmodel for sequence classification using EP with an LSTM or GRU like\narchitecture. In this paper, we leverage recent developments in modern hopfield\nnetworks to further understand energy based models and develop solutions for\ncomplex sequence classification tasks using EP while satisfying its convergence\ncriteria and maintaining its theoretical similarities with recurrent\nbackpropagation. We explore the possibility of integrating modern hopfield\nnetworks as an attention mechanism with convergent RNN models used in EP,\nthereby extending its applicability for the first time on two different\nsequence classification tasks in natural language processing viz. sentiment\nanalysis (IMDB dataset) and natural language inference (SNLI dataset).\n","authors":["Malyaban Bal","Abhronil Sengupta"],"pdf_url":"https://arxiv.org/pdf/2209.09626v4.pdf","comment":"Accepted at IJCAI 2023"},{"id":"http://arxiv.org/abs/2308.11103v1","updated":"2023-08-22T00:57:36Z","published":"2023-08-22T00:57:36Z","title":"Anonymity at Risk? Assessing Re-Identification Capabilities of Large\n Language Models","summary":" Anonymity of both natural and legal persons in court rulings is a critical\naspect of privacy protection in the European Union and Switzerland. With the\nadvent of LLMs, concerns about large-scale re-identification of anonymized\npersons are growing. 
In accordance with the Federal Supreme Court of\nSwitzerland, we explore the potential of LLMs to re-identify individuals in\ncourt rulings by constructing a proof-of-concept using actual legal data from\nthe Swiss federal supreme court. Following the initial experiment, we\nconstructed an anonymized Wikipedia dataset as a more rigorous testing ground\nto further investigate the findings. With the introduction and application of\nthe new task of re-identifying people in texts, we also introduce new metrics\nto measure performance. We systematically analyze the factors that influence\nsuccessful re-identifications, identifying model size, input length, and\ninstruction tuning among the most critical determinants. Despite high\nre-identification rates on Wikipedia, even the best LLMs struggled with court\ndecisions. The complexity is attributed to the lack of test datasets, the\nnecessity for substantial training resources, and data sparsity in the\ninformation used for re-identification. In conclusion, this study demonstrates\nthat re-identification using LLMs may not be feasible for now, but as the\nproof-of-concept on Wikipedia showed, it might become possible in the future.\nWe hope that our system can help enhance the confidence in the security of\nanonymized decisions, thus leading to the courts being more confident to\npublish decisions.\n","authors":["Alex Nyffenegger","Matthias Stürmer","Joel Niklaus"],"pdf_url":"https://arxiv.org/pdf/2308.11103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08158v3","updated":"2023-08-22T00:55:21Z","published":"2023-06-13T22:07:54Z","title":"Survey on Sociodemographic Bias in Natural Language Processing","summary":" Deep neural networks often learn unintended bias during training, which might\nhave harmful effects when deployed in real-world settings. This work surveys\n214 papers related to sociodemographic bias in natural language processing\n(NLP). In this study, we aim to provide a more comprehensive understanding of\nthe similarities and differences among approaches to sociodemographic bias in\nNLP. To better understand the distinction between bias and real-world harm, we\nturn to ideas from psychology and behavioral economics to propose a definition\nfor sociodemographic bias. We identify three main categories of NLP bias\nresearch: types of bias, quantifying bias, and debiasing techniques. We\nhighlight the current trends in quantifying bias and debiasing techniques,\noffering insights into their strengths and weaknesses. We conclude that current\napproaches on quantifying bias face reliability issues, that many of the bias\nmetrics do not relate to real-world bias, and that debiasing techniques need to\nfocus more on training methods. Finally, we provide recommendations for future\nwork.\n","authors":["Vipul Gupta","Pranav Narayanan Venkit","Shomir Wilson","Rebecca J. Passonneau"],"pdf_url":"https://arxiv.org/pdf/2306.08158v3.pdf","comment":"23 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.11098v1","updated":"2023-08-22T00:43:14Z","published":"2023-08-22T00:43:14Z","title":"Explicability and Inexplicability in the Interpretation of Quantum\n Neural Networks","summary":" Interpretability of artificial intelligence (AI) methods, particularly deep\nneural networks, is of great interest due to the widespread use of AI-backed\nsystems, which often have unexplainable behavior. The interpretability of such\nmodels is a crucial component of building trusted systems. 
Many methods exist\nto approach this problem, but they do not obviously generalize to the quantum\nsetting. Here we explore the interpretability of quantum neural networks using\nlocal model-agnostic interpretability measures of quantum and classical neural\nnetworks. We introduce the concept of the band of inexplicability, representing\nthe interpretable region in which data samples have no explanation, likely\nvictims of inherently random quantum measurements. We see this as a step toward\nunderstanding how to build responsible and accountable quantum AI models.\n","authors":["Lirandë Pira","Chris Ferrie"],"pdf_url":"https://arxiv.org/pdf/2308.11098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09767v3","updated":"2023-08-22T00:22:42Z","published":"2023-01-24T00:32:56Z","title":"Truveta Mapper: A Zero-shot Ontology Alignment Framework","summary":" In this paper, a new perspective is suggested for unsupervised Ontology\nMatching (OM) or Ontology Alignment (OA) by treating it as a translation task.\nOntologies are represented as graphs, and the translation is performed from a\nnode in the source ontology graph to a path in the target ontology graph. The\nproposed framework, Truveta Mapper (TM), leverages a multi-task\nsequence-to-sequence transformer model to perform alignment across multiple\nontologies in a zero-shot, unified and end-to-end manner. Multi-tasking enables\nthe model to implicitly learn the relationship between different ontologies via\ntransfer-learning without requiring any explicit cross-ontology manually\nlabeled data. This also enables the formulated framework to outperform existing\nsolutions for both runtime latency and alignment quality. The model is\npre-trained and fine-tuned only on publicly available text corpus and\ninner-ontologies data. The proposed solution outperforms state-of-the-art\napproaches, Edit-Similarity, LogMap, AML, BERTMap, and the recently presented\nnew OM frameworks in Ontology Alignment Evaluation Initiative (OAEI22), offers\nlog-linear complexity, and overall makes the OM task efficient and more\nstraightforward without much post-processing involving mapping extension or\nmapping repair. We are open sourcing our solution.\n","authors":["Mariyam Amir","Murchana Baruah","Mahsa Eslamialishah","Sina Ehsani","Alireza Bahramali","Sadra Naddaf-Sh","Saman Zarandioon"],"pdf_url":"https://arxiv.org/pdf/2301.09767v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11093v1","updated":"2023-08-22T00:21:32Z","published":"2023-08-22T00:21:32Z","title":"Video OWL-ViT: Temporally-consistent open-world localization in video","summary":" We present an architecture and a training recipe that adapts pre-trained\nopen-world image models to localization in videos. Understanding the open\nvisual world (without being constrained by fixed label spaces) is crucial for\nmany real-world vision tasks. Contrastive pre-training on large image-text\ndatasets has recently led to significant improvements for image-level tasks.\nFor more structured tasks involving object localization applying pre-trained\nmodels is more challenging. This is particularly true for video tasks, where\ntask-specific data is limited. We show successful transfer of open-world models\nby building on the OWL-ViT open-vocabulary detection model and adapting it to\nvideo by adding a transformer decoder. The decoder propagates object\nrepresentations recurrently through time by using the output tokens for one\nframe as the object queries for the next. 
Our model is end-to-end trainable on\nvideo data and enjoys improved temporal consistency compared to\ntracking-by-detection baselines, while retaining the open-world capabilities of\nthe backbone detector. We evaluate our model on the challenging TAO-OW\nbenchmark and demonstrate that open-world capabilities, learned from\nlarge-scale image-text pre-training, can be transferred successfully to\nopen-world localization across diverse videos.\n","authors":["Georg Heigold","Matthias Minderer","Alexey Gritsenko","Alex Bewley","Daniel Keysers","Mario Lučić","Fisher Yu","Thomas Kipf"],"pdf_url":"https://arxiv.org/pdf/2308.11093v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2305.19370v2","updated":"2023-08-22T00:19:05Z","published":"2023-05-30T19:25:51Z","title":"Blockwise Parallel Transformer for Long Context Large Models","summary":" Transformers have emerged as the cornerstone of state-of-the-art natural\nlanguage processing models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands posed by the\nself-attention mechanism and the large feedforward network in Transformers\nlimit their ability to handle long sequences, thereby creating challenges for\ntasks involving multiple long sequences or long-term dependencies. We present a\ndistinct approach, Blockwise Parallel Transformer (BPT), that leverages\nblockwise computation of self-attention and feedforward network fusion to\nminimize memory costs. By processing longer input sequences while maintaining\nmemory efficiency, BPT enables training sequences up to 32 times longer than\nvanilla Transformers and 2 to 4 times longer than previous memory-efficient\nmethods. Extensive experiments on language modeling and reinforcement learning\ntasks demonstrate the effectiveness of BPT in reducing memory requirements and\nimproving performance.\n","authors":["Hao Liu","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2305.19370v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11090v1","updated":"2023-08-22T00:10:23Z","published":"2023-08-22T00:10:23Z","title":"Addressing Fairness and Explainability in Image Classification Using\n Optimal Transport","summary":" Algorithmic Fairness and the explainability of potentially unfair outcomes\nare crucial for establishing trust and accountability of Artificial\nIntelligence systems in domains such as healthcare and policing. Though\nsignificant advances have been made in each of the fields separately, achieving\nexplainability in fairness applications remains challenging, particularly so in\ndomains where deep neural networks are used. At the same time, ethical\ndata-mining has become ever more relevant, as it has been shown countless times\nthat fairness-unaware algorithms result in biased outcomes. Current approaches\nfocus on mitigating biases in the outcomes of the model, but few attempts have\nbeen made to try to explain \\emph{why} a model is biased. To bridge this gap,\nwe propose a comprehensive approach that leverages optimal transport theory to\nuncover the causes and implications of biased regions in images, which easily\nextends to tabular data as well. Through the use of Wasserstein barycenters, we\nobtain scores that are independent of a sensitive variable but keep their\nmarginal orderings. This step ensures predictive accuracy but also helps us to\nrecover the regions most associated with the generation of the biases. 
Our\nfindings hold significant implications for the development of trustworthy and\nunbiased AI systems, fostering transparency, accountability, and fairness in\ncritical decision-making scenarios across diverse domains.\n","authors":["Philipp Ratz","François Hu","Arthur Charpentier"],"pdf_url":"https://arxiv.org/pdf/2308.11090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11836v1","updated":"2023-08-22T23:49:04Z","published":"2023-08-22T23:49:04Z","title":"Characterizing normal perinatal development of the human brain\n structural connectivity","summary":" Early brain development is characterized by the formation of a highly\norganized structural connectome. The interconnected nature of this connectome\nunderlies the brain's cognitive abilities and influences its response to\ndiseases and environmental factors. Hence, quantitative assessment of\nstructural connectivity in the perinatal stage is useful for studying normal\nand abnormal neurodevelopment. However, estimation of the connectome from\ndiffusion MRI data involves complex computations. For the perinatal period,\nthese computations are further challenged by the rapid brain development and\nimaging difficulties. Combined with high inter-subject variability, these\nfactors make it difficult to chart the normal development of the structural\nconnectome. As a result, there is a lack of reliable normative baselines of\nstructural connectivity metrics at this critical stage in brain development. In\nthis study, we developed a computational framework, based on spatio-temporal\naveraging, for determining such baselines. We used this framework to analyze\nthe structural connectivity between 33 and 44 postmenstrual weeks using data\nfrom 166 subjects. Our results unveiled clear and strong trends in the\ndevelopment of structural connectivity in perinatal stage. Connection weighting\nbased on fractional anisotropy and neurite density produced the most consistent\nresults. We observed increases in global and local efficiency, a decrease in\ncharacteristic path length, and widespread strengthening of the connections\nwithin and across brain lobes and hemispheres. We also observed asymmetry\npatterns that were consistent between different connection weighting\napproaches. The new computational method and results are useful for assessing\nnormal and abnormal development of the structural connectome early in life.\n","authors":["Yihan Wu","Lana Vasung","Camilo Calixto","Ali Gholipour","Davood Karimi"],"pdf_url":"https://arxiv.org/pdf/2308.11836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11834v1","updated":"2023-08-22T23:46:21Z","published":"2023-08-22T23:46:21Z","title":"Performance Comparison and Implementation of Bayesian Variants for\n Network Intrusion Detection","summary":" Bayesian classifiers perform well when each of the features is completely\nindependent of the other which is not always valid in real world application.\nThe aim of this study is to implement and compare the performances of each\nvariant of Bayesian classifier (Multinomial, Bernoulli, and Gaussian) on\nanomaly detection in network intrusion, and to investigate whether there is any\nassociation between each variant assumption and their performance. Our\ninvestigation showed that each variant of Bayesian algorithm blindly follows\nits assumption regardless of feature property, and that the assumption is the\nsingle most important factor that influences their accuracy. 
Experimental\nresults show that Bernoulli has accuracy of 69.9% test (71% train), Multinomial\nhas accuracy of 31.2% test (31.2% train), while Gaussian has accuracy of 81.69%\ntest (82.84% train). Going deeper, we investigated and found that each Naive\nBayes variants performances and accuracy is largely due to each classifier\nassumption, Gaussian classifier performed best on anomaly detection due to its\nassumption that features follow normal distributions which are continuous,\nwhile multinomial classifier have a dismal performance as it simply assumes\ndiscreet and multinomial distribution.\n","authors":["Tosin Ige","Christopher Kiekintveld"],"pdf_url":"https://arxiv.org/pdf/2308.11834v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.11827v1","updated":"2023-08-22T23:18:53Z","published":"2023-08-22T23:18:53Z","title":"Exploring the Effectiveness of GPT Models in Test-Taking: A Case Study\n of the Driver's License Knowledge Test","summary":" Large language models such as Open AI's Generative Pre-trained Transformer\n(GPT) models are proficient at answering questions, but their knowledge is\nconfined to the information present in their training data. This limitation\nrenders them ineffective when confronted with questions about recent\ndevelopments or non-public documents. Our research proposes a method that\nenables GPT models to answer questions by employing context from an information\nsource not previously included in their training data. The methodology includes\npreprocessing of contextual information, the embedding of contexts and queries,\nconstructing prompt through the integration of context embeddings, and\ngenerating answers using GPT models. We applied this method in a controlled\ntest scenario using the California Driver's Handbook as the information source.\nThe GPT-3 model achieved a 96% passing score on a set of 50 sample driving\nknowledge test questions. In contrast, without context, the model's passing\nscore fell to 82%. However, the model still fails to answer some questions\ncorrectly even with providing library of context, highlighting room for\nimprovement. The research also examined the impact of prompt length and context\nformat, on the model's performance. Overall, the study provides insights into\nthe limitations and potential improvements for GPT models in question-answering\ntasks.\n","authors":["Saba Rahimi","Tucker Balch","Manuela Veloso"],"pdf_url":"https://arxiv.org/pdf/2308.11827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11825v1","updated":"2023-08-22T23:12:17Z","published":"2023-08-22T23:12:17Z","title":"Accel-GCN: High-Performance GPU Accelerator Design for Graph Convolution\n Networks","summary":" Graph Convolutional Networks (GCNs) are pivotal in extracting latent\ninformation from graph data across various domains, yet their acceleration on\nmainstream GPUs is challenged by workload imbalance and memory access\nirregularity. To address these challenges, we present Accel-GCN, a GPU\naccelerator architecture for GCNs. 
The design of Accel-GCN encompasses: (i) a\nlightweight degree sorting stage to group nodes with similar degree; (ii) a\nblock-level partition strategy that dynamically adjusts warp workload sizes,\nenhancing shared memory locality and workload balance, and reducing metadata\noverhead compared to designs like GNNAdvisor; (iii) a combined warp strategy\nthat improves memory coalescing and computational parallelism in the column\ndimension of dense matrices.\n Utilizing these principles, we formulated a kernel for sparse matrix\nmultiplication (SpMM) in GCNs that employs block-level partitioning and\ncombined warp strategy. This approach augments performance and multi-level\nmemory efficiency and optimizes memory bandwidth by exploiting memory\ncoalescing and alignment. Evaluation of Accel-GCN across 18 benchmark graphs\nreveals that it outperforms cuSPARSE, GNNAdvisor, and graph-BLAST by factors of\n1.17 times, 1.86 times, and 2.94 times respectively. The results underscore\nAccel-GCN as an effective solution for enhancing GCN computational efficiency.\n","authors":["Xi Xie","Hongwu Peng","Amit Hasan","Shaoyi Huang","Jiahui Zhao","Haowen Fang","Wei Zhang","Tong Geng","Omer Khan","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2308.11825v1.pdf","comment":"ICCAD 2023 accepted publication"},{"id":"http://arxiv.org/abs/2308.11822v1","updated":"2023-08-22T23:02:06Z","published":"2023-08-22T23:02:06Z","title":"PatchBackdoor: Backdoor Attack against Deep Neural Networks without\n Model Modification","summary":" Backdoor attack is a major threat to deep learning systems in safety-critical\nscenarios, which aims to trigger misbehavior of neural network models under\nattacker-controlled conditions. However, most backdoor attacks have to modify\nthe neural network models through training with poisoned data and/or direct\nmodel editing, which leads to a common but false belief that backdoor attack\ncan be easily avoided by properly protecting the model. In this paper, we show\nthat backdoor attacks can be achieved without any model modification. Instead\nof injecting backdoor logic into the training data or the model, we propose to\nplace a carefully-designed patch (namely backdoor patch) in front of the\ncamera, which is fed into the model together with the input images. The patch\ncan be trained to behave normally at most of the time, while producing wrong\nprediction when the input image contains an attacker-controlled trigger object.\nOur main techniques include an effective training method to generate the\nbackdoor patch and a digital-physical transformation modeling method to enhance\nthe feasibility of the patch in real deployments. Extensive experiments show\nthat PatchBackdoor can be applied to common deep learning models (VGG,\nMobileNet, ResNet) with an attack success rate of 93% to 99% on classification\ntasks. Moreover, we implement PatchBackdoor in real-world scenarios and show\nthat the attack is still threatening.\n","authors":["Yizhen Yuan","Rui Kong","Shenghao Xie","Yuanchun Li","Yunxin Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11822v1.pdf","comment":"accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11819v1","updated":"2023-08-22T22:43:20Z","published":"2023-08-22T22:43:20Z","title":"Mitigating Health Disparity on Biased Electronic Health Records via\n Deconfounder","summary":" The fairness issue of clinical data modeling, especially on Electronic Health\nRecords (EHRs), is of utmost importance due to EHR's complex latent structure\nand potential selection bias. 
It is frequently necessary to mitigate health\ndisparity while keeping the model's overall accuracy in practice. However,\ntraditional methods often encounter the trade-off between accuracy and\nfairness, as they fail to capture the underlying factors beyond observed data.\nTo tackle this challenge, we propose a novel model called Fair Longitudinal\nMedical Deconfounder (FLMD) that aims to achieve both fairness and accuracy in\nlongitudinal Electronic Health Records (EHR) modeling. Drawing inspiration from\nthe deconfounder theory, FLMD employs a two-stage training process. In the\nfirst stage, FLMD captures unobserved confounders for each encounter, which\neffectively represents underlying medical factors beyond observed EHR, such as\npatient genotypes and lifestyle habits. This unobserved confounder is crucial\nfor addressing the accuracy/fairness dilemma. In the second stage, FLMD\ncombines the learned latent representation with other relevant features to make\npredictions. By incorporating appropriate fairness criteria, such as\ncounterfactual fairness, FLMD ensures that it maintains high prediction\naccuracy while simultaneously minimizing health disparities. We conducted\ncomprehensive experiments on two real-world EHR datasets to demonstrate the\neffectiveness of FLMD. Apart from the comparison of baseline methods and FLMD\nvariants in terms of fairness and accuracy, we assessed the performance of all\nmodels on disturbed/imbalanced and synthetic datasets to showcase the\nsuperiority of FLMD across different settings and provide valuable insights\ninto its capabilities.\n","authors":["Zheng Liu","Xiaohan Li","Philip Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11818v1","updated":"2023-08-22T22:41:33Z","published":"2023-08-22T22:41:33Z","title":"Incorporating Nonlocal Traffic Flow Model in Physics-informed Neural\n Networks","summary":" This research contributes to the advancement of traffic state estimation\nmethods by leveraging the benefits of the nonlocal LWR model within a\nphysics-informed deep learning framework. The classical LWR model, while\nuseful, falls short of accurately representing real-world traffic flows. The\nnonlocal LWR model addresses this limitation by considering the speed as a\nweighted mean of the downstream traffic density. In this paper, we propose a\nnovel PIDL framework that incorporates the nonlocal LWR model. We introduce\nboth fixed-length and variable-length kernels and develop the required\nmathematics. The proposed PIDL framework undergoes a comprehensive evaluation,\nincluding various convolutional kernels and look-ahead windows, using data from\nthe NGSIM and CitySim datasets. The results demonstrate improvements over the\nbaseline PIDL approach using the local LWR model. The findings highlight the\npotential of the proposed approach to enhance the accuracy and reliability of\ntraffic state estimation, enabling more effective traffic management\nstrategies.\n","authors":["Archie J. Huang","Animesh Biswas","Shaurya Agarwal"],"pdf_url":"https://arxiv.org/pdf/2308.11818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11814v1","updated":"2023-08-22T22:38:54Z","published":"2023-08-22T22:38:54Z","title":"Evaluation of Deep Neural Operator Models toward Ocean Forecasting","summary":" Data-driven, deep-learning modeling frameworks have been recently developed\nfor forecasting time series data. 
Such machine learning models may be useful in\nmultiple domains including the atmospheric and oceanic ones, and in general,\nthe larger fluids community. The present work investigates the possible\neffectiveness of such deep neural operator models for reproducing and\npredicting classic fluid flows and simulations of realistic ocean dynamics. We\nfirst briefly evaluate the capabilities of such deep neural operator models\nwhen trained on a simulated two-dimensional fluid flow past a cylinder. We then\ninvestigate their application to forecasting ocean surface circulation in the\nMiddle Atlantic Bight and Massachusetts Bay, learning from high-resolution\ndata-assimilative simulations employed for real sea experiments. We confirm\nthat trained deep neural operator models are capable of predicting idealized\nperiodic eddy shedding. For realistic ocean surface flows and our preliminary\nstudy, they can predict several of the features and show some skill, providing\npotential for future research and applications.\n","authors":["Ellery Rajagopal","Anantha N. S. Babu","Tony Ryu","Patrick J. Haley Jr.","Chris Mirabito","Pierre F. J. Lermusiaux"],"pdf_url":"https://arxiv.org/pdf/2308.11814v1.pdf","comment":"Rajagopal, E., A.N.S. Babu, T. Ryu, P.J. Haley, Jr., C. Mirabito, and\n P.F.J. Lermusiaux, 2023. Evaluation of Deep Neural Operator Models toward\n Ocean Forecasting. In OCEANS' 23 IEEE/MTS Gulf Coast, 25-28 September 2023,\n in press"},{"id":"http://arxiv.org/abs/2308.06644v2","updated":"2023-08-22T22:25:54Z","published":"2023-08-12T21:25:24Z","title":"Accelerating Diffusion-based Combinatorial Optimization Solvers by\n Progressive Distillation","summary":" Graph-based diffusion models have shown promising results in terms of\ngenerating high-quality solutions to NP-complete (NPC) combinatorial\noptimization (CO) problems. However, those models are often inefficient in\ninference, due to the iterative evaluation nature of the denoising diffusion\nprocess. This paper proposes to use progressive distillation to speed up the\ninference by taking fewer steps (e.g., forecasting two steps ahead within a\nsingle step) during the denoising process. Our experimental results show that\nthe progressively distilled model can perform inference 16 times faster with\nonly 0.019% degradation in performance on the TSP-50 dataset.\n","authors":["Junwei Huang","Zhiqing Sun","Yiming Yang"],"pdf_url":"https://arxiv.org/pdf/2308.06644v2.pdf","comment":"Published at ICML 2023, Sampling and Optimization in Discrete Space\n Workshop. The implementation is at\n https://github.com/jwrh/Accelerating-Diffusion-based-Combinatorial-Optimization-Solvers-by-Progressive-Distillation"},{"id":"http://arxiv.org/abs/2308.11804v1","updated":"2023-08-22T21:57:22Z","published":"2023-08-22T21:57:22Z","title":"Ceci n'est pas une pomme: Adversarial Illusions in Multi-Modal\n Embeddings","summary":" Multi-modal encoders map images, sounds, texts, videos, etc. into a single\nembedding space, aligning representations across modalities (e.g., associate an\nimage of a dog with a barking sound). We show that multi-modal embeddings can\nbe vulnerable to an attack we call \"adversarial illusions.\" Given an input in\nany modality, an adversary can perturb it so as to make its embedding close to\nthat of an arbitrary, adversary-chosen input in another modality. 
Illusions\nthus enable the adversary to align any image with any text, any text with any\nsound, etc.\n Adversarial illusions exploit proximity in the embedding space and are thus\nagnostic to downstream tasks. Using ImageBind embeddings, we demonstrate how\nadversarially aligned inputs, generated without knowledge of specific\ndownstream tasks, mislead image generation, text generation, and zero-shot\nclassification.\n","authors":["Eugene Bagdasaryan","Vitaly Shmatikov"],"pdf_url":"https://arxiv.org/pdf/2308.11804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11801v1","updated":"2023-08-22T21:51:39Z","published":"2023-08-22T21:51:39Z","title":"Variational Density Propagation Continual Learning","summary":" Deep Neural Networks (DNNs) deployed to the real world are regularly subject\nto out-of-distribution (OoD) data, various types of noise, and shifting\nconceptual objectives. This paper proposes a framework for adapting to data\ndistribution drift modeled by benchmark Continual Learning datasets. We develop\nand evaluate a method of Continual Learning that leverages uncertainty\nquantification from Bayesian Inference to mitigate catastrophic forgetting. We\nexpand on previous approaches by removing the need for Monte Carlo sampling of\nthe model weights to sample the predictive distribution. We optimize a\nclosed-form Evidence Lower Bound (ELBO) objective approximating the predictive\ndistribution by propagating the first two moments of a distribution, i.e. mean\nand covariance, through all network layers. Catastrophic forgetting is\nmitigated by using the closed-form ELBO to approximate the Minimum Description\nLength (MDL) Principle, inherently penalizing changes in the model likelihood\nby minimizing the KL Divergence between the variational posterior for the\ncurrent task and the previous task's variational posterior acting as the prior.\nLeveraging the approximation of the MDL principle, we aim to initially learn a\nsparse variational posterior and then minimize additional model complexity\nlearned for subsequent tasks. Our approach is evaluated for the task\nincremental learning scenario using density propagated versions of\nfully-connected and convolutional neural networks across multiple sequential\nbenchmark datasets with varying task sequence lengths. Ultimately, this\nprocedure produces a minimally complex network over a series of tasks\nmitigating catastrophic forgetting.\n","authors":["Christopher Angelini","Nidhal Bouaynaya","Ghulam Rasool"],"pdf_url":"https://arxiv.org/pdf/2308.11801v1.pdf","comment":"6 pages, 13th Int'l Symposium on Image and Signal Processing and\n Analysis"},{"id":"http://arxiv.org/abs/2308.11800v1","updated":"2023-08-22T21:49:38Z","published":"2023-08-22T21:49:38Z","title":"Complex-valued neural networks for voice anti-spoofing","summary":" Current anti-spoofing and audio deepfake detection systems use either\nmagnitude spectrogram-based features (such as CQT or Melspectrograms) or raw\naudio processed through convolution or sinc-layers. Both methods have\ndrawbacks: magnitude spectrograms discard phase information, which affects\naudio naturalness, and raw-feature-based models cannot use traditional\nexplainable AI methods. This paper proposes a new approach that combines the\nbenefits of both methods by using complex-valued neural networks to process the\ncomplex-valued, CQT frequency-domain representation of the input audio. This\nmethod retains phase information and allows for explainable AI methods. 
Results\nshow that this approach outperforms previous methods on the \"In-the-Wild\"\nanti-spoofing dataset and enables interpretation of the results through\nexplainable AI. Ablation studies confirm that the model has learned to use\nphase information to detect voice spoofing.\n","authors":["Nicolas M. Müller","Philip Sperl","Konstantin Böttinger"],"pdf_url":"https://arxiv.org/pdf/2308.11800v1.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2208.05776v3","updated":"2023-08-22T21:43:21Z","published":"2022-08-10T16:04:58Z","title":"Neural Networks for Scalar Input and Functional Output","summary":" The regression of a functional response on a set of scalar predictors can be\na challenging task, especially if there is a large number of predictors, or the\nrelationship between those predictors and the response is nonlinear. In this\nwork, we propose a solution to this problem: a feed-forward neural network (NN)\ndesigned to predict a functional response using scalar inputs. First, we\ntransform the functional response to a finite-dimensional representation and\nconstruct an NN that outputs this representation. Then, we propose to modify\nthe output of an NN via the objective function and introduce different\nobjective functions for network training. The proposed models are suited for\nboth regularly and irregularly spaced data, and a roughness penalty can be\nfurther applied to control the smoothness of the predicted curve. The\ndifficulty in implementing both those features lies in the definition of\nobjective functions that can be back-propagated. In our experiments, we\ndemonstrate that our model outperforms the conventional function-on-scalar\nregression model in multiple scenarios while computationally scaling better\nwith the dimension of the predictors.\n","authors":["Sidi Wu","Cédric Beaulac","Jiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2208.05776v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17618v3","updated":"2023-08-22T21:39:22Z","published":"2023-03-30T11:26:40Z","title":"Data-driven abstractions via adaptive refinements and a Kantorovich\n metric [extended version]","summary":" We introduce an adaptive refinement procedure for smart, and scalable\nabstraction of dynamical systems. Our technique relies on partitioning the\nstate space depending on the observation of future outputs. However, this\nknowledge is dynamically constructed in an adaptive, asymmetric way. In order\nto learn the optimal structure, we define a Kantorovich-inspired metric between\nMarkov chains, and we use it as a loss function. Our technique is prone to\ndata-driven frameworks, but not restricted to.\n We also study properties of the above mentioned metric between Markov chains,\nwhich we believe could be of application for wider purpose. We propose an\nalgorithm to approximate it, and we show that our method yields a much better\ncomputational complexity than using classical linear programming techniques.\n","authors":["Adrien Banse","Licio Romao","Alessandro Abate","Raphaël M. 
Jungers"],"pdf_url":"https://arxiv.org/pdf/2303.17618v3.pdf","comment":"This paper is an extended version of a CDC2023 submission"},{"id":"http://arxiv.org/abs/2308.11792v1","updated":"2023-08-22T21:14:57Z","published":"2023-08-22T21:14:57Z","title":"Karasu: A Collaborative Approach to Efficient Cluster Configuration for\n Big Data Analytics","summary":" Selecting the right resources for big data analytics jobs is hard because of\nthe wide variety of configuration options like machine type and cluster size.\nAs poor choices can have a significant impact on resource efficiency, cost, and\nenergy usage, automated approaches are gaining popularity. Most existing\nmethods rely on profiling recurring workloads to find near-optimal solutions\nover time. Due to the cold-start problem, this often leads to lengthy and\ncostly profiling phases. However, big data analytics jobs across users can\nshare many common properties: they often operate on similar infrastructure,\nusing similar algorithms implemented in similar frameworks. The potential in\nsharing aggregated profiling runs to collaboratively address the cold start\nproblem is largely unexplored.\n We present Karasu, an approach to more efficient resource configuration\nprofiling that promotes data sharing among users working with similar\ninfrastructures, frameworks, algorithms, or datasets. Karasu trains lightweight\nperformance models using aggregated runtime information of collaborators and\ncombines them into an ensemble method to exploit inherent knowledge of the\nconfiguration search space. Moreover, Karasu allows the optimization of\nmultiple objectives simultaneously. Our evaluation is based on performance data\nfrom diverse workload executions in a public cloud environment. We show that\nKarasu is able to significantly boost existing methods in terms of performance,\nsearch time, and cost, even when few comparable profiling runs are available\nthat share only partial common characteristics with the target job.\n","authors":["Dominik Scheinert","Philipp Wiesner","Thorsten Wittkopp","Lauritz Thamsen","Jonathan Will","Odej Kao"],"pdf_url":"https://arxiv.org/pdf/2308.11792v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2306.04647v2","updated":"2023-08-22T21:06:48Z","published":"2023-06-05T01:29:24Z","title":"Compressed Sensing: A Discrete Optimization Approach","summary":" We study the Compressed Sensing (CS) problem, which is the problem of finding\nthe most sparse vector that satisfies a set of linear measurements up to some\nnumerical tolerance. CS is a central problem in Statistics, Operations Research\nand Machine Learning which arises in applications such as signal processing,\ndata compression and image reconstruction. We introduce an $\\ell_2$ regularized\nformulation of CS which we reformulate as a mixed integer second order cone\nprogram. We derive a second order cone relaxation of this problem and show that\nunder mild conditions on the regularization parameter, the resulting relaxation\nis equivalent to the well studied basis pursuit denoising problem. We present a\nsemidefinite relaxation that strengthens the second order cone relaxation and\ndevelop a custom branch-and-bound algorithm that leverages our second order\ncone relaxation to solve instances of CS to certifiable optimality. Our\nnumerical results show that our approach produces solutions that are on average\n$6.22\\%$ more sparse than solutions returned by state of the art benchmark\nmethods on synthetic data in minutes. 
On real world ECG data, for a given\n$\\ell_2$ reconstruction error our approach produces solutions that are on\naverage $9.95\\%$ more sparse than benchmark methods, while for a given sparsity\nlevel our approach produces solutions that have on average $10.77\\%$ lower\nreconstruction error than benchmark methods in minutes.\n","authors":["Dimitris Bertsimas","Nicholas A. G. Johnson"],"pdf_url":"https://arxiv.org/pdf/2306.04647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11787v1","updated":"2023-08-22T20:59:21Z","published":"2023-08-22T20:59:21Z","title":"HypBO: Expert-Guided Chemist-in-the-Loop Bayesian Search for New\n Materials","summary":" Robotics and automation offer massive accelerations for solving intractable,\nmultivariate scientific problems such as materials discovery, but the available\nsearch spaces can be dauntingly large. Bayesian optimization (BO) has emerged\nas a popular sample-efficient optimization engine, thriving in tasks where no\nanalytic form of the target function/property is known. Here we exploit expert\nhuman knowledge in the form of hypotheses to direct Bayesian searches more\nquickly to promising regions of chemical space. Previous methods have used\nunderlying distributions derived from existing experimental measurements, which\nis unfeasible for new, unexplored scientific tasks. Also, such distributions\ncannot capture intricate hypotheses. Our proposed method, which we call HypBO,\nuses expert human hypotheses to generate an improved seed of samples.\nUnpromising seeds are automatically discounted, while promising seeds are used\nto augment the surrogate model data, thus achieving better-informed sampling.\nThis process continues in a global versus local search fashion, organized in a\nbilevel optimization framework. We validate the performance of our method on a\nrange of synthetic functions and demonstrate its practical utility on a real\nchemical design task where the use of expert hypotheses accelerates the search\nperformance significantly.\n","authors":["Abdoulatif Cisse","Xenophon Evangelopoulos","Sam Carruthers","Vladimir V. Gusev","Andrew I. Cooper"],"pdf_url":"https://arxiv.org/pdf/2308.11787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11783v1","updated":"2023-08-22T20:43:31Z","published":"2023-08-22T20:43:31Z","title":"Coarse-to-Fine Multi-Scene Pose Regression with Transformers","summary":" Absolute camera pose regressors estimate the position and orientation of a\ncamera given the captured image alone. Typically, a convolutional backbone with\na multi-layer perceptron (MLP) head is trained using images and pose labels to\nembed a single reference scene at a time. Recently, this scheme was extended to\nlearn multiple scenes by replacing the MLP head with a set of fully connected\nlayers. In this work, we propose to learn multi-scene absolute camera pose\nregression with Transformers, where encoders are used to aggregate activation\nmaps with self-attention and decoders transform latent features and scenes\nencoding into pose predictions. This allows our model to focus on general\nfeatures that are informative for localization, while embedding multiple scenes\nin parallel. We extend our previous MS-Transformer approach\n\\cite{shavit2021learning} by introducing a mixed classification-regression\narchitecture that improves the localization accuracy. 
Our method is evaluated\non commonly benchmark indoor and outdoor datasets and has been shown to exceed\nboth multi-scene and state-of-the-art single-scene absolute pose regressors.\n","authors":["Yoli Shavit","Ron Ferens","Yosi Keller"],"pdf_url":"https://arxiv.org/pdf/2308.11783v1.pdf","comment":"Accepted to IEEE Transactions on Pattern Analysis and Machine\n Intelligence (TPAMI). arXiv admin note: substantial text overlap with\n arXiv:2103.11468"},{"id":"http://arxiv.org/abs/2308.11781v1","updated":"2023-08-22T20:40:31Z","published":"2023-08-22T20:40:31Z","title":"Addressing Dynamic and Sparse Qualitative Data: A Hilbert Space\n Embedding of Categorical Variables","summary":" We propose a novel framework for incorporating qualitative data into\nquantitative models for causal estimation. Previous methods use categorical\nvariables derived from qualitative data to build quantitative models. However,\nthis approach can lead to data-sparse categories and yield inconsistent\n(asymptotically biased) and imprecise (finite sample biased) estimates if the\nqualitative information is dynamic and intricate. We use functional analysis to\ncreate a more nuanced and flexible framework. We embed the observed categories\ninto a latent Baire space and introduce a continuous linear map -- a Hilbert\nspace embedding -- from the Baire space of categories to a Reproducing Kernel\nHilbert Space (RKHS) of representation functions. Through the Riesz\nrepresentation theorem, we establish that the canonical treatment of\ncategorical variables in causal models can be transformed into an identified\nstructure in the RKHS. Transfer learning acts as a catalyst to streamline\nestimation -- embeddings from traditional models are paired with the kernel\ntrick to form the Hilbert space embedding. We validate our model through\ncomprehensive simulation evidence and demonstrate its relevance in a real-world\nstudy that contrasts theoretical predictions from economics and psychology in\nan e-commerce marketplace. The results confirm the superior performance of our\nmodel, particularly in scenarios where qualitative information is nuanced and\ncomplex.\n","authors":["Anirban Mukherjee","Hannah H. Chang"],"pdf_url":"https://arxiv.org/pdf/2308.11781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11780v1","updated":"2023-08-22T20:40:21Z","published":"2023-08-22T20:40:21Z","title":"Few-shot Anomaly Detection in Text with Deviation Learning","summary":" Most current methods for detecting anomalies in text concentrate on\nconstructing models solely relying on unlabeled data. These models operate on\nthe presumption that no labeled anomalous examples are available, which\nprevents them from utilizing prior knowledge of anomalies that are typically\npresent in small numbers in many real-world applications. Furthermore, these\nmodels prioritize learning feature embeddings rather than optimizing anomaly\nscores directly, which could lead to suboptimal anomaly scoring and inefficient\nuse of data during the learning process. In this paper, we introduce FATE, a\ndeep few-shot learning-based framework that leverages limited anomaly examples\nand learns anomaly scores explicitly in an end-to-end method using deviation\nlearning. 
In this approach, the anomaly scores of normal examples are adjusted\nto closely resemble reference scores obtained from a prior distribution.\nConversely, anomaly samples are forced to have anomalous scores that\nconsiderably deviate from the reference score in the upper tail of the prior.\nAdditionally, our model is optimized to learn the distinct behavior of\nanomalies by utilizing a multi-head self-attention layer and multiple instance\nlearning approaches. Comprehensive experiments on several benchmark datasets\ndemonstrate that our proposed approach attains a new level of state-of-the-art\nperformance.\n","authors":["Anindya Sundar Das","Aravind Ajay","Sriparna Saha","Monowar Bhuyan"],"pdf_url":"https://arxiv.org/pdf/2308.11780v1.pdf","comment":"Accepted in ICONIP 2023"},{"id":"http://arxiv.org/abs/2306.14574v2","updated":"2023-08-22T20:38:47Z","published":"2023-06-26T10:35:31Z","title":"U-TOE: Universal TinyML On-board Evaluation Toolkit for Low-Power IoT","summary":" Results from the TinyML community demonstrate that, it is possible to execute\nmachine learning models directly on the terminals themselves, even if these are\nsmall microcontroller-based devices. However, to date, practitioners in the\ndomain lack convenient all-in-one toolkits to help them evaluate the\nfeasibility of executing arbitrary models on arbitrary low-power IoT hardware.\nTo this effect, we present in this paper U-TOE, a universal toolkit we designed\nto facilitate the task of IoT designers and researchers, by combining\nfunctionalities from a low-power embedded OS, a generic model transpiler and\ncompiler, an integrated performance measurement module, and an open-access\nremote IoT testbed. We provide an open source implementation of U-TOE and we\ndemonstrate its use to experimentally evaluate the performance of various\nmodels, on a wide variety of low-power IoT boards, based on popular\nmicrocontroller architectures. U-TOE allows easily reproducible and\ncustomizable comparative evaluation experiments on a wide variety of IoT\nhardware all-at-once. The availability of a toolkit such as U-TOE is desirable\nto accelerate research combining Artificial Intelligence and IoT towards fully\nexploiting the potential of edge computing.\n","authors":["Zhaolan Huang","Koen Zandberg","Kaspar Schleiser","Emmanuel Baccelli"],"pdf_url":"https://arxiv.org/pdf/2306.14574v2.pdf","comment":"to be published in the proceedings of IFIP/IEEE PEMWN 2023"},{"id":"http://arxiv.org/abs/2308.11778v1","updated":"2023-08-22T20:36:16Z","published":"2023-08-22T20:36:16Z","title":"Understanding Hessian Alignment for Domain Generalization","summary":" Out-of-distribution (OOD) generalization is a critical ability for deep\nlearning models in many real-world scenarios including healthcare and\nautonomous vehicles. Recently, different techniques have been proposed to\nimprove OOD generalization. Among these methods, gradient-based regularizers\nhave shown promising performance compared with other competitors. Despite this\nsuccess, our understanding of the role of Hessian and gradient alignment in\ndomain generalization is still limited. To address this shortcoming, we analyze\nthe role of the classifier's head Hessian matrix and gradient in domain\ngeneralization using recent OOD theory of transferability. Theoretically, we\nshow that spectral norm between the classifier's head Hessian matrices across\ndomains is an upper bound of the transfer measure, a notion of distance between\ntarget and source domains. 
Furthermore, we analyze all the attributes that get\naligned when we encourage similarity between Hessians and gradients. Our\nanalysis explains the success of many regularizers like CORAL, IRM, V-REx,\nFish, IGA, and Fishr as they regularize part of the classifier's head Hessian\nand/or gradient. Finally, we propose two simple yet effective methods to match\nthe classifier's head Hessians and gradients in an efficient way, based on the\nHessian Gradient Product (HGP) and Hutchinson's method (Hutchinson), and\nwithout directly calculating Hessians. We validate the OOD generalization\nability of proposed methods in different scenarios, including transferability,\nsevere correlation shift, label shift and diversity shift. Our results show\nthat Hessian alignment methods achieve promising performance on various OOD\nbenchmarks. The code is available at\n\\url{https://github.com/huawei-noah/Federated-Learning/tree/main/HessianAlignment}.\n","authors":["Sobhan Hemati","Guojun Zhang","Amir Estiri","Xi Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11778v1.pdf","comment":"ICCV 2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2210.11549v3","updated":"2023-08-22T16:15:26Z","published":"2022-10-20T19:31:23Z","title":"H4VDM: H.264 Video Device Matching","summary":" Methods that can determine if two given video sequences are captured by the\nsame device (e.g., mobile telephone or digital camera) can be used in many\nforensics tasks. In this paper we refer to this as \"video device matching\". In\nopen-set video forensics scenarios it is easier to determine if two video\nsequences were captured with the same device than identifying the specific\ndevice. In this paper, we propose a technique for open-set video device\nmatching. Given two H.264 compressed video sequences, our method can determine\nif they are captured by the same device, even if our method has never\nencountered the device in training. We denote our proposed technique as H.264\nVideo Device Matching (H4VDM). H4VDM uses H.264 compression information\nextracted from video sequences to make decisions. It is more robust against\nartifacts that alter camera sensor fingerprints, and it can be used to analyze\nrelatively small fragments of the H.264 sequence. We trained and tested our\nmethod on a publicly available video forensics dataset consisting of 35\ndevices, where our proposed method demonstrated good performance.\n","authors":["Ziyue Xiang","Paolo Bestagini","Stefano Tubaro","Edward J. Delp"],"pdf_url":"https://arxiv.org/pdf/2210.11549v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11351v1","updated":"2023-08-22T11:00:09Z","published":"2023-08-22T11:00:09Z","title":"M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product\n Summarization in E-commerce","summary":" Given the long textual product information and the product image, Multi-Modal\nProduct Summarization (MMPS) aims to attract customers' interest and increase\ntheir desire to purchase by highlighting product characteristics with a short\ntextual summary. Existing MMPS methods have achieved promising performance.\nNevertheless, there still exist several problems: 1) lack end-to-end product\nsummarization, 2) lack multi-grained multi-modal modeling, and 3) lack\nmulti-modal attribute modeling. To address these issues, we propose an\nend-to-end multi-grained multi-modal attribute-aware product summarization\nmethod (M3PS) for generating high-quality product summaries in e-commerce. M3PS\njointly models product attributes and generates product summaries. 
Meanwhile,\nwe design several multi-grained multi-modal tasks to better guide the\nmulti-modal learning of M3PS. Furthermore, we model product attributes based on\nboth text and image modalities so that multi-modal product characteristics can\nbe manifested in the generated summaries. Extensive experiments on a real\nlarge-scale Chinese e-commence dataset demonstrate that our model outperforms\nstate-of-the-art product summarization methods w.r.t. several summarization\nmetrics.\n","authors":["Tao Chen","Ze Lin","Hui Li","Jiayi Ji","Yiyi Zhou","Guanbin Li","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11276v1","updated":"2023-08-22T08:43:33Z","published":"2023-08-22T08:43:33Z","title":"Music Understanding LLaMA: Advancing Text-to-Music Generation with\n Question Answering and Captioning","summary":" Text-to-music generation (T2M-Gen) faces a major obstacle due to the scarcity\nof large-scale publicly available music datasets with natural language\ncaptions. To address this, we propose the Music Understanding LLaMA (MU-LLaMA),\ncapable of answering music-related questions and generating captions for music\nfiles. Our model utilizes audio representations from a pretrained MERT model to\nextract music features. However, obtaining a suitable dataset for training the\nMU-LLaMA model remains challenging, as existing publicly accessible audio\nquestion answering datasets lack the necessary depth for open-ended music\nquestion answering. To fill this gap, we present a methodology for generating\nquestion-answer pairs from existing audio captioning datasets and introduce the\nMusicQA Dataset designed for answering open-ended music-related questions. The\nexperiments demonstrate that the proposed MU-LLaMA model, trained on our\ndesigned MusicQA dataset, achieves outstanding performance in both music\nquestion answering and music caption generation across various metrics,\noutperforming current state-of-the-art (SOTA) models in both fields and\noffering a promising advancement in the T2M-Gen research field.\n","authors":["Shansong Liu","Atin Sakkeer Hussain","Chenshuo Sun","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2308.11276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11175v1","updated":"2023-08-22T04:06:56Z","published":"2023-08-22T04:06:56Z","title":"MISSRec: Pre-training and Transferring Multi-modal Interest-aware\n Sequence Representation for Recommendation","summary":" The goal of sequential recommendation (SR) is to predict a user's potential\ninterested items based on her/his historical interaction sequences. Most\nexisting sequential recommenders are developed based on ID features, which,\ndespite their widespread use, often underperform with sparse IDs and struggle\nwith the cold-start problem. Besides, inconsistent ID mappings hinder the\nmodel's transferability, isolating similar recommendation domains that could\nhave been co-optimized. This paper aims to address these issues by exploring\nthe potential of multi-modal information in learning robust and generalizable\nsequence representations. We propose MISSRec, a multi-modal pre-training and\ntransfer learning framework for SR. On the user side, we design a\nTransformer-based encoder-decoder model, where the contextual encoder learns to\ncapture the sequence-level multi-modal synergy while a novel interest-aware\ndecoder is developed to grasp item-modality-interest relations for better\nsequence representation. 
On the candidate item side, we adopt a dynamic fusion\nmodule to produce user-adaptive item representation, providing more precise\nmatching between users and items. We pre-train the model with contrastive\nlearning objectives and fine-tune it in an efficient manner. Extensive\nexperiments demonstrate the effectiveness and flexibility of MISSRec, promising\nan practical solution for real-world recommendation scenarios.\n","authors":["Jinpeng Wang","Ziyun Zeng","Yunxiao Wang","Yuting Wang","Xingyu Lu","Tianxiang Li","Jun Yuan","Rui Zhang","Hai-Tao Zheng","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2308.11175v1.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.10195v2","updated":"2023-08-22T02:55:39Z","published":"2023-08-20T07:56:34Z","title":"WMFormer++: Nested Transformer for Visible Watermark Removal via Implict\n Joint Learning","summary":" Watermarking serves as a widely adopted approach to safeguard media\ncopyright. In parallel, the research focus has extended to watermark removal\ntechniques, offering an adversarial means to enhance watermark robustness and\nfoster advancements in the watermarking field. Existing watermark removal\nmethods mainly rely on UNet with task-specific decoder branches--one for\nwatermark localization and the other for background image restoration. However,\nwatermark localization and background restoration are not isolated tasks;\nprecise watermark localization inherently implies regions necessitating\nrestoration, and the background restoration process contributes to more\naccurate watermark localization. To holistically integrate information from\nboth branches, we introduce an implicit joint learning paradigm. This empowers\nthe network to autonomously navigate the flow of information between implicit\nbranches through a gate mechanism. Furthermore, we employ cross-channel\nattention to facilitate local detail restoration and holistic structural\ncomprehension, while harnessing nested structures to integrate multi-scale\ninformation. Extensive experiments are conducted on various challenging\nbenchmarks to validate the effectiveness of our proposed method. The results\ndemonstrate our approach's remarkable superiority, surpassing existing\nstate-of-the-art methods by a large margin.\n","authors":["Dongjian Huo","Zehong Zhang","Hanjing Su","Guanbin Li","Chaowei Fang","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10195v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11123v1","updated":"2023-08-22T02:06:27Z","published":"2023-08-22T02:06:27Z","title":"Hey That's Mine Imperceptible Watermarks are Preserved in Diffusion\n Generated Outputs","summary":" Generative models have seen an explosion in popularity with the release of\nhuge generative Diffusion models like Midjourney and Stable Diffusion to the\npublic. Because of this new ease of access, questions surrounding the automated\ncollection of data and issues regarding content ownership have started to\nbuild. In this paper we present new work which aims to provide ways of\nprotecting content when shared to the public. We show that a generative\nDiffusion model trained on data that has been imperceptibly watermarked will\ngenerate new images with these watermarks present. We further show that if a\ngiven watermark is correlated with a certain feature of the training data, the\ngenerated images will also have this correlation. Using statistical tests we\nshow that we are able to determine whether a model has been trained on marked\ndata, and what data was marked. 
As a result our system offers a solution to\nprotect intellectual property when sharing content online.\n","authors":["Luke Ditria","Tom Drummond"],"pdf_url":"https://arxiv.org/pdf/2308.11123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11797v1","updated":"2023-08-22T21:29:55Z","published":"2023-08-22T21:29:55Z","title":"CLIP Multi-modal Hashing: A new baseline CLIPMH","summary":" The multi-modal hashing method is widely used in multimedia retrieval. It can\nfuse multi-source data to generate binary hash code. However, the current\nmulti-modal methods have the problem of low retrieval accuracy. The reason is\nthat the individual backbone networks have limited feature expression\ncapabilities and are not jointly pre-trained on large-scale unsupervised\nmulti-modal data. To solve this problem, we propose a new baseline CLIP\nMulti-modal Hashing (CLIPMH) method. It uses CLIP model to extract text and\nimage features, and then fuse to generate hash code. CLIP improves the\nexpressiveness of each modal feature. In this way, it can greatly improve the\nretrieval performance of multi-modal hashing methods. In comparison to\nstate-of-the-art unsupervised and supervised multi-modal hashing methods,\nexperiments reveal that the proposed CLIPMH can significantly enhance\nperformance (Maximum increase of 8.38%). CLIP also has great advantages over\nthe text and visual backbone networks commonly used before.\n","authors":["Jian Zhu","Mingkai Sheng","Mingda Ke","Zhangmin Huang","Jingfei Chang"],"pdf_url":"https://arxiv.org/pdf/2308.11797v1.pdf","comment":"submit to ICASSP2024"},{"id":"http://arxiv.org/abs/2308.11681v1","updated":"2023-08-22T14:58:36Z","published":"2023-08-22T14:58:36Z","title":"VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video\n Anomaly Detection","summary":" The recent contrastive language-image pre-training (CLIP) model has shown\ngreat success in a wide range of image-level tasks, revealing remarkable\nability for learning powerful visual representations with rich semantics. An\nopen and worthwhile problem is efficiently adapting such a strong model to the\nvideo domain and designing a robust video anomaly detector. In this work, we\npropose VadCLIP, a new paradigm for weakly supervised video anomaly detection\n(WSVAD) by leveraging the frozen CLIP model directly without any pre-training\nand fine-tuning process. Unlike current works that directly feed extracted\nfeatures into the weakly supervised classifier for frame-level binary\nclassification, VadCLIP makes full use of fine-grained associations between\nvision and language on the strength of CLIP and involves dual branch. One\nbranch simply utilizes visual features for coarse-grained binary\nclassification, while the other fully leverages the fine-grained language-image\nalignment. With the benefit of dual branch, VadCLIP achieves both\ncoarse-grained and fine-grained video anomaly detection by transferring\npre-trained knowledge from CLIP to WSVAD task. We conduct extensive experiments\non two commonly-used benchmarks, demonstrating that VadCLIP achieves the best\nperformance on both coarse-grained and fine-grained WSVAD, surpassing the\nstate-of-the-art methods by a large margin. Specifically, VadCLIP achieves\n84.51% AP and 88.02% AUC on XD-Violence and UCF-Crime, respectively. 
Code and\nfeatures will be released to facilitate future VAD research.\n","authors":["Peng Wu","Xuerong Zhou","Guansong Pang","Lingru Zhou","Qingsen Yan","Peng Wang","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11681v1.pdf","comment":"Submitted"}]},"2023-08-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2308.12284v1","updated":"2023-08-23T17:58:14Z","published":"2023-08-23T17:58:14Z","title":"D4: Improving LLM Pretraining via Document De-Duplication and\n Diversification","summary":" Over recent years, an increasing amount of compute and data has been poured\ninto training large language models (LLMs), usually by doing one-pass learning\non as many tokens as possible randomly selected from large-scale web corpora.\nWhile training on ever-larger portions of the internet leads to consistent\nperformance improvements, the size of these improvements diminishes with scale,\nand there has been little work exploring the effect of data selection on\npre-training and downstream performance beyond simple de-duplication methods\nsuch as MinHash. Here, we show that careful data selection (on top of\nde-duplicated data) via pre-trained model embeddings can speed up training (20%\nefficiency gains) and improves average downstream accuracy on 16 NLP tasks (up\nto 2%) at the 6.7B model scale. Furthermore, we show that repeating data\nintelligently consistently outperforms baseline training (while repeating\nrandom data performs worse than baseline training). Our results indicate that\nclever data selection can significantly improve LLM pre-training, calls into\nquestion the common practice of training for a single epoch on as much data as\npossible, and demonstrates a path to keep improving our models past the limits\nof randomly sampling web data.\n","authors":["Kushal Tirumala","Daniel Simig","Armen Aghajanyan","Ari S. Morcos"],"pdf_url":"https://arxiv.org/pdf/2308.12284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12272v1","updated":"2023-08-23T17:40:35Z","published":"2023-08-23T17:40:35Z","title":"Simple is Better and Large is Not Enough: Towards Ensembling of\n Foundational Language Models","summary":" Foundational Language Models (FLMs) have advanced natural language processing\n(NLP) research. Current researchers are developing larger FLMs (e.g., XLNet,\nT5) to enable contextualized language representation, classification, and\ngeneration. While developing larger FLMs has been of significant advantage, it\nis also a liability concerning hallucination and predictive uncertainty.\nFundamentally, larger FLMs are built on the same foundations as smaller FLMs\n(e.g., BERT); hence, one must recognize the potential of smaller FLMs which can\nbe realized through an ensemble. In the current research, we perform a reality\ncheck on FLMs and their ensemble on benchmark and real-world datasets. We\nhypothesize that the ensembling of FLMs can influence the individualistic\nattention of FLMs and unravel the strength of coordination and cooperation of\ndifferent FLMs. We utilize BERT and define three other ensemble techniques:\n{Shallow, Semi, and Deep}, wherein the Deep-Ensemble introduces a\nknowledge-guided reinforcement learning approach. We discovered that the\nsuggested Deep-Ensemble BERT outperforms its large variation i.e. 
BERTlarge, by\na factor of many times using datasets that show the usefulness of NLP in\nsensitive fields, such as mental health.\n","authors":["Nancy Tyagi","Aidin Shiri","Surjodeep Sarkar","Abhishek Kumar Umrawal","Manas Gaur"],"pdf_url":"https://arxiv.org/pdf/2308.12272v1.pdf","comment":"Accepted at the 10th Mid-Atlantic Student Colloquium on Speech,\n Language and Learning (MASC-SLL 2023)"},{"id":"http://arxiv.org/abs/2308.11601v2","updated":"2023-08-23T17:34:17Z","published":"2023-08-22T17:48:24Z","title":"Tryage: Real-time, intelligent Routing of User Prompts to Large Language\n Models","summary":" The introduction of the transformer architecture and the self-attention\nmechanism has led to an explosive production of language models trained on\nspecific downstream tasks and data domains. With over 200, 000 models in the\nHugging Face ecosystem, users grapple with selecting and optimizing models to\nsuit multifaceted workflows and data domains while addressing computational,\nsecurity, and recency concerns. There is an urgent need for machine learning\nframeworks that can eliminate the burden of model selection and customization\nand unleash the incredible power of the vast emerging model library for end\nusers. Here, we propose a context-aware routing system, Tryage, that leverages\na language model router for optimal selection of expert models from a model\nlibrary based on analysis of individual input prompts. Inspired by the thalamic\nrouter in the brain, Tryage employs a perceptive router to predict down-stream\nmodel performance on prompts and, then, makes a routing decision using an\nobjective function that integrates performance predictions with user goals and\nconstraints that are incorporated through flags (e.g., model size, model\nrecency). Tryage allows users to explore a Pareto front and automatically\ntrade-off between task accuracy and secondary goals including minimization of\nmodel size, recency, security, verbosity, and readability. Across heterogeneous\ndata sets that include code, text, clinical data, and patents, the Tryage\nframework surpasses Gorilla and GPT3.5 turbo in dynamic model selection\nidentifying the optimal model with an accuracy of 50.9% , compared to 23.6% by\nGPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how\nrouting models can be applied to program and control the behavior of\nmulti-model LLM systems to maximize efficient use of the expanding and evolving\nlanguage model ecosystem.\n","authors":["Surya Narayanan Hari","Matt Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.11601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12261v1","updated":"2023-08-23T17:28:21Z","published":"2023-08-23T17:28:21Z","title":"Prompt2Model: Generating Deployable Models from Natural Language\n Instructions","summary":" Large language models (LLMs) enable system builders today to create competent\nNLP systems through prompting, where they only need to describe the task in\nnatural language and provide a few examples. However, in other ways, LLMs are a\nstep backward from traditional special-purpose NLP models; they require\nextensive computational resources for deployment and can be gated behind APIs.\nIn this paper, we propose Prompt2Model, a general-purpose method that takes a\nnatural language task description like the prompts provided to LLMs, and uses\nit to train a special-purpose model that is conducive to deployment. 
This is\ndone through a multi-step process of retrieval of existing datasets and\npretrained models, dataset generation using LLMs, and supervised fine-tuning on\nthese retrieved and generated datasets. Over three tasks, we demonstrate that\ngiven the same few-shot prompt as input, Prompt2Model trains models that\noutperform the results of a strong LLM, gpt-3.5-turbo, by an average of 20%\nwhile being up to 700 times smaller. We also show that this data can be used to\nobtain reliable performance estimates of model performance, enabling model\ndevelopers to assess model reliability before deployment. Prompt2Model is\navailable open-source at https://github.com/neulab/prompt2model.\n","authors":["Vijay Viswanathan","Chenyang Zhao","Amanda Bertsch","Tongshuang Wu","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2308.12261v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2308.10261v2","updated":"2023-08-23T16:49:33Z","published":"2023-08-20T13:15:18Z","title":"How Good Are Large Language Models at Out-of-Distribution Detection?","summary":" Out-of-distribution (OOD) detection plays a vital role in enhancing the\nreliability of machine learning (ML) models. The emergence of large language\nmodels (LLMs) has catalyzed a paradigm shift within the ML community,\nshowcasing their exceptional capabilities across diverse natural language\nprocessing tasks. While existing research has probed OOD detection with\nrelative small-scale Transformers like BERT, RoBERTa and GPT-2, the stark\ndifferences in scales, pre-training objectives, and inference paradigms call\ninto question the applicability of these findings to LLMs. This paper embarks\non a pioneering empirical investigation of OOD detection in the domain of LLMs,\nfocusing on LLaMA series ranging from 7B to 65B in size. We thoroughly evaluate\ncommonly-used OOD detectors, scrutinizing their performance in both zero-grad\nand fine-tuning scenarios. Notably, we alter previous discriminative\nin-distribution fine-tuning into generative fine-tuning, aligning the\npre-training objective of LLMs with downstream tasks. Our findings unveil that\na simple cosine distance OOD detector demonstrates superior efficacy,\noutperforming other OOD detectors. We provide an intriguing explanation for\nthis phenomenon by highlighting the isotropic nature of the embedding spaces of\nLLMs, which distinctly contrasts with the anisotropic property observed in\nsmaller BERT family models. The new insight enhances our understanding of how\nLLMs detect OOD data, thereby enhancing their adaptability and reliability in\ndynamic environments.\n","authors":["Bo Liu","Liming Zhan","Zexin Lu","Yujie Feng","Lei Xue","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.10261v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2308.12247v1","updated":"2023-08-23T16:48:04Z","published":"2023-08-23T16:48:04Z","title":"How to Protect Copyright Data in Optimization of Large Language Models?","summary":" Large language models (LLMs) and generative AI have played a transformative\nrole in computer research and applications. Controversy has arisen as to\nwhether these models output copyrighted data, which can occur if the data the\nmodels are trained on is copyrighted. LLMs are built on the transformer neural\nnetwork architecture, which in turn relies on a mathematical computation called\nAttention that uses the softmax function.\n In this paper, we show that large language model training and optimization\ncan be seen as a softmax regression problem. 
We then establish a method of\nefficiently performing softmax regression, in a way that prevents the\nregression function from generating copyright data. This establishes a\ntheoretical method of training large language models in a way that avoids\ngenerating copyright data.\n","authors":["Timothy Chu","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08986v2","updated":"2023-08-23T16:31:34Z","published":"2022-12-18T01:57:30Z","title":"Low-Resource Authorship Style Transfer: Can Non-Famous Authors Be\n Imitated?","summary":" Authorship style transfer involves altering text to match the style of a\ntarget author whilst preserving the original meaning. Existing unsupervised\napproaches like STRAP have largely focused on style transfer to target authors\nwith many examples of their writing style in books, speeches, or other\npublished works. This high-resource training data requirement (often greater\nthan 100,000 words) makes these approaches primarily useful for style transfer\nto published authors, politicians, or other well-known figures and authorship\nstyles, while style transfer to non-famous authors has not been well-studied.\nWe introduce the \\textit{low-resource authorship style transfer} task, a more\nchallenging class of authorship style transfer where only a limited amount of\ntext in the target author's style may exist. In our experiments, we\nspecifically choose source and target authors from Reddit and style transfer\ntheir Reddit posts, limiting ourselves to just 16 posts (on average ~500 words)\nof the target author's style. Style transfer accuracy is typically measured by\nhow often a classifier or human judge will classify an output as written by the\ntarget author. Recent authorship representations models excel at authorship\nidentification even with just a few writing samples, making automatic\nevaluation of this task possible for the first time through evaluation metrics\nwe propose. Our results establish an in-context learning technique we develop\nas the strongest baseline, though we find current approaches do not yet achieve\nmastery of this challenging task. We release our data and implementations to\nencourage further investigation.\n","authors":["Ajay Patel","Nicholas Andrews","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2212.08986v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12219v1","updated":"2023-08-23T16:01:12Z","published":"2023-08-23T16:01:12Z","title":"Diffusion Language Models Can Perform Many Tasks with Scaling and\n Instruction-Finetuning","summary":" The recent surge of generative AI has been fueled by the generative power of\ndiffusion probabilistic models and the scalable capabilities of large language\nmodels. Despite their potential, it remains elusive whether diffusion language\nmodels can solve general language tasks comparable to their autoregressive\ncounterparts. This paper demonstrates that scaling diffusion models w.r.t.\ndata, sizes, and tasks can effectively make them strong language learners. We\nbuild competent diffusion language models at scale by first acquiring knowledge\nfrom massive data via masked language modeling pretraining thanks to their\nintrinsic connections. We then reprogram pretrained masked language models into\ndiffusion language models via diffusive adaptation, wherein task-specific\nfinetuning and instruction finetuning are explored to unlock their versatility\nin solving general language tasks. 
Experiments show that scaling diffusion\nlanguage models consistently improves performance across downstream language\ntasks. We further discover that instruction finetuning can elicit zero-shot and\nfew-shot in-context learning abilities that help tackle many unseen tasks by\nfollowing natural language instructions, and show promise in advanced and\nchallenging abilities such as reasoning\n","authors":["Jiasheng Ye","Zaixiang Zheng","Yu Bao","Lihua Qian","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.12219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12215v1","updated":"2023-08-23T15:52:20Z","published":"2023-08-23T15:52:20Z","title":"The Challenges of Machine Learning for Trust and Safety: A Case Study on\n Misinformation Detection","summary":" We examine the disconnect between scholarship and practice in applying\nmachine learning to trust and safety problems, using misinformation detection\nas a case study. We systematize literature on automated detection of\nmisinformation across a corpus of 270 well-cited papers in the field. We then\nexamine subsets of papers for data and code availability, design missteps,\nreproducibility, and generalizability. We find significant shortcomings in the\nliterature that call into question claimed performance and practicality.\nDetection tasks are often meaningfully distinct from the challenges that online\nservices actually face. Datasets and model evaluation are often\nnon-representative of real-world contexts, and evaluation frequently is not\nindependent of model training. Data and code availability is poor. Models do\nnot generalize well to out-of-domain data. Based on these results, we offer\nrecommendations for evaluating machine learning applications to trust and\nsafety problems. Our aim is for future work to avoid the pitfalls that we\nidentify.\n","authors":["Madelyne Xiao","Jonathan Mayer"],"pdf_url":"https://arxiv.org/pdf/2308.12215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12202v1","updated":"2023-08-23T15:39:42Z","published":"2023-08-23T15:39:42Z","title":"Curriculum Learning with Adam: The Devil Is in the Wrong Details","summary":" Curriculum learning (CL) posits that machine learning models -- similar to\nhumans -- may learn more efficiently from data that match their current\nlearning progress. However, CL methods are still poorly understood and, in\nparticular for natural language processing (NLP), have achieved only limited\nsuccess. In this paper, we explore why. Starting from an attempt to replicate\nand extend a number of recent curriculum methods, we find that their results\nare surprisingly brittle when applied to NLP. A deep dive into the\n(in)effectiveness of the curricula in some scenarios shows us why: when\ncurricula are employed in combination with the popular Adam optimisation\nalgorithm, they oftentimes learn to adapt to suboptimally chosen optimisation\nparameters for this algorithm. We present a number of different case studies\nwith different common hand-crafted and automated CL approaches to illustrate\nthis phenomenon, and we find that none of them outperforms optimisation with\nonly Adam with well-chosen hyperparameters. 
As such, our results contribute to\nunderstanding why CL methods work, but at the same time urge caution when\nclaiming positive results.\n","authors":["Lucas Weber","Jaap Jumelet","Paul Michel","Elia Bruni","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2308.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11534v2","updated":"2023-08-23T14:33:53Z","published":"2023-08-21T06:51:56Z","title":"Large Language Model as a User Simulator","summary":" The unparalleled performance of closed-sourced ChatGPT has sparked efforts\ntowards its democratization, with notable strides made by leveraging real user\nand ChatGPT conversations, as evidenced by Vicuna. However, while current\nendeavors like Baize and UltraChat aim to auto-generate conversational data due\nto challenges in gathering human participation, they primarily rely on ChatGPT\nto simulate human behaviors based on directives rather than genuine human\nlearning. This results in a limited scope, diminished diversity, and an absence\nof genuine multi-round conversational dynamics. To address the above issues, we\ninnovatively target human questions extracted from genuine human-machine\nconversations as a learning goal and train a user simulator, UserGPT, to\nproduce a high-quality human-centric synthetic conversation dataset, RealChat.\nSubsequently, this dataset trains our assistant model, ReaLM. Experimentally,\nReaLM outpaces baseline models in both Vicuna-Bench and MT-Bench by pairwise\ncomparison when considering equivalent training set sizes, and manual\nevaluation also shows that our model is highly competitive. Impressively, when\nfine-tuned with the latest LLaMA 2 model, ReaLM secured a leading score of 6.33\nin the MT-Bench, outshining the contemporary same-scale models, including the\nLLaMA-2-7B-chat model. Further in-depth analysis demonstrates the scalability\nand transferability of our approach. A preliminary exploration into the\ninterplay between training set data quality and resultant model performance is\nalso undertaken, laying a robust groundwork for future investigations. The code\nis available at https://github.com/FreedomIntelligence/ReaLM.\n","authors":["Chuyi Kong","Yaxin Fan","Xiang Wan","Feng Jiang","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02206v2","updated":"2023-08-23T14:23:48Z","published":"2023-03-03T20:35:38Z","title":"Domain Specific Question Answering Over Knowledge Graphs Using Logical\n Programming and Large Language Models","summary":" Answering questions over domain-specific graphs requires a tailored approach\ndue to the limited number of relations and the specific nature of the domain.\nOur approach integrates classic logical programming languages into large\nlanguage models (LLMs), enabling the utilization of logical reasoning\ncapabilities to tackle the KGQA task. By representing the questions as Prolog\nqueries, which are readable and near close to natural language in\nrepresentation, we facilitate the generation of programmatically derived\nanswers. To validate the effectiveness of our approach, we evaluate it using a\nwell-known benchmark dataset, MetaQA. 
Our experimental results demonstrate that\nour method achieves accurate identification of correct answer entities for all\ntest questions, even when trained on a small fraction of annotated data.\nOverall, our work presents a promising approach to addressing question\nanswering over domain-specific graphs, offering an explainable and robust\nsolution by incorporating logical programming languages.\n","authors":["Navid Madani","Rohini K. Srihari","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2303.02206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12157v1","updated":"2023-08-23T14:18:44Z","published":"2023-08-23T14:18:44Z","title":"Evaluation of Faithfulness Using the Longest Supported Subsequence","summary":" As increasingly sophisticated language models emerge, their trustworthiness\nbecomes a pivotal issue, especially in tasks such as summarization and\nquestion-answering. Ensuring their responses are contextually grounded and\nfaithful is challenging due to the linguistic diversity and the myriad of\npossible answers. In this paper, we introduce a novel approach to evaluate\nfaithfulness of machine-generated text by computing the longest noncontinuous\nsubstring of the claim that is supported by the context, which we refer to as\nthe Longest Supported Subsequence (LSS). Using a new human-annotated dataset,\nwe finetune a model to generate LSS. We introduce a new method of evaluation\nand demonstrate that these metrics correlate better with human ratings when LSS\nis employed, as opposed to when it is not. Our proposed metric demonstrates an\n18% enhancement over the prevailing state-of-the-art metric for faithfulness on\nour dataset. Our metric consistently outperforms other metrics on a\nsummarization dataset across six different models. Finally, we compare several\npopular Large Language Models (LLMs) for faithfulness using this metric. We\nrelease the human-annotated dataset built for predicting LSS and our fine-tuned\nmodel for evaluating faithfulness.\n","authors":["Anirudh Mittal","Timo Schick","Mikel Artetxe","Jane Dwivedi-Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10652v4","updated":"2023-08-23T14:00:36Z","published":"2023-07-20T07:33:30Z","title":"Exploring the Landscape of Natural Language Processing Research","summary":" As an efficient approach to understand, generate, and process natural\nlanguage texts, research in natural language processing (NLP) has exhibited a\nrapid spread and wide adoption in recent years. Given the increasing research\nwork in this area, several NLP-related approaches have been surveyed in the\nresearch community. However, a comprehensive study that categorizes established\ntopics, identifies trends, and outlines areas for future research remains\nabsent. Contributing to closing this gap, we have systematically classified and\nanalyzed research papers in the ACL Anthology. 
As a result, we present a\nstructured overview of the research landscape, provide a taxonomy of fields of\nstudy in NLP, analyze recent developments in NLP, summarize our findings, and\nhighlight directions for future work.\n","authors":["Tim Schopf","Karim Arabi","Florian Matthes"],"pdf_url":"https://arxiv.org/pdf/2307.10652v4.pdf","comment":"Extended version of the paper published at the 14th International\n Conference on Recent Advances in Natural Language Processing (RANLP 2023)"},{"id":"http://arxiv.org/abs/2308.12131v1","updated":"2023-08-23T13:37:02Z","published":"2023-08-23T13:37:02Z","title":"Semantic Change Detection for the Romanian Language","summary":" Automatic semantic change methods try to identify the changes that appear\nover time in the meaning of words by analyzing their usage in diachronic\ncorpora. In this paper, we analyze different strategies to create static and\ncontextual word embedding models, i.e., Word2Vec and ELMo, on real-world\nEnglish and Romanian datasets. To test our pipeline and determine the\nperformance of our models, we first evaluate both word embedding models on an\nEnglish dataset (SEMEVAL-CCOHA). Afterward, we focus our experiments on a\nRomanian dataset, and we underline different aspects of semantic changes in\nthis low-resource language, such as meaning acquisition and loss. The\nexperimental results show that, depending on the corpus, the most important\nfactors to consider are the choice of model and the distance to calculate a\nscore for detecting semantic change.\n","authors":["Ciprian-Octavian Truică","Victor Tudose","Elena-Simona Apostol"],"pdf_url":"https://arxiv.org/pdf/2308.12131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12097v1","updated":"2023-08-23T12:36:57Z","published":"2023-08-23T12:36:57Z","title":"Instruction Position Matters in Sequence Generation with Large Language\n Models","summary":" Large language models (LLMs) are capable of performing conditional sequence\ngeneration tasks, such as translation or summarization, through instruction\nfine-tuning. The fine-tuning data is generally sequentially concatenated from a\nspecific task instruction, an input sentence, and the corresponding response.\nConsidering the locality modeled by the self-attention mechanism of LLMs, these\nmodels face the risk of instruction forgetting when generating responses for\nlong input sentences. To mitigate this issue, we propose enhancing the\ninstruction-following capability of LLMs by shifting the position of task\ninstructions after the input sentences. Theoretical analysis suggests that our\nstraightforward method can alter the model's learning focus, thereby\nemphasizing the training of instruction-following capabilities. Concurrently,\nexperimental results demonstrate that our approach consistently outperforms\ntraditional settings across various model scales (1B / 7B / 13B) and different\nsequence generation tasks (translation and summarization), without any\nadditional data or annotation costs. 
Notably, our method significantly improves\nthe zero-shot performance on conditional sequence generation, e.g., up to 9.7\nBLEU points on WMT zero-shot translation tasks.\n","authors":["Yijin Liu","Xianfeng Zeng","Fandong Meng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12097v1.pdf","comment":"Codes and results are at\n https://github.com/Adaxry/Post-Instruction/tree/main"},{"id":"http://arxiv.org/abs/2307.00009v2","updated":"2023-08-23T12:24:02Z","published":"2023-06-18T20:06:58Z","title":"Comparison of Machine Learning Methods for Assigning Software Issues to\n Team Members","summary":" Software issues contain units of work to fix, improve, or create new threads\nduring the development and facilitate communication among the team members.\nAssigning an issue to the most relevant team member and determining a category\nof an issue is a tedious and challenging task. Wrong classifications cause\ndelays and rework in the project and trouble among the team members. This paper\nproposes a set of carefully curated linguistic features for shallow machine\nlearning methods and compares the performance of shallow and ensemble methods\nwith deep language models. Unlike the state-of-the-art, we assign issues to\nfour roles (designer, developer, tester, and leader) rather than to specific\nindividuals or teams to contribute to the generality of our solution. We also\nconsider the level of experience of the developers to reflect the industrial\npractices in our solution formulation. We collect and annotate five industrial\ndata sets from one of the top three global television producers to evaluate our\nproposal and compare it with deep language models. Our data sets contain 5324\nissues in total. We show that an ensemble classifier of shallow techniques\nachieves 0.92 for issue assignment in accuracy which is statistically\ncomparable to the state-of-the-art deep language models. The contributions\ninclude the public sharing of five annotated industrial issue data sets, the\ndevelopment of a clear and comprehensive feature set, the introduction of a\nnovel label set, and the validation of the efficacy of an ensemble classifier\nof shallow machine learning techniques.\n","authors":["Büşra Tabak","Fatma Başak Aydemir"],"pdf_url":"https://arxiv.org/pdf/2307.00009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12086v1","updated":"2023-08-23T12:11:27Z","published":"2023-08-23T12:11:27Z","title":"Out of the Cage: How Stochastic Parrots Win in Cyber Security\n Environments","summary":" Large Language Models (LLMs) have gained widespread popularity across diverse\ndomains involving text generation, summarization, and various natural language\nprocessing tasks. Despite their inherent limitations, LLM-based designs have\nshown promising capabilities in planning and navigating open-world scenarios.\nThis paper introduces a novel application of pre-trained LLMs as agents within\ncybersecurity network environments, focusing on their utility for sequential\ndecision-making processes.\n We present an approach wherein pre-trained LLMs are leveraged as attacking\nagents in two reinforcement learning environments. Our proposed agents\ndemonstrate similar or better performance against state-of-the-art agents\ntrained for thousands of episodes in most scenarios and configurations. In\naddition, the best LLM agents perform similarly to human testers of the\nenvironment without any additional training process. 
This design highlights the\npotential of LLMs to efficiently address complex decision-making tasks within\ncybersecurity.\n Furthermore, we introduce a new network security environment named\nNetSecGame. The environment is designed to eventually support complex\nmulti-agent scenarios within the network security domain. The proposed\nenvironment mimics real network attacks and is designed to be highly modular\nand adaptable for various scenarios.\n","authors":["Maria Rigaki","Ondřej Lukáš","Carlos A. Catania","Sebastian Garcia"],"pdf_url":"https://arxiv.org/pdf/2308.12086v1.pdf","comment":"Under review. 10 pages plus appendices, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.12067v1","updated":"2023-08-23T11:27:30Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models acquire their instruction-following\ncapabilities through a two-stage training process: pre-training on image-text\npairs and fine-tuning on supervised vision-language instruction data. Recent\nstudies have shown that large language models can achieve satisfactory results\neven with a limited amount of high-quality instruction-following data. In this\npaper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset\ncomprising only 200 examples, amounting to approximately 6% of the\ninstruction-following data used in the alignment dataset for MiniGPT-4. We\nfirst propose several metrics to access the quality of multimodal instruction\ndata. Based on these metrics, we present a simple and effective data selector\nto automatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations (e.g., visual question answering, GPT-4 preference).\nOverall, our findings demonstrate that less but high-quality instruction tuning\ndata is efficient to enable multimodal large language models to generate better\noutput.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10592v2","updated":"2023-08-23T11:01:21Z","published":"2023-08-21T09:47:31Z","title":"BAN-PL: a Novel Polish Dataset of Banned Harmful and Offensive Content\n from Wykop.pl web service","summary":" Advances in automated detection of offensive language online, including hate\nspeech and cyberbullying, require improved access to publicly available\ndatasets comprising social media content. In this paper, we introduce BAN-PL,\nthe first open dataset in the Polish language that encompasses texts flagged as\nharmful and subsequently removed by professional moderators. The dataset\nencompasses a total of 691,662 pieces of content from a popular social\nnetworking service, Wykop, often referred to as the \"Polish Reddit\", including\nboth posts and comments, and is evenly distributed into two distinct classes:\n\"harmful\" and \"neutral\". We provide a comprehensive description of the data\ncollection and preprocessing procedures, as well as highlight the linguistic\nspecificity of the data. 
The BAN-PL dataset, along with advanced preprocessing\nscripts for, i.a., unmasking profanities, will be publicly available.\n","authors":["Inez Okulska","Kinga Głąbińska","Anna Kołos","Agnieszka Karlińska","Emilia Wiśnios","Adam Nowakowski","Paweł Ellerik","Andrzej Prałat"],"pdf_url":"https://arxiv.org/pdf/2308.10592v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12060v1","updated":"2023-08-23T11:00:36Z","published":"2023-08-23T11:00:36Z","title":"FlexKBQA: A Flexible LLM-Powered Framework for Few-Shot Knowledge Base\n Question Answering","summary":" Knowledge base question answering (KBQA) is a critical yet challenging task\ndue to the vast number of entities within knowledge bases and the diversity of\nnatural language questions posed by users. Unfortunately, the performance of\nmost KBQA models tends to decline significantly in real-world scenarios where\nhigh-quality annotated data is insufficient. To mitigate the burden associated\nwith manual annotation, we introduce FlexKBQA by utilizing Large Language\nModels (LLMs) as program translators for addressing the challenges inherent in\nthe few-shot KBQA task. Specifically, FlexKBQA leverages automated algorithms\nto sample diverse programs, such as SPARQL queries, from the knowledge base,\nwhich are subsequently converted into natural language questions via LLMs. This\nsynthetic dataset facilitates training a specialized lightweight model for the\nKB. Additionally, to reduce the barriers of distribution shift between\nsynthetic data and real user questions, FlexKBQA introduces an execution-guided\nself-training method to iteratively leverage unlabeled user questions.\nFurthermore, we explore harnessing the inherent reasoning capability of LLMs to\nenhance the entire framework. Consequently, FlexKBQA delivers substantial\nflexibility, encompassing data annotation, deployment, and being domain\nagnostic. Through extensive experiments on GrailQA, WebQSP, and KQA Pro, we\nobserve that under the few-shot and even the more challenging zero-shot scenarios,\nFlexKBQA achieves impressive results with a few annotations, surpassing all\nprevious baselines and even approaching the performance of supervised models,\nachieving a remarkable 93% performance relative to the fully-supervised models.\nWe posit that FlexKBQA represents a significant advancement towards exploring\nbetter integration of large and lightweight models. The code is open-sourced.\n","authors":["Zhenyu Li","Sunqi Fan","Yu Gu","Xiuxing Li","Zhichao Duan","Bowen Dong","Ning Liu","Jianyong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11466v2","updated":"2023-08-23T10:46:16Z","published":"2023-08-22T14:25:15Z","title":"SONAR: Sentence-Level Multimodal and Language-Agnostic Representations","summary":" We introduce SONAR, a new multilingual and multimodal fixed-size sentence\nembedding space. Our single text encoder, covering 200 languages, substantially\noutperforms existing sentence embeddings such as LASER3 and LabSE on the xsim\nand xsim++ multilingual similarity search tasks. Speech segments can be\nembedded in the same SONAR embedding space using language-specific speech\nencoders trained in a teacher-student setting on speech transcription data. Our\nencoders outperform existing speech encoders on similarity search tasks. We\nalso provide a text decoder for 200 languages, which allows us to perform\ntext-to-text and speech-to-text machine translation, including for zero-shot\nlanguage and modality combinations. 
Our text-to-text results are competitive\ncompared to the state-of-the-art NLLB~1B model, despite the fixed-size\nbottleneck representation. Our zero-shot speech-to-text translation results\ncompare favorably with strong supervised baselines such as Whisper.\n","authors":["Paul-Ambroise Duquenne","Holger Schwenk","Benoît Sagot"],"pdf_url":"https://arxiv.org/pdf/2308.11466v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12050v1","updated":"2023-08-23T10:41:07Z","published":"2023-08-23T10:41:07Z","title":"Aligning Language Models with Offline Reinforcement Learning from Human\n Feedback","summary":" Learning from human preferences is crucial for language models (LMs) to\neffectively cater to human needs and societal values. Previous research has\nmade notable progress by leveraging human feedback to follow instructions.\nHowever, these approaches rely primarily on online reinforcement learning (RL)\ntechniques like Proximal Policy Optimization (PPO), which have been proven\nunstable and challenging to tune for language models. Moreover, PPO requires\ncomplex distributed system implementation, hindering the efficiency of\nlarge-scale distributed training. In this study, we propose an offline\nreinforcement learning from human feedback (RLHF) framework to align LMs using\npre-generated samples without interacting with RL environments. Specifically,\nwe explore maximum likelihood estimation (MLE) with filtering, reward-weighted\nregression (RWR), and Decision Transformer (DT) to align language models to\nhuman preferences. By employing a loss function similar to supervised\nfine-tuning, our methods ensure more stable model training than PPO with a\nsimple machine learning system~(MLSys) and much fewer (around 12.3\\%) computing\nresources. Experimental results demonstrate the DT alignment outperforms other\nOffline RLHF methods and is better than PPO.\n","authors":["Jian Hu","Li Tao","June Yang","Chandler Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12045v1","updated":"2023-08-23T10:25:37Z","published":"2023-08-23T10:25:37Z","title":"CgT-GAN: CLIP-guided Text GAN for Image Captioning","summary":" The large-scale visual-language pre-trained model, Contrastive Language-Image\nPre-training (CLIP), has significantly improved image captioning for scenarios\nwithout human-annotated image-caption pairs. Recent advanced CLIP-based image\ncaptioning without human annotations follows a text-only training paradigm,\ni.e., reconstructing text from shared embedding space. Nevertheless, these\napproaches are limited by the training/inference gap or huge storage\nrequirements for text embeddings. Given that it is trivial to obtain images in\nthe real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates\nimages into the training process to enable the model to \"see\" real visual\nmodality. Particularly, we use adversarial training to teach CgT-GAN to mimic\nthe phrases of an external text corpus and CLIP-based reward to provide\nsemantic guidance. The caption generator is jointly rewarded based on the\ncaption naturalness to human language calculated from the GAN's discriminator\nand the semantic guidance reward computed by the CLIP-based reward module. In\naddition to the cosine similarity as the semantic guidance reward (i.e.,\nCLIP-cos), we further introduce a novel semantic guidance reward called\nCLIP-agg, which aligns the generated caption with a weighted text embedding by\nattentively aggregating the entire corpus. 
Experimental results on three\nsubtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms\nstate-of-the-art methods significantly across all metrics. Code is available at\nhttps://github.com/Lihr747/CgtGAN.\n","authors":["Jiarui Yu","Haoran Li","Yanbin Hao","Bin Zhu","Tong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2308.12045v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.12043v1","updated":"2023-08-23T10:08:10Z","published":"2023-08-23T10:08:10Z","title":"IncreLoRA: Incremental Parameter Allocation Method for\n Parameter-Efficient Fine-tuning","summary":" With the increasing size of pre-trained language models (PLMs), fine-tuning\nall the parameters in the model is not efficient, especially when there are a\nlarge number of downstream tasks, which incur significant training and storage\ncosts. Many parameter-efficient fine-tuning (PEFT) approaches have been\nproposed, among which, Low-Rank Adaptation (LoRA) is a representative approach\nthat injects trainable rank decomposition matrices into every target module.\nYet LoRA ignores the importance of parameters in different modules. To address\nthis problem, many works have been proposed to prune the parameters of LoRA.\nHowever, under limited training conditions, the upper bound of the rank of the\npruned parameter matrix is still affected by the preset values. We, therefore,\npropose IncreLoRA, an incremental parameter allocation method that adaptively\nadds trainable parameters during training based on the importance scores of\neach module. This approach is different from the pruning method as it is not\nlimited by the initial number of training parameters, and each parameter matrix\nhas a higher rank upper bound for the same training overhead. We conduct\nextensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA.\nThe results show that our method owns higher parameter efficiency, especially\nwhen under the low-resource settings where our method significantly outperforms\nthe baselines. Our code is publicly available.\n","authors":["Feiyu Zhang","Liangzhi Li","Junhao Chen","Zhouqiang Jiang","Bowen Wang","Yiming Qian"],"pdf_url":"https://arxiv.org/pdf/2308.12043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12039v1","updated":"2023-08-23T09:56:59Z","published":"2023-08-23T09:56:59Z","title":"Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep\n Learning Track","summary":" Large-scale text retrieval technology has been widely used in various\npractical business scenarios. This paper presents our systems for the TREC 2022\nDeep Learning Track. We explain the hybrid text retrieval and multi-stage text\nranking method adopted in our solution. The retrieval stage combined the two\nstructures of traditional sparse retrieval and neural dense retrieval. In the\nranking stage, in addition to the full interaction-based ranking model built on\nlarge pre-trained language model, we also proposes a lightweight sub-ranking\nmodule to further enhance the final text ranking performance. Evaluation\nresults demonstrate the effectiveness of our proposed approach. 
Our models\nachieve the 1st and 4th rank on the test set of passage ranking and document\nranking respectively.\n","authors":["Guangwei Xu","Yangzhao Zhang","Longhui Zhang","Dingkun Long","Pengjun Xie","Ruijie Guo"],"pdf_url":"https://arxiv.org/pdf/2308.12039v1.pdf","comment":"TREC 2022 Deep Learning Track"},{"id":"http://arxiv.org/abs/2308.12038v1","updated":"2023-08-23T09:55:41Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in low-resource\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a zero-shot manner\nfor both image-to-text and text-to-image generation, even surpassing models\ntrained on image-text data in native languages. Taking Chinese as a practice of\nMPM, we build large multimodal models VisCPM in image-to-text and text-to-image\ngeneration, which achieve state-of-the-art (open-source) performance in\nChinese. To facilitate future research, we open-source codes and model weights\nat https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v1.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2308.12033v1","updated":"2023-08-23T09:46:37Z","published":"2023-08-23T09:46:37Z","title":"PREFER: Prompt Ensemble Learning via Feedback-Reflect-Refine","summary":" As an effective tool for eliciting the power of Large Language Models (LLMs),\nprompting has recently demonstrated unprecedented abilities across a variety of\ncomplex tasks. To further improve the performance, prompt ensemble has\nattracted substantial interest for tackling the hallucination and instability\nof LLMs. However, existing methods usually adopt a two-stage paradigm, which\nrequires a pre-prepared set of prompts with substantial manual effort, and is\nunable to perform directed optimization for different weak learners. In this\npaper, we propose a simple, universal, and automatic method named PREFER (Pompt\nEnsemble learning via Feedback-Reflect-Refine) to address the stated\nlimitations. Specifically, given the fact that weak learners are supposed to\nfocus on hard examples during boosting, PREFER builds a feedback mechanism for\nreflecting on the inadequacies of existing weak learners. Based on this, the\nLLM is required to automatically synthesize new prompts for iterative\nrefinement. 
Moreover, to enhance stability of the prompt effect evaluation, we\npropose a novel prompt bagging method involving forward and backward thinking,\nwhich is superior to majority voting and is beneficial for both feedback and\nweight calculation in boosting. Extensive experiments demonstrate that our\nPREFER achieves state-of-the-art performance in multiple types of tasks by a\nsignificant margin. We have made our code publicly available.\n","authors":["Chenrui Zhang","Lin Liu","Jinpeng Wang","Chuyuan Wang","Xiao Sun","Hongyu Wang","Mingchen Cai"],"pdf_url":"https://arxiv.org/pdf/2308.12033v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.12032v1","updated":"2023-08-23T09:45:29Z","published":"2023-08-23T09:45:29Z","title":"From Quantity to Quality: Boosting LLM Performance with Self-Guided Data\n Selection for Instruction Tuning","summary":" In the realm of Large Language Models, the balance between instruction data\nquality and quantity has become a focal point. Recognizing this, we introduce a\nself-guided methodology for LLMs to autonomously discern and select cherry\nsamples from vast open-source datasets, effectively minimizing manual curation\nand potential cost for instruction tuning an LLM. Our key innovation, the\nInstruction-Following Difficulty (IFD) metric, emerges as a pivotal tool to\nidentify discrepancies between a model's expected responses and its autonomous\ngeneration prowess. Through the adept application of IFD, cherry samples are\npinpointed, leading to a marked uptick in model training efficiency. Empirical\nvalidations on renowned datasets like Alpaca and WizardLM underpin our\nfindings; with a mere 10% of conventional data input, our strategy showcases\nimproved results. This synthesis of self-guided cherry-picking and the IFD\nmetric signifies a transformative leap in the optimization of LLMs, promising\nboth efficiency and resource-conscious advancements.\n","authors":["Ming Li","Yong Zhang","Zhitao Li","Jiuhai Chen","Lichang Chen","Ning Cheng","Jianzong Wang","Tianyi Zhou","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.12032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12030v1","updated":"2023-08-23T09:43:10Z","published":"2023-08-23T09:43:10Z","title":"Prompt-Based Length Controlled Generation with Reinforcement Learning","summary":" Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted\ngreat attention given their surprising improvement and performance. Length\ncontrolled generation of LLMs emerges as an important topic, which also enables\nusers to fully leverage the capability of LLMs in more real-world scenarios\nlike generating a proper answer or essay of a desired length. In addition, the\nautoregressive generation in LLMs is extremely time-consuming, while the\nability of controlling this generated length can arbitrarily reduce the\ninference cost by limiting the length, and thus satisfy different needs.\nTherefore, we aim to propose a prompt-based length control method to achieve\nthis length controlled generation, which can also be widely applied in\nGPT-style LLMs. In particular, we adopt reinforcement learning with the reward\nsignal given by either trainable or rule-based reward model, which further\naffects the generation of LLMs via rewarding a pre-defined target length.\nExperiments show that our method significantly improves the accuracy of\nprompt-based length control for summarization task on popular datasets like\nCNNDM and NYT. 
We believe this length-controllable ability can provide more\npotential in the era of LLMs.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07758v3","updated":"2023-08-23T09:35:33Z","published":"2023-08-15T13:19:59Z","title":"Forward-Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Thought (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \\citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. We mask a token in the question by ${\\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \\textit{a\nsimple template}, i.e., \"\\textit{\\textbf{If we know the answer of the above\nquestion is \\{a candidate answer\\}, what is the value of unknown variable ${\\bf\nx}$?}}\" Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2308.12025v1","updated":"2023-08-23T09:32:40Z","published":"2023-08-23T09:32:40Z","title":"Knowledge-injected Prompt Learning for Chinese Biomedical Entity\n Normalization","summary":" The Biomedical Entity Normalization (BEN) task aims to align raw,\nunstructured medical entities to standard entities, thus promoting data\ncoherence and facilitating better downstream medical applications. Recently,\nprompt learning methods have shown promising results in this task. However,\nexisting research falls short in tackling the more complex Chinese BEN task,\nespecially in the few-shot scenario with limited medical data, and the vast\npotential of the external medical knowledge base has yet to be fully harnessed.\nTo address these challenges, we propose a novel Knowledge-injected Prompt\nLearning (PL-Knowledge) method. Specifically, our approach consists of five\nstages: candidate entity matching, knowledge extraction, knowledge encoding,\nknowledge injection, and prediction output. By effectively encoding the\nknowledge items contained in medical entities and incorporating them into our\ntailor-made knowledge-injected templates, the additional knowledge enhances the\nmodel's ability to capture latent relationships between medical entities, thus\nachieving a better match with the standard entities. We extensively evaluate\nour model on a benchmark dataset in both few-shot and full-scale scenarios. 
Our\nmethod outperforms existing baselines, with an average accuracy boost of\n12.96\\% in few-shot and 0.94\\% in full-data cases, showcasing its excellence in\nthe BEN task.\n","authors":["Songhua Yang","Chenghao Zhang","Hongfei Xu","Yuxiang Jia"],"pdf_url":"https://arxiv.org/pdf/2308.12025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12022v1","updated":"2023-08-23T09:29:29Z","published":"2023-08-23T09:29:29Z","title":"Reranking Passages with Coarse-to-Fine Neural Retriever using\n List-Context Information","summary":" Passage reranking is a crucial task in many applications, particularly when\ndealing with large-scale documents. Traditional neural architectures are\nlimited in retrieving the best passage for a question because they usually\nmatch the question to each passage separately, seldom considering contextual\ninformation in other passages that can provide comparison and reference\ninformation. This paper presents a list-context attention mechanism to augment\nthe passage representation by incorporating the list-context information from\nother candidates. The proposed coarse-to-fine (C2F) neural retriever addresses\nthe out-of-memory limitation of the passage attention mechanism by dividing the\nlist-context modeling process into two sub-processes, allowing for efficient\nencoding of context information from a large number of candidate answers. This\nmethod can be generally used to encode context information from any number of\ncandidate answers in one pass. Different from most multi-stage information\nretrieval architectures, this model integrates the coarse and fine rankers into\nthe joint optimization process, allowing for feedback between the two layers to\nupdate the model simultaneously. Experiments demonstrate the effectiveness of\nthe proposed approach.\n","authors":["Hongyin Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.12022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12014v1","updated":"2023-08-23T09:11:13Z","published":"2023-08-23T09:11:13Z","title":"From Instructions to Intrinsic Human Values -- A Survey of Alignment\n Goals for Big Models","summary":" Big models, exemplified by Large Language Models (LLMs), are models typically\npre-trained on massive data and comprised of enormous parameters, which not\nonly obtain significantly improved performance across diverse tasks but also\npresent emergent capabilities absent in smaller models. However, the growing\nintertwining of big models with everyday human lives poses potential risks and\nmight cause serious social harm. Therefore, many efforts have been made to\nalign LLMs with humans to make them better follow user instructions and satisfy\nhuman preferences. Nevertheless, `what to align with' has not been fully\ndiscussed, and inappropriate alignment goals might even backfire. In this\npaper, we conduct a comprehensive survey of different alignment goals in\nexisting work and trace their evolution paths to help identify the most\nessential goal. Particularly, we investigate related works from two\nperspectives: the definition of alignment goals and alignment evaluation. Our\nanalysis encompasses three distinct levels of alignment goals and reveals a\ngoal transformation from fundamental abilities to value orientation, indicating\nthe potential of intrinsic human values as the alignment goal for enhanced\nLLMs. 
Based on such results, we further discuss the challenges of achieving\nsuch intrinsic value alignment and provide a collection of available resources\nfor future research on the alignment of big models.\n","authors":["Jing Yao","Xiaoyuan Yi","Xiting Wang","Jindong Wang","Xing Xie"],"pdf_url":"https://arxiv.org/pdf/2308.12014v1.pdf","comment":"20 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12008v1","updated":"2023-08-23T08:54:05Z","published":"2023-08-23T08:54:05Z","title":"Graecia capta ferum victorem cepit. Detecting Latin Allusions to Ancient\n Greek Literature","summary":" Intertextual allusions hold a pivotal role in Classical Philology, with Latin\nauthors frequently referencing Ancient Greek texts. Until now, the automatic\nidentification of these intertextual references has been constrained to\nmonolingual approaches, seeking parallels solely within Latin or Greek texts.\nIn this study, we introduce SPhilBERTa, a trilingual Sentence-RoBERTa model\ntailored for Classical Philology, which excels at cross-lingual semantic\ncomprehension and identification of identical sentences across Ancient Greek,\nLatin, and English. We generate new training data by automatically translating\nEnglish texts into Ancient Greek. Further, we present a case study,\ndemonstrating SPhilBERTa's capability to facilitate automated detection of\nintertextual parallels. Our models and resources are available at\nhttps://github.com/Heidelberg-NLP/ancient-language-models.\n","authors":["Frederick Riemenschneider","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2308.12008v1.pdf","comment":"Paper accepted for publication at the First Workshop on Ancient\n Language Processing (ALP) 2023; 9 pages, 5 tables"},{"id":"http://arxiv.org/abs/2308.11995v1","updated":"2023-08-23T08:33:14Z","published":"2023-08-23T08:33:14Z","title":"Topical-Chat: Towards Knowledge-Grounded Open-Domain Conversations","summary":" Building socialbots that can have deep, engaging open-domain conversations\nwith humans is one of the grand challenges of artificial intelligence (AI). To\nthis end, bots need to be able to leverage world knowledge spanning several\ndomains effectively when conversing with humans who have their own world\nknowledge. Existing knowledge-grounded conversation datasets are primarily\nstylized with explicit roles for conversation partners. These datasets also do\nnot explore depth or breadth of topical coverage with transitions in\nconversations. We introduce Topical-Chat, a knowledge-grounded human-human\nconversation dataset where the underlying knowledge spans 8 broad topics and\nconversation partners don't have explicitly defined roles, to help further\nresearch in open-domain conversational AI. We also train several\nstate-of-the-art encoder-decoder conversational models on Topical-Chat and\nperform automated and human evaluation for benchmarking.\n","authors":["Karthik Gopalakrishnan","Behnam Hedayatnia","Qinlang Chen","Anna Gottardi","Sanjeev Kwatra","Anu Venkatesh","Raefer Gabriel","Dilek Hakkani-Tur"],"pdf_url":"https://arxiv.org/pdf/2308.11995v1.pdf","comment":"arXiving an old paper accepted at INTERSPEECH 2019"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. 
In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06857v2","updated":"2023-08-23T07:06:53Z","published":"2023-07-11T17:51:48Z","title":"Self-consistency for open-ended generations","summary":" Large Language Models (LLMs) can exhibit considerable variation in the\nquality of their sampled outputs. Reranking and selecting the best generation\nfrom the sampled set is a popular way of obtaining strong gains in generation\nquality. In this paper, we present a novel approach for reranking LLM\ngenerations. Unlike other techniques that might involve additional inferences\nor training a specialized reranker, our approach relies on easy to compute\npairwise statistics between the generations that have minimal compute overhead.\nWe show that our approach can be formalized as an extension of self-consistency\nand analyze its performance in that framework, theoretically as well as via\nsimulations. We show strong improvements for selecting the best $k$ generations\nfor code generation tasks as well as robust improvements for best generation\nfor the tasks of autoformalization, and summarization. While our approach only\nassumes black-box access to LLMs, we show that additional access to token\nprobabilities can improve performance even further.\n","authors":["Siddhartha Jain","Xiaofei Ma","Anoop Deoras","Bing Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.06857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11940v1","updated":"2023-08-23T06:21:46Z","published":"2023-08-23T06:21:46Z","title":"Audio Generation with Multiple Conditional Diffusion Model","summary":" Text-based audio generation models have limitations as they cannot encompass\nall the information in audio, leading to restricted controllability when\nrelying solely on text. To address this issue, we propose a novel model that\nenhances the controllability of existing pre-trained text-to-audio models by\nincorporating additional conditions including content (timestamp) and style\n(pitch contour and energy contour) as supplements to the text. This approach\nachieves fine-grained control over the temporal order, pitch, and energy of\ngenerated audio. 
To preserve the diversity of generation, we employ a trainable\ncontrol condition encoder that is enhanced by a large language model and a\ntrainable Fusion-Net to encode and fuse the additional conditions while keeping\nthe weights of the pre-trained text-to-audio model frozen. Due to the lack of\nsuitable datasets and evaluation metrics, we consolidate existing datasets into\na new dataset comprising the audio and corresponding conditions and use a\nseries of evaluation metrics to evaluate the controllability performance.\nExperimental results demonstrate that our model successfully achieves\nfine-grained control to accomplish controllable audio generation. Audio samples\nand our dataset are publicly available at\nhttps://conditionaudiogen.github.io/conditionaudiogen/\n","authors":["Zhifang Guo","Jianguo Mao","Rui Tao","Long Yan","Kazushige Ouchi","Hong Liu","Xiangdong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11940v1.pdf","comment":"Submitted to AAAI 2024"},{"id":"http://arxiv.org/abs/2206.08955v3","updated":"2023-08-23T05:34:42Z","published":"2022-06-17T18:11:34Z","title":"Making first order linear logic a generating grammar","summary":" It is known that different categorial grammars have surface representation in\na fragment of first order multiplicative linear logic (MLL1). We show that the\nfragment of interest is equivalent to the recently introduced extended tensor\ntype calculus (ETTC). ETTC is a calculus of specific typed terms, which\nrepresent tuples of strings, more precisely bipartite graphs decorated with\nstrings. Types are derived from linear logic formulas, and rules correspond to\nconcrete operations on these string-labeled graphs, so that they can be\nconveniently visualized. This provides the above mentioned fragment of MLL1\nthat is relevant for language modeling not only with some alternative syntax\nand intuitive geometric representation, but also with an intrinsic deductive\nsystem, which has been absent.\n In this work we consider a non-trivial notationally enriched variation of the\npreviously introduced {\\bf ETTC}, which allows more concise and transparent\ncomputations. We present both a cut-free sequent calculus and a natural\ndeduction formalism.\n","authors":["Sergey Slavnov"],"pdf_url":"https://arxiv.org/pdf/2206.08955v3.pdf","comment":"Revised and extended version with detailed proofs. arXiv admin note:\n substantial text overlap with arXiv:2112.15253"},{"id":"http://arxiv.org/abs/2205.03977v3","updated":"2023-08-23T05:18:04Z","published":"2022-05-08T23:58:40Z","title":"A Structured Span Selector","summary":" Many natural language processing tasks, e.g., coreference resolution and\nsemantic role labeling, require selecting text spans and making decisions about\nthem. A typical approach to such tasks is to score all possible spans and\ngreedily select spans for task-specific downstream processing. This approach,\nhowever, does not incorporate any inductive bias about what sort of spans ought\nto be selected, e.g., that selected spans tend to be syntactic constituents. In\nthis paper, we propose a novel grammar-based structured span selection model\nwhich learns to make use of the partial span-level annotation provided for such\nproblems. Compared to previous approaches, our approach gets rid of the\nheuristic greedy span selection scheme, allowing us to model the downstream\ntask on an optimal set of spans. We evaluate our model on two popular span\nprediction tasks: coreference resolution and semantic role labeling. 
We show\nempirical improvements on both.\n","authors":["Tianyu Liu","Yuchen Eleanor Jiang","Ryan Cotterell","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2205.03977v3.pdf","comment":"NAACL 2022 camera-ready"},{"id":"http://arxiv.org/abs/2308.11923v1","updated":"2023-08-23T05:13:25Z","published":"2023-08-23T05:13:25Z","title":"Audio Difference Captioning Utilizing Similarity-Discrepancy\n Disentanglement","summary":" We proposed Audio Difference Captioning (ADC) as a new extension task of\naudio captioning for describing the semantic differences between input pairs of\nsimilar but slightly different audio clips. The ADC solves the problem that\nconventional audio captioning sometimes generates similar captions for similar\naudio clips, failing to describe the difference in content. We also propose a\ncross-attention-concentrated transformer encoder to extract differences by\ncomparing a pair of audio clips and a similarity-discrepancy disentanglement to\nemphasize the difference in the latent space. To evaluate the proposed methods,\nwe built an AudioDiffCaps dataset consisting of pairs of similar but slightly\ndifferent audio clips with human-annotated descriptions of their differences.\nThe experiment with the AudioDiffCaps dataset showed that the proposed methods\nsolve the ADC task effectively and improve the attention weights to extract the\ndifference by visualizing them in the transformer encoder.\n","authors":["Daiki Takeuchi","Yasunori Ohishi","Daisuke Niizumi","Noboru Harada","Kunio Kashino"],"pdf_url":"https://arxiv.org/pdf/2308.11923v1.pdf","comment":"Accepted to DCASE2023 Workshop"},{"id":"http://arxiv.org/abs/2306.14122v3","updated":"2023-08-23T05:04:58Z","published":"2023-06-25T04:33:56Z","title":"Chain-of-Thought Prompt Distillation for Multimodal Named Entity\n Recognition and Multimodal Relation Extraction","summary":" Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction\n(MRE) necessitate the fundamental reasoning capacity for intricate linguistic\nand multimodal comprehension. In this study, we explore distilling the\nreasoning ability of large language models (LLMs) into a more compact student\nmodel by generating a \\textit{chain of thought} (CoT) -- a sequence of\nintermediate reasoning steps. Specifically, we commence by exemplifying the\nelicitation of such reasoning ability from LLMs through CoT prompts covering\nmulti-grain (noun, sentence, multimodality) and data-augmentation (style,\nentity, image) dimensions. Subsequently, we present a novel conditional prompt\ndistillation method to assimilate the commonsense reasoning ability from LLMs,\nthereby enhancing the utility of the student model in addressing text-only\ninputs without the requisite addition of image and CoT knowledge. Extensive\nexperiments reveal that our approach attains state-of-the-art accuracy and\nmanifests a plethora of advantages concerning interpretability, data\nefficiency, and cross-domain generalization on MNER and MRE datasets.\n","authors":["Feng Chen","Yujian Feng"],"pdf_url":"https://arxiv.org/pdf/2306.14122v3.pdf","comment":"modification"},{"id":"http://arxiv.org/abs/2308.11891v1","updated":"2023-08-23T03:38:21Z","published":"2023-08-23T03:38:21Z","title":"Bridging the Gap: Deciphering Tabular Data Using Large Language Model","summary":" In the realm of natural language processing, the understanding of tabular\ndata has perpetually stood as a focal point of scholarly inquiry. 
The emergence\nof expansive language models, exemplified by the likes of ChatGPT, has ushered\nin a wave of endeavors wherein researchers aim to harness these models for\ntasks related to table-based question answering. Central to our investigative\npursuits is the elucidation of methodologies that amplify the aptitude of such\nlarge language models in discerning both the structural intricacies and\ninherent content of tables, ultimately facilitating their capacity to provide\ninformed responses to pertinent queries. To this end, we have architected a\ndistinctive module dedicated to the serialization of tables for seamless\nintegration with expansive language models. Additionally, we've instituted a\ncorrective mechanism within the model to rectify potential inaccuracies.\nExperimental results indicate that, although our proposed method trails the\nSOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about\n1.2% in tests on specific datasets. This research marks the first application\nof large language models to table-based question answering tasks, enhancing the\nmodel's comprehension of both table structures and content.\n","authors":["Hengyuan Zhang","Peng Chang","Zongcheng Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08239v2","updated":"2023-08-23T03:37:04Z","published":"2023-08-16T09:15:18Z","title":"MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain\n Conversation","summary":" We propose MemoChat, a pipeline for refining instructions that enables large\nlanguage models (LLMs) to effectively employ self-composed memos for\nmaintaining consistent long-range open-domain conversations. We demonstrate a\nlong-range open-domain conversation through iterative\n\"memorization-retrieval-response\" cycles. This requires us to carefully design\ntailored tuning instructions for each distinct stage. The instructions are\nreconstructed from a collection of public datasets to teach the LLMs to\nmemorize and retrieve past dialogues with structured memos, leading to enhanced\nconsistency when participating in future conversations. We invite experts to\nmanually annotate a test set designed to evaluate the consistency of long-range\nconversations questions. Experiments on three testing scenarios involving both\nopen-source and API-accessible chatbots at scale verify the efficacy of\nMemoChat, which outperforms strong baselines. Our codes, data and models are\navailable here: https://github.com/LuJunru/MemoChat.\n","authors":["Junru Lu","Siyu An","Mingbao Lin","Gabriele Pergola","Yulan He","Di Yin","Xing Sun","Yunsheng Wu"],"pdf_url":"https://arxiv.org/pdf/2308.08239v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.16680v4","updated":"2023-08-23T03:28:30Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. 
Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v4.pdf","comment":"Draft Version"},{"id":"http://arxiv.org/abs/2308.11878v1","updated":"2023-08-23T02:49:35Z","published":"2023-08-23T02:49:35Z","title":"Cabrita: closing the gap for foreign languages","summary":" The strategy of training the model from scratch in a specific language or\ndomain serves two essential purposes: i) enhancing performance in the\nparticular linguistic or domain context, and ii) ensuring effective\ntokenization. The main limitation inherent to this approach lies in the\nassociated cost, which can reach six to seven-digit dollar values, depending on\nthe model size and the number of parameters involved.\n The main solution to overcome the cost challenge is to rely on available\npre-trained models, which, despite recent advancements such as the LLaMA and\nLLaMA-2 models, still demonstrate inefficiency for certain specific domain\nproblems or prove ineffective in scenarios involving conversational memory\nresources, given the large number of tokens required to represent text.\n To overcome this issue, we present a methodology named Cabrita, which, as our\nresearch demonstrates, successfully addresses the performance and efficient\ntokenization problem, all at an affordable cost. We believe that this\nmethodology can be applied to any transformer-like architecture model. To\nvalidate the study, we conducted continuous pre-training exclusively using\nPortuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in\na model named openCabrita 3B. The openCabrita 3B also features a new tokenizer\nthat results in a significant reduction in the number of tokens required to\nrepresent the text. In our assessment, for few-shot learning tasks, we achieved\nsimilar results with this 3B model compared to a traditional continuous\npre-training approach as well as to 7B models English pre-trained models.\n","authors":["Celio Larcher","Marcos Piau","Paulo Finardi","Pedro Gengo","Piero Esposito","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2308.11878v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2301.11004v5","updated":"2023-08-23T01:15:04Z","published":"2023-01-26T09:26:01Z","title":"NLP as a Lens for Causal Analysis and Perception Mining to Infer Mental\n Health on Social Media","summary":" Interactions among humans on social media often convey intentions behind\ntheir actions, yielding a psychological language resource for Mental Health\nAnalysis (MHA) of online users. The success of Computational Intelligence\nTechniques (CIT) for inferring mental illness from such social media resources\npoints to NLP as a lens for causal analysis and perception mining. 
However, we\nargue that more consequential and explainable research is required for optimal\nimpact on clinical psychology practice and personalized mental healthcare. To\nbridge this gap, we posit two significant dimensions: (1) Causal analysis to\nillustrate a cause and effect relationship in the user generated text; (2)\nPerception mining to infer psychological perspectives of social effects on\nonline users intentions. Within the scope of Natural Language Processing (NLP),\nwe further explore critical areas of inquiry associated with these two\ndimensions, specifically through recent advancements in discourse analysis.\nThis position paper guides the community to explore solutions in this space and\nadvance the state of practice in developing conversational agents for inferring\nmental health from social media. We advocate for a more explainable approach\ntoward modeling computational psychology problems through the lens of language\nas we observe an increased number of research contributions in dataset and\nproblem formulation for causal relation extraction and perception enhancements\nwhile inferring mental states.\n","authors":["Muskan Garg","Chandni Saxena","Usman Naseem","Bonnie J Dorr"],"pdf_url":"https://arxiv.org/pdf/2301.11004v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10380v2","updated":"2023-08-23T00:52:13Z","published":"2023-08-20T22:42:04Z","title":"A Human-on-the-Loop Optimization Autoformalism Approach for\n Sustainability","summary":" This paper outlines a natural conversational approach to solving personalized\nenergy-related problems using large language models (LLMs). We focus on\ncustomizable optimization problems that necessitate repeated solving with\nslight variations in modeling and are user-specific, hence posing a challenge\nto devising a one-size-fits-all model. We put forward a strategy that augments\nan LLM with an optimization solver, enhancing its proficiency in understanding\nand responding to user specifications and preferences while providing nonlinear\nreasoning capabilities. Our approach pioneers the novel concept of human-guided\noptimization autoformalism, translating a natural language task specification\nautomatically into an optimization instance. This enables LLMs to analyze,\nexplain, and tackle a variety of instance-specific energy-related problems,\npushing beyond the limits of current prompt-based techniques.\n Our research encompasses various commonplace tasks in the energy sector, from\nelectric vehicle charging and Heating, Ventilation, and Air Conditioning (HVAC)\ncontrol to long-term planning problems such as cost-benefit evaluations for\ninstalling rooftop solar photovoltaics (PVs) or heat pumps. This pilot study\nmarks an essential stride towards the context-based formulation of optimization\nusing LLMs, with the potential to democratize optimization processes. 
As a\nresult, stakeholders are empowered to optimize their energy consumption,\npromoting sustainable energy practices customized to personal needs and\npreferences.\n","authors":["Ming Jin","Bilgehan Sel","Fnu Hardeep","Wotao Yin"],"pdf_url":"https://arxiv.org/pdf/2308.10380v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2212.08663v2","updated":"2023-08-23T17:59:57Z","published":"2022-12-19T18:59:57Z","title":"Randomized Quantization: A Generic Augmentation for Data Agnostic\n Self-supervised Learning","summary":" Self-supervised representation learning follows a paradigm of withholding\nsome part of the data and tasking the network to predict it from the remaining\npart. Among many techniques, data augmentation lies at the core for creating\nthe information gap. Towards this end, masking has emerged as a generic and\npowerful tool where content is withheld along the sequential dimension, e.g.,\nspatial in images, temporal in audio, and syntactic in language. In this paper,\nwe explore the orthogonal channel dimension for generic data augmentation by\nexploiting precision redundancy. The data for each channel is quantized through\na non-uniform quantizer, with the quantized value sampled randomly within\nrandomly sampled quantization bins. From another perspective, quantization is\nanalogous to channel-wise masking, as it removes the information within each\nbin, but preserves the information across bins. Our approach significantly\nsurpasses existing generic data augmentation methods, while showing on par\nperformance against modality-specific augmentations. We comprehensively\nevaluate our approach on vision, audio, 3D point clouds, as well as the DABS\nbenchmark which is comprised of various data modalities. The code is available\nat https: //github.com/microsoft/random_quantize.\n","authors":["Huimin Wu","Chenyang Lei","Xiao Sun","Peng-Shuai Wang","Qifeng Chen","Kwang-Ting Cheng","Stephen Lin","Zhirong Wu"],"pdf_url":"https://arxiv.org/pdf/2212.08663v2.pdf","comment":"Accepted by ICCV 2023. The code is available at https:\n //github.com/microsoft/random_quantize"},{"id":"http://arxiv.org/abs/2308.12288v1","updated":"2023-08-23T17:59:11Z","published":"2023-08-23T17:59:11Z","title":"CHORUS: Learning Canonicalized 3D Human-Object Spatial Relations from\n Unbounded Synthesized Images","summary":" We present a method for teaching machines to understand and model the\nunderlying spatial common sense of diverse human-object interactions in 3D in a\nself-supervised way. This is a challenging task, as there exist specific\nmanifolds of the interactions that can be considered human-like and natural,\nbut the human pose and the geometry of objects can vary even for similar\ninteractions. Such diversity makes the annotating task of 3D interactions\ndifficult and hard to scale, which limits the potential to reason about that in\na supervised way. One way of learning the 3D spatial relationship between\nhumans and objects during interaction is by showing multiple 2D images captured\nfrom different viewpoints when humans interact with the same type of objects.\nThe core idea of our method is to leverage a generative model that produces\nhigh-quality 2D images from an arbitrary text prompt input as an \"unbounded\"\ndata generator with effective controllability and view diversity. Despite its\nimperfection of the image quality over real images, we demonstrate that the\nsynthesized images are sufficient to learn the 3D human-object spatial\nrelations. 
We present multiple strategies to leverage the synthesized images,\nincluding (1) the first method to leverage a generative image model for 3D\nhuman-object spatial relation learning; (2) a framework to reason about the 3D\nspatial relations from inconsistent 2D cues in a self-supervised manner via 3D\noccupancy reasoning with pose canonicalization; (3) semantic clustering to\ndisambiguate different types of interactions with the same object types; and\n(4) a novel metric to assess the quality of 3D spatial learning of interaction.\nProject Page: https://jellyheadandrew.github.io/projects/chorus\n","authors":["Sookwan Han","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2308.12288v1.pdf","comment":"Accepted to ICCV 2023 (Oral Presentation). Project Page:\n https://jellyheadandrew.github.io/projects/chorus"},{"id":"http://arxiv.org/abs/2307.03833v2","updated":"2023-08-23T17:40:11Z","published":"2023-07-07T21:03:18Z","title":"Back to Optimization: Diffusion-based Zero-Shot 3D Human Pose Estimation","summary":" Learning-based methods have dominated the 3D human pose estimation (HPE)\ntasks with significantly better performance in most benchmarks than traditional\noptimization-based methods. Nonetheless, 3D HPE in the wild is still the\nbiggest challenge of learning-based models, whether with 2D-3D lifting,\nimage-to-3D, or diffusion-based methods, since the trained networks implicitly\nlearn camera intrinsic parameters and domain-based 3D human pose distributions\nand estimate poses by statistical average. On the other hand, the\noptimization-based methods estimate results case-by-case, which can predict\nmore diverse and sophisticated human poses in the wild. By combining the\nadvantages of optimization-based and learning-based methods, we propose the\nZero-shot Diffusion-based Optimization (ZeDO) pipeline for 3D HPE to solve the\nproblem of cross-domain and in-the-wild 3D HPE. Our multi-hypothesis ZeDO\nachieves state-of-the-art (SOTA) performance on Human3.6M as minMPJPE $51.4$mm\nwithout training with any 2D-3D or image-3D pairs. Moreover, our\nsingle-hypothesis ZeDO achieves SOTA performance on 3DPW dataset with PA-MPJPE\n$42.6$mm on cross-dataset evaluation, which even outperforms learning-based\nmethods trained on 3DPW.\n","authors":["Zhongyu Jiang","Zhuoran Zhou","Lei Li","Wenhao Chai","Cheng-Yen Yang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2307.03833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12271v1","updated":"2023-08-23T17:39:58Z","published":"2023-08-23T17:39:58Z","title":"A Generative Approach for Image Registration of Visible-Thermal (VT)\n Cancer Faces","summary":" Since thermal imagery offers a unique modality to investigate pain, the U.S.\nNational Institutes of Health (NIH) has collected a large and diverse set of\ncancer patient facial thermograms for AI-based pain research. However,\ndiffering angles from camera capture between thermal and visible sensors has\nled to misalignment between Visible-Thermal (VT) images. We modernize the\nclassic computer vision task of image registration by applying and modifying a\ngenerative alignment algorithm to register VT cancer faces, without the need\nfor a reference or alignment parameters. By registering VT faces, we\ndemonstrate that the quality of thermal images produced in the generative AI\ndownstream task of Visible-to-Thermal (V2T) image translation significantly\nimproves up to 52.5\\%, than without registration. 
Images in this paper have\nbeen approved by the NIH NCI for public dissemination.\n","authors":["Catherine Ordun","Alexandra Cha","Edward Raff","Sanjay Purushotham","Karen Kwok","Mason Rule","James Gulley"],"pdf_url":"https://arxiv.org/pdf/2308.12271v1.pdf","comment":"2nd Annual Artificial Intelligence over Infrared Images for Medical\n Applications Workshop (AIIIMA) at the 26th International Conference on\n Medical Image Computing and Computer Assisted Intervention (MICCAI 2023)"},{"id":"http://arxiv.org/abs/2110.03006v4","updated":"2023-08-23T16:47:25Z","published":"2021-10-06T18:25:50Z","title":"Unsupervised Selective Labeling for More Effective Semi-Supervised\n Learning","summary":" Given an unlabeled dataset and an annotation budget, we study how to\nselectively label a fixed number of instances so that semi-supervised learning\n(SSL) on such a partially labeled dataset is most effective. We focus on\nselecting the right data to label, in addition to usual SSL's propagating\nlabels from labeled data to the rest unlabeled data. This instance selection\ntask is challenging, as without any labeled data we do not know what the\nobjective of learning should be. Intuitively, no matter what the downstream\ntask is, instances to be labeled must be representative and diverse: The former\nwould facilitate label propagation to unlabeled data, whereas the latter would\nensure coverage of the entire dataset. We capture this idea by selecting\ncluster prototypes, either in a pretrained feature space, or along with feature\noptimization, both without labels. Our unsupervised selective labeling\nconsistently improves SSL methods over state-of-the-art active learning given\nlabeled data, by 8 to 25 times in label efficiency. For example, it boosts\nFixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%)\nlabeled data, demonstrating that small computation spent on selecting what data\nto label brings significant gain especially under a low annotation budget. Our\nwork sets a new standard for practical and efficient SSL.\n","authors":["Xudong Wang","Long Lian","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2110.03006v4.pdf","comment":"Accepted by ECCV 2022; Fixed a few typos"},{"id":"http://arxiv.org/abs/2304.13014v2","updated":"2023-08-23T16:28:52Z","published":"2023-04-25T17:38:41Z","title":"Methods and datasets for segmentation of minimally invasive surgical\n instruments in endoscopic images and videos: A review of the state of the art","summary":" In the field of computer- and robot-assisted minimally invasive surgery,\nenormous progress has been made in recent years based on the recognition of\nsurgical instruments in endoscopic images and videos. In particular, the\ndetermination of the position and type of instruments is of great interest.\nCurrent work involves both spatial and temporal information, with the idea that\npredicting the movement of surgical tools over time may improve the quality of\nthe final segmentations. The provision of publicly available datasets has\nrecently encouraged the development of new methods, mainly based on deep\nlearning. In this review, we identify and characterize datasets used for method\ndevelopment and evaluation and quantify their frequency of use in the\nliterature. We further present an overview of the current state of research\nregarding the segmentation and tracking of minimally invasive surgical\ninstruments in endoscopic images and videos. 
The paper focuses on methods that\nwork purely visually, without markers of any kind attached to the instruments,\nconsidering both single-frame semantic and instance segmentation approaches, as\nwell as those that incorporate temporal information. The publications analyzed\nwere identified through the platforms Google Scholar, Web of Science, and\nPubMed. The search terms used were \"instrument segmentation\", \"instrument\ntracking\", \"surgical tool segmentation\", and \"surgical tool tracking\",\nresulting in a total of 741 articles published between 01/2015 and 07/2023, of\nwhich 123 were included using systematic selection criteria. A discussion of\nthe reviewed literature is provided, highlighting existing shortcomings and\nemphasizing the available potential for future developments.\n","authors":["Tobias Rueckert","Daniel Rueckert","Christoph Palm"],"pdf_url":"https://arxiv.org/pdf/2304.13014v2.pdf","comment":"29 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.11489v2","updated":"2023-08-23T16:16:44Z","published":"2023-08-22T15:10:42Z","title":"Learning from Semantic Alignment between Unpaired Multiviews for\n Egocentric Video Recognition","summary":" We are concerned with a challenging scenario in unpaired multiview video\nlearning. In this case, the model aims to learn comprehensive multiview\nrepresentations while the cross-view semantic information exhibits variations.\nWe propose Semantics-based Unpaired Multiview Learning (SUM-L) to tackle this\nunpaired multiview learning problem. The key idea is to build cross-view\npseudo-pairs and do view-invariant alignment by leveraging the semantic\ninformation of videos. To facilitate the data efficiency of multiview learning,\nwe further perform video-text alignment for first-person and third-person\nvideos, to fully leverage the semantic knowledge to improve video\nrepresentations. Extensive experiments on multiple benchmark datasets verify\nthe effectiveness of our framework. Our method also outperforms multiple\nexisting view-alignment methods, under the more challenging scenario than\ntypical paired or unpaired multimodal or multiview learning. Our code is\navailable at https://github.com/wqtwjt1996/SUM-L.\n","authors":["Qitong Wang","Long Zhao","Liangzhe Yuan","Ting Liu","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2308.11489v2.pdf","comment":"Proceedings of IEEE International Conference on Computer Vision\n (ICCV) 2023"},{"id":"http://arxiv.org/abs/2308.12234v1","updated":"2023-08-23T16:16:11Z","published":"2023-08-23T16:16:11Z","title":"MolGrapher: Graph-based Visual Recognition of Chemical Structures","summary":" The automatic analysis of chemical literature has immense potential to\naccelerate the discovery of new materials and drugs. Much of the critical\ninformation in patent documents and scientific articles is contained in\nfigures, depicting the molecule structures. However, automatically parsing the\nexact chemical structure is a formidable challenge, due to the amount of\ndetailed information, the diversity of drawing styles, and the need for\ntraining data. In this work, we introduce MolGrapher to recognize chemical\nstructures visually. First, a deep keypoint detector detects the atoms. Second,\nwe treat all candidate atoms and bonds as nodes and put them in a graph. This\nconstruct allows a natural graph representation of the molecule. Last, we\nclassify atom and bond nodes in the graph with a Graph Neural Network. 
To\naddress the lack of real training data, we propose a synthetic data generation\npipeline producing diverse and realistic results. In addition, we introduce a\nlarge-scale benchmark of annotated real molecule images, USPTO-30K, to spur\nresearch on this critical topic. Extensive experiments on five datasets show\nthat our approach significantly outperforms classical and learning-based\nmethods in most settings. Code, models, and datasets are available.\n","authors":["Lucas Morin","Martin Danelljan","Maria Isabel Agea","Ahmed Nassar","Valery Weber","Ingmar Meijer","Peter Staar","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12231v1","updated":"2023-08-23T16:13:58Z","published":"2023-08-23T16:13:58Z","title":"SPPNet: A Single-Point Prompt Network for Nuclei Image Segmentation","summary":" Image segmentation plays an essential role in nuclei image analysis.\nRecently, the segment anything model has made a significant breakthrough in\nsuch tasks. However, the current model has two major issues for cell\nsegmentation: (1) the image encoder of the segment anything model involves a\nlarge number of parameters. Retraining or even fine-tuning the model still\nrequires expensive computational resources. (2) in point prompt mode, points\nare sampled from the center of the ground truth and more than one set of points\nis expected to achieve reliable performance, which is not efficient for\npractical applications. In this paper, a single-point prompt network is\nproposed for nuclei image segmentation, called SPPNet. We replace the original\nimage encoder with a lightweight vision transformer. Also, an effective\nconvolutional block is added in parallel to extract the low-level semantic\ninformation from the image and compensate for the performance degradation due\nto the small image encoder. We propose a new point-sampling method based on the\nGaussian kernel. The proposed model is evaluated on the MoNuSeg-2018 dataset.\nThe results demonstrate that SPPNet outperforms existing U-shape architectures\nand shows faster convergence in training. Compared to the segment anything\nmodel, SPPNet shows roughly 20 times faster inference, with 1/70 parameters and\ncomputational cost. Particularly, only one set of points is required in both\nthe training and inference phases, which is more reasonable for clinical\napplications. The code for our work and more technical details can be found at\nhttps://github.com/xq141839/SPPNet.\n","authors":["Qing Xu","Wenwei Kuang","Zeyu Zhang","Xueyao Bao","Haoran Chen","Wenting Duan"],"pdf_url":"https://arxiv.org/pdf/2308.12231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08913v2","updated":"2023-08-23T16:07:52Z","published":"2023-06-15T07:32:10Z","title":"Advancing Volumetric Medical Image Segmentation via Global-Local Masked\n Autoencoder","summary":" Masked autoencoder (MAE) is a promising self-supervised pre-training\ntechnique that can improve the representation learning of a neural network\nwithout human intervention. However, applying MAE directly to volumetric\nmedical images poses two challenges: (i) a lack of global information that is\ncrucial for understanding the clinical context of the holistic data, (ii) no\nguarantee of stabilizing the representations learned from randomly masked\ninputs. 
To address these limitations, we propose the\n\\textbf{G}lobal-\\textbf{L}ocal \\textbf{M}asked \\textbf{A}uto\\textbf{E}ncoder\n(GL-MAE), a simple yet effective self-supervised pre-training strategy. In\naddition to reconstructing masked local views, as in previous methods, GL-MAE\nincorporates global context learning by reconstructing masked global views.\nFurthermore, a complete global view is integrated as an anchor to guide the\nreconstruction and stabilize the learning process through global-to-global\nconsistency learning and global-to-local consistency learning. Finetuning\nresults on multiple datasets demonstrate the superiority of our method over\nother state-of-the-art self-supervised algorithms, highlighting its\neffectiveness on versatile volumetric medical image segmentation tasks, even\nwhen annotations are scarce. Our codes and models will be released upon\nacceptance.\n","authors":["Jia-Xin Zhuang","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12218v1","updated":"2023-08-23T15:56:26Z","published":"2023-08-23T15:56:26Z","title":"CIParsing: Unifying Causality Properties into Multiple Human Parsing","summary":" Existing methods of multiple human parsing (MHP) apply statistical models to\nacquire underlying associations between images and labeled body parts. However,\nacquired associations often contain many spurious correlations that degrade\nmodel generalization, leading statistical models to be vulnerable to visually\ncontextual variations in images (e.g., unseen image styles/external\ninterventions). To tackle this, we present a causality inspired parsing\nparadigm termed CIParsing, which follows fundamental causal principles\ninvolving two causal properties for human parsing (i.e., the causal diversity\nand the causal invariance). Specifically, we assume that an input image is\nconstructed by a mix of causal factors (the characteristics of body parts) and\nnon-causal factors (external contexts), where only the former ones cause the\ngeneration process of human parsing.Since causal/non-causal factors are\nunobservable, a human parser in proposed CIParsing is required to construct\nlatent representations of causal factors and learns to enforce representations\nto satisfy the causal properties. In this way, the human parser is able to rely\non causal factors w.r.t relevant evidence rather than non-causal factors w.r.t\nspurious correlations, thus alleviating model degradation and yielding improved\nparsing ability. Notably, the CIParsing is designed in a plug-and-play fashion\nand can be integrated into any existing MHP models. Extensive experiments\nconducted on two widely used benchmarks demonstrate the effectiveness and\ngeneralizability of our method.\n","authors":["Xiaojia Chen","Xuanhan Wang","Lianli Gao","Beitao Chen","Jingkuan Song","HenTao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.12218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12216v1","updated":"2023-08-23T15:52:45Z","published":"2023-08-23T15:52:45Z","title":"SG-Former: Self-guided Transformer with Evolving Token Reallocation","summary":" Vision Transformer has demonstrated impressive success across various vision\ntasks. However, its heavy computation cost, which grows quadratically with\nrespect to the token sequence length, largely limits its power in handling\nlarge feature maps. 
To alleviate the computation cost, previous works rely on\neither fine-grained self-attentions restricted to local small regions, or\nglobal self-attentions but to shorten the sequence length resulting in coarse\ngranularity. In this paper, we propose a novel model, termed as Self-guided\nTransformer~(SG-Former), towards effective global self-attention with adaptive\nfine granularity. At the heart of our approach is to utilize a significance\nmap, which is estimated through hybrid-scale self-attention and evolves itself\nduring training, to reallocate tokens based on the significance of each region.\nIntuitively, we assign more tokens to the salient regions for achieving\nfine-grained attention, while allocating fewer tokens to the minor regions in\nexchange for efficiency and global receptive fields. The proposed SG-Former\nachieves performance superior to state of the art: our base size model achieves\n\\textbf{84.7\\%} Top-1 accuracy on ImageNet-1K, \\textbf{51.2mAP} bbAP on CoCo,\n\\textbf{52.7mIoU} on ADE20K surpassing the Swin Transformer by \\textbf{+1.3\\% /\n+2.7 mAP/ +3 mIoU}, with lower computation costs and fewer parameters. The code\nis available at\n\\href{https://github.com/OliverRensu/SG-Former}{https://github.com/OliverRensu/SG-Former}\n","authors":["Sucheng Ren","Xingyi Yang","Songhua Liu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12216v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12213v1","updated":"2023-08-23T15:51:36Z","published":"2023-08-23T15:51:36Z","title":"CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No","summary":" Out-of-distribution (OOD) detection refers to training the model on an\nin-distribution (ID) dataset to classify whether the input images come from\nunknown classes. Considerable effort has been invested in designing various OOD\ndetection methods based on either convolutional neural networks or\ntransformers. However, zero-shot OOD detection methods driven by CLIP, which\nonly require class names for ID, have received less attention. This paper\npresents a novel method, namely CLIP saying \"no\" (\\textbf{CLIPN}), which\nempowers the logic of saying \"no\" within CLIP. Our key motivation is to equip\nCLIP with the capability of distinguishing OOD and ID samples using\npositive-semantic prompts and negation-semantic prompts. Specifically, we\ndesign a novel learnable \"no\" prompt and a \"no\" text encoder to capture\nnegation semantics within images. Subsequently, we introduce two loss\nfunctions: the image-text binary-opposite loss and the text semantic-opposite\nloss, which we use to teach CLIPN to associate images with \"no\" prompts,\nthereby enabling it to identify unknown samples. Furthermore, we propose two\nthreshold-free inference algorithms to perform OOD detection by utilizing\nnegation semantics from \"no\" prompts and the text encoder. Experimental results\non 9 benchmark datasets (3 ID datasets and 6 OOD datasets) for the OOD\ndetection task demonstrate that CLIPN, based on ViT-B-16, outperforms 7\nwell-used algorithms by at least 2.34\\% and 11.64\\% in terms of AUROC and FPR95\nfor zero-shot OOD detection on ImageNet-1K. Our CLIPN can serve as a solid\nfoundation for effectively leveraging CLIP in downstream OOD tasks. 
The code is\navailable at https://github.com/xmed-lab/CLIPN.\n","authors":["Hualiang Wang","Yi Li","Huifeng Yao","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.12213v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2206.00309v2","updated":"2023-08-23T15:51:28Z","published":"2022-06-01T08:22:34Z","title":"Label-Efficient Online Continual Object Detection in Streaming Video","summary":" Humans can watch a continuous video stream and effortlessly perform continual\nacquisition and transfer of new knowledge with minimal supervision yet\nretaining previously learnt experiences. In contrast, existing continual\nlearning (CL) methods require fully annotated labels to effectively learn from\nindividual frames in a video stream. Here, we examine a more realistic and\nchallenging problem$\\unicode{x2014}$Label-Efficient Online Continual Object\nDetection (LEOCOD) in streaming video. We propose a plug-and-play module,\nEfficient-CLS, that can be easily inserted into and improve existing continual\nlearners for object detection in video streams with reduced data annotation\ncosts and model retraining time. We show that our method has achieved\nsignificant improvement with minimal forgetting across all supervision levels\non two challenging CL benchmarks for streaming real-world videos. Remarkably,\nwith only 25% annotated video frames, our method still outperforms the base CL\nlearners, which are trained with 100% annotations on all video frames. The data\nand source code will be publicly available at\nhttps://github.com/showlab/Efficient-CLS.\n","authors":["Jay Zhangjie Wu","David Junhao Zhang","Wynne Hsu","Mengmi Zhang","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2206.00309v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12199v1","updated":"2023-08-23T15:38:26Z","published":"2023-08-23T15:38:26Z","title":"Towards Real-Time Analysis of Broadcast Badminton Videos","summary":" Analysis of player movements is a crucial subset of sports analysis. Existing\nplayer movement analysis methods use recorded videos after the match is over.\nIn this work, we propose an end-to-end framework for player movement analysis\nfor badminton matches on live broadcast match videos. We only use the visual\ninputs from the match and, unlike other approaches which use multi-modal sensor\ndata, our approach uses only visual cues. We propose a method to calculate the\non-court distance covered by both the players from the video feed of a live\nbroadcast badminton match. To perform this analysis, we focus on the gameplay\nby removing replays and other redundant parts of the broadcast match. We then\nperform player tracking to identify and track the movements of both players in\neach frame. Finally, we calculate the distance covered by each player and the\naverage speed with which they move on the court. We further show a heatmap of\nthe areas covered by the player on the court which is useful for analyzing the\ngameplay of the player. Our proposed framework was successfully used to analyze\nlive broadcast matches in real-time during the Premier Badminton League 2019\n(PBL 2019), with commentators and broadcasters appreciating the utility.\n","authors":["Nitin Nilesh","Tushar Sharma","Anurag Ghosh","C. V. 
Jawahar"],"pdf_url":"https://arxiv.org/pdf/2308.12199v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12191v1","updated":"2023-08-23T15:27:50Z","published":"2023-08-23T15:27:50Z","title":"Sign Language Translation with Iterative Prototype","summary":" This paper presents IP-SLT, a simple yet effective framework for sign\nlanguage translation (SLT). Our IP-SLT adopts a recurrent structure and\nenhances the semantic representation (prototype) of the input sign language\nvideo via an iterative refinement manner. Our idea mimics the behavior of human\nreading, where a sentence can be digested repeatedly, till reaching accurate\nunderstanding. Technically, IP-SLT consists of feature extraction, prototype\ninitialization, and iterative prototype refinement. The initialization module\ngenerates the initial prototype based on the visual feature extracted by the\nfeature extraction module. Then, the iterative refinement module leverages the\ncross-attention mechanism to polish the previous prototype by aggregating it\nwith the original video feature. Through repeated refinement, the prototype\nfinally converges to a more stable and accurate state, leading to a fluent and\nappropriate translation. In addition, to leverage the sequential dependence of\nprototypes, we further propose an iterative distillation loss to compress the\nknowledge of the final iteration into previous ones. As the autoregressive\ndecoding process is executed only once in inference, our IP-SLT is ready to\nimprove various SLT systems with acceptable overhead. Extensive experiments are\nconducted on public benchmarks to demonstrate the effectiveness of the IP-SLT.\n","authors":["Huijie Yao","Wengang Zhou","Hao Feng","Hezhen Hu","Hao Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2308.12191v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2303.10891v2","updated":"2023-08-23T15:16:42Z","published":"2023-03-20T06:16:22Z","title":"Non-Exemplar Online Class-incremental Continual Learning via\n Dual-prototype Self-augment and Refinement","summary":" This paper investigates a new, practical, but challenging problem named\nNon-exemplar Online Class-incremental continual Learning (NO-CL), which aims to\npreserve the discernibility of base classes without buffering data examples and\nefficiently learn novel classes continuously in a single-pass (i.e., online)\ndata stream. The challenges of this task are mainly two-fold: (1) Both base and\nnovel classes suffer from severe catastrophic forgetting as no previous samples\nare available for replay. (2) As the online data can only be observed once,\nthere is no way to fully re-train the whole model, e.g., re-calibrate the\ndecision boundaries via prototype alignment or feature distillation. In this\npaper, we propose a novel Dual-prototype Self-augment and Refinement method\n(DSR) for NO-CL problem, which consists of two strategies: 1) Dual class\nprototypes: vanilla and high-dimensional prototypes are exploited to utilize\nthe pre-trained information and obtain robust quasi-orthogonal representations\nrather than example buffers for both privacy preservation and memory reduction.\n2) Self-augment and refinement: Instead of updating the whole network, we\noptimize high-dimensional prototypes alternatively with the extra projection\nmodule based on self-augment vanilla prototypes, through a bi-level\noptimization problem. 
Extensive experiments demonstrate the effectiveness and\nsuperiority of the proposed DSR in NO-CL.\n","authors":["Fushuo Huo","Wenchao Xu","Jingcai Guo","Haozhao Wang","Yunfeng Fan","Song Guo"],"pdf_url":"https://arxiv.org/pdf/2303.10891v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10410v2","updated":"2023-08-23T15:15:59Z","published":"2023-04-20T15:48:50Z","title":"Radar-Camera Fusion for Object Detection and Semantic Segmentation in\n Autonomous Driving: A Comprehensive Review","summary":" Driven by deep learning techniques, perception technology in autonomous\ndriving has developed rapidly in recent years, enabling vehicles to accurately\ndetect and interpret surrounding environment for safe and efficient navigation.\nTo achieve accurate and robust perception capabilities, autonomous vehicles are\noften equipped with multiple sensors, making sensor fusion a crucial part of\nthe perception system. Among these fused sensors, radars and cameras enable a\ncomplementary and cost-effective perception of the surrounding environment\nregardless of lighting and weather conditions. This review aims to provide a\ncomprehensive guideline for radar-camera fusion, particularly concentrating on\nperception tasks related to object detection and semantic segmentation.Based on\nthe principles of the radar and camera sensors, we delve into the data\nprocessing process and representations, followed by an in-depth analysis and\nsummary of radar-camera fusion datasets. In the review of methodologies in\nradar-camera fusion, we address interrogative questions, including \"why to\nfuse\", \"what to fuse\", \"where to fuse\", \"when to fuse\", and \"how to fuse\",\nsubsequently discussing various challenges and potential research directions\nwithin this domain. To ease the retrieval and comparison of datasets and fusion\nmethods, we also provide an interactive website:\nhttps://radar-camera-fusion.github.io.\n","authors":["Shanliang Yao","Runwei Guan","Xiaoyu Huang","Zhuoxiao Li","Xiangyu Sha","Yong Yue","Eng Gee Lim","Hyungjoon Seo","Ka Lok Man","Xiaohui Zhu","Yutao Yue"],"pdf_url":"https://arxiv.org/pdf/2304.10410v2.pdf","comment":"Accepted by IEEE Transactions on Intelligent Vehicles (T-IV)"},{"id":"http://arxiv.org/abs/2305.07881v3","updated":"2023-08-23T14:53:59Z","published":"2023-05-13T10:00:24Z","title":"Black-box Source-free Domain Adaptation via Two-stage Knowledge\n Distillation","summary":" Source-free domain adaptation aims to adapt deep neural networks using only\npre-trained source models and target data. However, accessing the source model\nstill has a potential concern about leaking the source data, which reveals the\npatient's privacy. In this paper, we study the challenging but practical\nproblem: black-box source-free domain adaptation where only the outputs of the\nsource model and target data are available. We propose a simple but effective\ntwo-stage knowledge distillation method. In Stage\n\\uppercase\\expandafter{\\romannumeral1}, we train the target model from scratch\nwith soft pseudo-labels generated by the source model in a knowledge\ndistillation manner. In Stage \\uppercase\\expandafter{\\romannumeral2}, we\ninitialize another model as the new student model to avoid the error\naccumulation caused by noisy pseudo-labels. 
We feed the images with weak\naugmentation to the teacher model to guide the learning of the student model.\nOur method is simple and flexible, and achieves surprising results on three\ncross-domain segmentation tasks.\n","authors":["Shuai Wang","Daoan Zhang","Zipei Yan","Shitong Shao","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2305.07881v3.pdf","comment":"The short version is accepted by IJCAI 1st International Workshop on\n Generalizing from Limited Resources in the Open World. (This version is long\n version)"},{"id":"http://arxiv.org/abs/2209.11355v3","updated":"2023-08-23T14:51:47Z","published":"2022-09-23T00:35:22Z","title":"Learning Interpretable Dynamics from Images of a Freely Rotating 3D\n Rigid Body","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies, such as satellites, may be available when low-dimensional measurements\nare not. However, the high-dimensionality of image data precludes the use of\nclassical estimation techniques to learn the dynamics and a lack of\ninterpretability reduces the usefulness of standard deep learning methods. In\nthis work, we present a physics-informed neural network model to estimate and\npredict 3D rotational dynamics from image sequences. We achieve this using a\nmulti-stage prediction pipeline that maps individual images to a latent\nrepresentation homeomorphic to $\\mathbf{SO}(3)$, computes angular velocities\nfrom latent pairs, and predicts future latent states using the Hamiltonian\nequations of motion with a learned representation of the Hamiltonian. We\ndemonstrate the efficacy of our approach on a new rotating rigid-body dataset\nwith sequences of rotating cubes and rectangular prisms with uniform and\nnon-uniform density.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Leonard"],"pdf_url":"https://arxiv.org/pdf/2209.11355v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.12168v1","updated":"2023-08-23T14:35:03Z","published":"2023-08-23T14:35:03Z","title":"Tumor-Centered Patching for Enhanced Medical Image Segmentation","summary":" The realm of medical image diagnosis has advanced significantly with the\nintegration of computer-aided diagnosis and surgical systems. However,\nchallenges persist, particularly in achieving precise image segmentation. While\ndeep learning techniques show potential, obstacles like limited resources, slow\nconvergence, and class imbalance impede their effectiveness. Traditional\npatch-based methods, though common, struggle to capture intricate tumor\nboundaries and often lead to redundant samples, compromising computational\nefficiency and feature quality. To tackle these issues, this research\nintroduces an innovative approach centered on the tumor itself for patch-based\nimage analysis. This novel tumor-centered patching method aims to address the\nclass imbalance and boundary deficiencies, enabling focused and accurate tumor\nsegmentation. By aligning patches with the tumor's anatomical context, this\ntechnique enhances feature extraction accuracy and reduces computational load.\nExperimental results demonstrate improved class imbalance, with segmentation\nscores of 0.78, 0.76, and 0.71 for whole, core, and enhancing tumors,\nrespectively using a lightweight simple U-Net. 
This approach shows potential\nfor enhancing medical image segmentation and improving computer-aided diagnosis\nsystems.\n","authors":["Mutyyba Asghar","Ahmad Raza Shahid","Akhtar Jamil","Kiran Aftab","Syed Ather Enam"],"pdf_url":"https://arxiv.org/pdf/2308.12168v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.12163v1","updated":"2023-08-23T14:25:22Z","published":"2023-08-23T14:25:22Z","title":"NPF-200: A Multi-Modal Eye Fixation Dataset and Method for\n Non-Photorealistic Videos","summary":" Non-photorealistic videos are in demand with the wave of the metaverse, but\nlack sufficient research studies. This work aims to take a step forward to\nunderstand how humans perceive non-photorealistic videos with eye fixation\n(\\ie, saliency detection), which is critical for enhancing media production,\nartistic design, and game user experience. To fill in the gap of missing a\nsuitable dataset for this research line, we present NPF-200, the first\nlarge-scale multi-modal dataset of purely non-photorealistic videos with eye\nfixations. Our dataset has three characteristics: 1) it contains soundtracks\nthat are essential according to vision and psychological studies; 2) it\nincludes diverse semantic content and videos are of high-quality; 3) it has\nrich motions across and within videos. We conduct a series of analyses to gain\ndeeper insights into this task and compare several state-of-the-art methods to\nexplore the gap between natural images and non-photorealistic data.\nAdditionally, as the human attention system tends to extract visual and audio\nfeatures with different frequencies, we propose a universal frequency-aware\nmulti-modal non-photorealistic saliency detection model called NPSNet,\ndemonstrating the state-of-the-art performance of our task. The results uncover\nstrengths and weaknesses of multi-modal network design and multi-domain\ntraining, opening up promising directions for future works. Our dataset and\ncode can be found at \\url{https://github.com/Yangziyu/NPF200}.\n","authors":["Ziyu Yang","Sucheng Ren","Zongwei Wu","Nanxuan Zhao","Junle Wang","Jing Qin","Shengfeng He"],"pdf_url":"https://arxiv.org/pdf/2308.12163v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2303.11702v4","updated":"2023-08-23T14:18:30Z","published":"2023-03-21T09:42:27Z","title":"On the link between generative semi-supervised learning and generative\n open-set recognition","summary":" This study investigates the relationship between semi-supervised learning\n(SSL, which is training off partially labelled datasets) and open-set\nrecognition (OSR, which is classification with simultaneous novelty detection)\nunder the context of generative adversarial networks (GANs). Although no\nprevious study has formally linked SSL and OSR, their respective methods share\nstriking similarities. Specifically, SSL-GANs and OSR-GANs require their\ngenerators to produce 'bad-looking' samples which are used to regularise their\nclassifier networks. We hypothesise that the definitions of bad-looking samples\nin SSL and OSR represent the same concept and realise the same goal. More\nformally, bad-looking samples lie in the complementary space, which is the area\nbetween and around the boundaries of the labelled categories within the\nclassifier's embedding space. By regularising a classifier with samples in the\ncomplementary space, classifiers achieve improved generalisation for SSL and\nalso generalise the open space for OSR. 
To test this hypothesis, we compare a\nfoundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR\nexperimental conditions. Our results find that SSL-GANs achieve near identical\nresults to OSR-GANs, proving the SSL-OSR link. Subsequently, to further this\nnew research path, we compare several SSL-GANs under various SSL-OSR setups,\nproviding the first benchmark results. A combined framework of SSL-OSR certainly\nimproves the practicality and cost-efficiency of classifier training, and so\nfurther theoretical and application studies are also discussed.\n","authors":["Emile Reyn Engelbrecht","Johan du Preez"],"pdf_url":"https://arxiv.org/pdf/2303.11702v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12156v1","updated":"2023-08-23T14:17:44Z","published":"2023-08-23T14:17:44Z","title":"Multimodal Latent Emotion Recognition from Micro-expression and\n Physiological Signals","summary":" This paper discusses the benefits of incorporating multimodal data for\nimproving latent emotion recognition accuracy, focusing on micro-expression\n(ME) and physiological signals (PS). The proposed approach presents a novel\nmultimodal learning framework that combines ME and PS, including a 1D separable\nand mixable depthwise inception network, a standardised normal distribution\nweighted feature fusion method, and depth/physiology guided attention modules\nfor multimodal learning. Experimental results show that the proposed approach\noutperforms the benchmark method, with the weighted fusion method and guided\nattention modules both contributing to enhanced performance.\n","authors":["Liangfei Zhang","Yifei Qian","Ognjen Arandjelovic","Anthony Zhu"],"pdf_url":"https://arxiv.org/pdf/2308.12156v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12143v1","updated":"2023-08-23T14:00:58Z","published":"2023-08-23T14:00:58Z","title":"A Probabilistic Fluctuation based Membership Inference Attack for\n Generative Models","summary":" Membership Inference Attack (MIA) identifies whether a record exists in a\nmachine learning model's training set by querying the model. MIAs on the\nclassic classification models have been well-studied, and recent works have\nstarted to explore how to transplant MIA onto generative models. Our\ninvestigation indicates that existing MIAs designed for generative models\nmainly depend on the overfitting in target models. However, overfitting can be\navoided by employing various regularization techniques, whereas existing MIAs\ndemonstrate poor performance in practice. Unlike overfitting, memorization is\nessential for deep learning models to attain optimal performance, making it a\nmore prevalent phenomenon. Memorization in generative models leads to an\nincreasing trend in the probability distribution of generating records around\nthe member record. Therefore, we propose a Probabilistic Fluctuation Assessing\nMembership Inference Attack (PFAMI), a black-box MIA that infers memberships by\ndetecting these trends via analyzing the overall probabilistic fluctuations\naround given records. 
We conduct extensive experiments across multiple\ngenerative models and datasets, which demonstrate PFAMI can improve the attack\nsuccess rate (ASR) by about 27.9% when compared with the best baseline.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.12143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10718v2","updated":"2023-08-23T13:56:52Z","published":"2023-08-21T13:39:04Z","title":"Backdooring Textual Inversion for Concept Censorship","summary":" Recent years have witnessed success in AIGC (AI Generated Content). People\ncan make use of a pre-trained diffusion model to generate images of high\nquality or freely modify existing pictures with only prompts in natural\nlanguage. More excitingly, the emerging personalization techniques make it\nfeasible to create specific-desired images with only a few images as\nreferences. However, this induces severe threats if such advanced techniques\nare misused by malicious users, such as spreading fake news or defaming\nindividual reputations. Thus, it is necessary to regulate personalization\nmodels (i.e., concept censorship) for their development and advancement.\n In this paper, we focus on the personalization technique dubbed Textual\nInversion (TI), which is becoming prevalent for its lightweight nature and\nexcellent performance. TI crafts the word embedding that contains detailed\ninformation about a specific object. Users can easily download the word\nembedding from public websites like Civitai and add it to their own stable\ndiffusion model without fine-tuning for personalization. To achieve the concept\ncensorship of a TI model, we propose leveraging the backdoor technique for good\nby injecting backdoors into the Textual Inversion embeddings. Briefly, we\nselect some sensitive words as triggers during the training of TI, which will\nbe censored for normal use. In the subsequent generation stage, if the triggers\nare combined with personalized embeddings as final prompts, the model will\noutput a pre-defined target image rather than images including the desired\nmalicious concept.\n To demonstrate the effectiveness of our approach, we conduct extensive\nexperiments on Stable Diffusion, a prevailing open-sourced text-to-image model.\nOur code, data, and results are available at\nhttps://concept-censorship.github.io.\n","authors":["Yutong Wu","Jie Zhang","Florian Kerschbaum","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.10718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12139v1","updated":"2023-08-23T13:54:15Z","published":"2023-08-23T13:54:15Z","title":"Mesh Conflation of Oblique Photogrammetric Models using Virtual Cameras\n and Truncated Signed Distance Field","summary":" Conflating/stitching 2.5D raster digital surface models (DSM) into a large\none has been a running practice in geoscience applications; however, conflating\nfull-3D mesh models, such as those from oblique photogrammetry, is extremely\nchallenging. In this letter, we propose a novel approach to address this\nchallenge by conflating multiple full-3D oblique photogrammetric models into a\nsingle and seamless mesh for high-resolution site modeling. 
Given two or more\nindividually collected and created photogrammetric meshes, we first propose to\ncreate a virtual camera field (with a panoramic field of view) to incubate\nvirtual spaces represented by Truncated Signed Distance Field (TSDF), an\nimplicit volumetric field friendly for linear 3D fusion; then we adaptively\nleverage the truncated bound of meshes in TSDF to conflate them into a single\nand accurate full 3D site model. With drone-based 3D meshes, we show that our\napproach significantly improves upon traditional methods for model conflations,\nto drive new potentials to create excessively large and accurate full 3D mesh\nmodels in support of geoscience and environmental applications.\n","authors":["Shuang Song","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2308.12139v1.pdf","comment":"5 Figures"},{"id":"http://arxiv.org/abs/2308.12138v1","updated":"2023-08-23T13:51:54Z","published":"2023-08-23T13:51:54Z","title":"Select-and-Combine (SAC): A Novel Multi-Stereo Depth Fusion Algorithm\n for Point Cloud Generation via Efficient Local Markov Netlets","summary":" Many practical systems for image-based surface reconstruction employ a\nstereo/multi-stereo paradigm, due to its ability to scale for large scenes and\nits ease of implementation for out-of-core operations. In this process,\nmultiple and abundant depth maps from stereo matching must be combined and\nfused into a single, consistent, and clean point cloud. However, the noises and\noutliers caused by stereo matching and the heterogenous geometric errors of the\nposes present a challenge for existing fusion algorithms, since they mostly\nassume Gaussian errors and predict fused results based on data from local\nspatial neighborhoods, which may inherit uncertainties from multiple depths\nresulting in lowered accuracy. In this paper, we propose a novel depth fusion\nparadigm, that instead of numerically fusing points from multiple depth maps,\nselects the best depth map per point, and combines them into a single and clean\npoint cloud. This paradigm, called select-and-combine (SAC), is achieved\nthrough modeling the point level fusion using local Markov Netlets, a\nmicro-network over point across neighboring views for depth/view selection,\nfollowed by a Netlets collapse process for point combination. The Markov\nNetlets are optimized such that they can inherently leverage spatial\nconsistencies among depth maps of neighboring views, thus they can address\nerrors beyond Gaussian ones. Our experiment results show that our approach\noutperforms existing depth fusion approaches by increasing the F1 score that\nconsiders both accuracy and completeness by 2.07% compared to the best existing\nmethod. Finally, our approach generates clearer point clouds that are 18% less\nredundant while with a higher accuracy before fusion\n","authors":["Mostafa Elhashash","Rongjun Qin"],"pdf_url":"https://arxiv.org/pdf/2308.12138v1.pdf","comment":"6 Figures"},{"id":"http://arxiv.org/abs/2308.12133v1","updated":"2023-08-23T13:43:42Z","published":"2023-08-23T13:43:42Z","title":"Lite-HRNet Plus: Fast and Accurate Facial Landmark Detection","summary":" Facial landmark detection is an essential technology for driver status\ntracking and has been in demand for real-time estimations. As a landmark\ncoordinate prediction, heatmap-based methods are known to achieve a high\naccuracy, and Lite-HRNet can achieve a fast estimation. 
However, with\nLite-HRNet, the problem of a heavy computational cost of the fusion block,\nwhich connects feature maps with different resolutions, has yet to be solved.\nIn addition, the strong output module used in HRNetV2 is not applied to\nLite-HRNet. Given these problems, we propose a novel architecture called\nLite-HRNet Plus. Lite-HRNet Plus achieves two improvements: a novel fusion\nblock based on a channel attention and a novel output module with less\ncomputational intensity using multi-resolution feature maps. Through\nexperiments conducted on two facial landmark datasets, we confirmed that\nLite-HRNet Plus further improved the accuracy in comparison with conventional\nmethods, and achieved a state-of-the-art accuracy with a computational\ncomplexity with the range of 10M FLOPs.\n","authors":["Sota Kato","Kazuhiro Hotta","Yuhki Hatakeyama","Yoshinori Konishi"],"pdf_url":"https://arxiv.org/pdf/2308.12133v1.pdf","comment":"Accepted at ICIP2023"},{"id":"http://arxiv.org/abs/2308.12127v1","updated":"2023-08-23T13:33:39Z","published":"2023-08-23T13:33:39Z","title":"Masking Strategies for Background Bias Removal in Computer Vision Models","summary":" Models for fine-grained image classification tasks, where the difference\nbetween some classes can be extremely subtle and the number of samples per\nclass tends to be low, are particularly prone to picking up background-related\nbiases and demand robust methods to handle potential examples with\nout-of-distribution (OOD) backgrounds. To gain deeper insights into this\ncritical problem, our research investigates the impact of background-induced\nbias on fine-grained image classification, evaluating standard backbone models\nsuch as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We\nexplore two masking strategies to mitigate background-induced bias: Early\nmasking, which removes background information at the (input) image level, and\nlate masking, which selectively masks high-level spatial features corresponding\nto the background. Extensive experiments assess the behavior of CNN and ViT\nmodels under different masking strategies, with a focus on their generalization\nto OOD backgrounds. The obtained findings demonstrate that both proposed\nstrategies enhance OOD performance compared to the baseline models, with early\nmasking consistently exhibiting the best OOD performance. Notably, a ViT\nvariant employing GAP-Pooled Patch token-based classification combined with\nearly masking achieves the highest OOD robustness.\n","authors":["Ananthu Aniraj","Cassio F. Dantas","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2308.12127v1.pdf","comment":"Accepted at the 2023 IEEE/CVF International Conference on Computer\n Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2211.13579v2","updated":"2023-08-23T13:20:44Z","published":"2022-11-24T13:08:43Z","title":"Knowledge-Aware Federated Active Learning with Non-IID Data","summary":" Federated learning enables multiple decentralized clients to learn\ncollaboratively without sharing the local training data. However, the expensive\nannotation cost to acquire data labels on local clients remains an obstacle in\nutilizing local data. In this paper, we propose a federated active learning\nparadigm to efficiently learn a global model with limited annotation budget\nwhile protecting data privacy in a decentralized learning way. 
The main\nchallenge faced by federated active learning is the mismatch between the active\nsampling goal of the global model on the server and that of the asynchronous\nlocal clients. This becomes even more significant when data is distributed\nnon-IID across local clients. To address the aforementioned challenge, we\npropose Knowledge-Aware Federated Active Learning (KAFAL), which consists of\nKnowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory\nFederated Update (KCFU). KSAS is a novel active sampling method tailored for\nthe federated active learning problem. It deals with the mismatch challenge by\nsampling actively based on the discrepancies between local and global models.\nKSAS intensifies specialized knowledge in local clients, ensuring the sampled\ndata to be informative for both the local clients and the global model. KCFU,\nin the meantime, deals with the client heterogeneity caused by limited data and\nnon-IID data distributions. It compensates for each client's ability in weak\nclasses by the assistance of the global model. Extensive experiments and\nanalyses are conducted to show the superiority of KSAS over the\nstate-of-the-art active learning methods and the efficiency of KCFU under the\nfederated active learning framework.\n","authors":["Yu-Tong Cao","Ye Shi","Baosheng Yu","Jingya Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.13579v2.pdf","comment":"14 pages, 12 figures, ICCV23"},{"id":"http://arxiv.org/abs/2304.04521v3","updated":"2023-08-23T13:11:20Z","published":"2023-04-10T11:35:42Z","title":"Zero-Shot In-Distribution Detection in Multi-Object Settings Using\n Vision-Language Foundation Models","summary":" Extracting in-distribution (ID) images from noisy images scraped from the\nInternet is an important preprocessing for constructing datasets, which has\ntraditionally been done manually. Automating this preprocessing with deep\nlearning techniques presents two key challenges. First, images should be\ncollected using only the name of the ID class without training on the ID data.\nSecond, as we can see why COCO was created, it is crucial to identify images\ncontaining not only ID objects but also both ID and out-of-distribution (OOD)\nobjects as ID images to create robust recognizers. In this paper, we propose a\nnovel problem setting called zero-shot in-distribution (ID) detection, where we\nidentify images containing ID objects as ID images (even if they contain OOD\nobjects), and images lacking ID objects as OOD images without any training. To\nsolve this problem, we leverage the powerful zero-shot capability of CLIP and\npresent a simple and effective approach, Global-Local Maximum Concept Matching\n(GL-MCM), based on both global and local visual-text alignments of CLIP\nfeatures. Extensive experiments demonstrate that GL-MCM outperforms comparison\nmethods on both multi-object datasets and single-object ImageNet benchmarks.\nThe code will be available via https://github.com/AtsuMiyai/GL-MCM.\n","authors":["Atsuyuki Miyai","Qing Yu","Go Irie","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2304.04521v3.pdf","comment":"v3: I fixed some typos from v2"},{"id":"http://arxiv.org/abs/2308.12116v1","updated":"2023-08-23T13:10:33Z","published":"2023-08-23T13:10:33Z","title":"The TYC Dataset for Understanding Instance-Level Semantics and Motions\n of Cells in Microstructures","summary":" Segmenting cells and tracking their motion over time is a common task in\nbiomedical applications. 
However, predicting accurate instance-wise\nsegmentation and cell motions from microscopy imagery remains a challenging\ntask. Using microstructured environments for analyzing single cells in a\nconstant flow of media adds additional complexity. While large-scale labeled\nmicroscopy datasets are available, we are not aware of any large-scale dataset,\nincluding both cells and microstructures. In this paper, we introduce the\ntrapped yeast cell (TYC) dataset, a novel dataset for understanding\ninstance-level semantics and motions of cells in microstructures. We release\n$105$ dense annotated high-resolution brightfield microscopy images, including\nabout $19$k instance masks. We also release $261$ curated video clips composed\nof $1293$ high-resolution microscopy images to facilitate unsupervised\nunderstanding of cell motions and morphology. TYC offers ten times more\ninstance annotations than the previously largest dataset, including cells and\nmicrostructures. Our effort also exceeds previous attempts in terms of\nmicrostructure variability, resolution, complexity, and capturing device\n(microscopy) variability. We facilitate a unified comparison on our novel\ndataset by introducing a standardized evaluation strategy. TYC and evaluation\ncode are publicly available under CC BY 4.0 license.\n","authors":["Christoph Reich","Tim Prangemeier","Heinz Koeppl"],"pdf_url":"https://arxiv.org/pdf/2308.12116v1.pdf","comment":"Accepted at ICCV 2023 Workshop on BioImage Computing. Project page\n (with links to the dataset and code):\n https://christophreich1996.github.io/tyc_dataset/"},{"id":"http://arxiv.org/abs/2308.12114v1","updated":"2023-08-23T13:09:03Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Group sparsity in Machine Learning (ML) encourages simpler, more\ninterpretable models with fewer active parameter groups. This work aims to\nincorporate structured group sparsity into the shared parameters of a\nMulti-Task Learning (MTL) framework, to develop parsimonious models that can\neffectively address multiple tasks with fewer parameters while maintaining\ncomparable or superior performance to a dense model. Sparsifying the model\nduring training helps decrease the model's memory footprint, computation\nrequirements, and prediction time during inference. We use channel-wise l1/l2\ngroup sparsity in the shared layers of the Convolutional Neural Network (CNN).\nThis approach not only facilitates the elimination of extraneous groups\n(channels) but also imposes a penalty on the weights, thereby enhancing the\nlearning of all tasks. We compare the outcomes of single-task and multi-task\nexperiments under group sparsity on two publicly available MTL datasets, NYU-v2\nand CelebAMask-HQ. We also investigate how changing the sparsification degree\nimpacts both the performance of the model and the sparsity of groups.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.12113v1","updated":"2023-08-23T13:06:59Z","published":"2023-08-23T13:06:59Z","title":"Advancements in Point Cloud Data Augmentation for Deep Learning: A\n Survey","summary":" Point cloud has a wide range of applications in areas such as autonomous\ndriving, mapping, navigation, scene reconstruction, and medical imaging. 
Due to\nits great potentials in these applications, point cloud processing has gained\ngreat attention in the field of computer vision. Among various point cloud\nprocessing techniques, deep learning (DL) has become one of the mainstream and\neffective methods for tasks such as detection, segmentation and classification.\nTo reduce overfitting during training DL models and improve model performance\nespecially when the amount and/or diversity of training data are limited,\naugmentation is often crucial. Although various point cloud data augmentation\nmethods have been widely used in different point cloud processing tasks, there\nare currently no published systematic surveys or reviews of these methods.\nTherefore, this article surveys and discusses these methods and categorizes\nthem into a taxonomy framework. Through the comprehensive evaluation and\ncomparison of the augmentation methods, this article identifies their\npotentials and limitations and suggests possible future research directions.\nThis work helps researchers gain a holistic understanding of the current status\nof point cloud data augmentation and promotes its wider application and\ndevelopment.\n","authors":["Qinfeng Zhu","Lei Fan","Ningxin Weng"],"pdf_url":"https://arxiv.org/pdf/2308.12113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12112v1","updated":"2023-08-23T13:02:52Z","published":"2023-08-23T13:02:52Z","title":"Generalized Continual Category Discovery","summary":" Most of Continual Learning (CL) methods push the limit of supervised learning\nsettings, where an agent is expected to learn new labeled tasks and not forget\nprevious knowledge. However, these settings are not well aligned with real-life\nscenarios, where a learning agent has access to a vast amount of unlabeled data\nencompassing both novel (entirely unlabeled) classes and examples from known\nclasses. Drawing inspiration from Generalized Category Discovery (GCD), we\nintroduce a novel framework that relaxes this assumption. Precisely, in any\ntask, we allow for the existence of novel and known classes, and one must use\ncontinual version of unsupervised learning methods to discover them. We call\nthis setting Generalized Continual Category Discovery (GCCD). It unifies CL and\nGCD, bridging the gap between synthetic benchmarks and real-life scenarios.\nWith a series of experiments, we present that existing methods fail to\naccumulate knowledge from subsequent tasks in which unlabeled samples of novel\nclasses are present. In light of these limitations, we propose a method that\nincorporates both supervised and unsupervised signals and mitigates the\nforgetting through the use of centroid adaptation. Our method surpasses strong\nCL methods adopted for GCD techniques and presents a superior representation\nlearning performance.\n","authors":["Daniel Marczak","Grzegorz Rypeść","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2308.12112v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12111v1","updated":"2023-08-23T12:58:51Z","published":"2023-08-23T12:58:51Z","title":"Cross-Modality Proposal-guided Feature Mining for Unregistered\n RGB-Thermal Pedestrian Detection","summary":" RGB-Thermal (RGB-T) pedestrian detection aims to locate the pedestrians in\nRGB-T image pairs to exploit the complementation between the two modalities for\nimproving detection robustness in extreme conditions. 
Most existing algorithms\nassume that the RGB-T image pairs are well registered, while in the real world\nthey are not aligned ideally due to parallax or different field-of-view of the\ncameras. The pedestrians in misaligned image pairs may locate at different\npositions in two images, which results in two challenges: 1) how to achieve\ninter-modality complementation using spatially misaligned RGB-T pedestrian\npatches, and 2) how to recognize the unpaired pedestrians at the boundary. To\ndeal with these issues, we propose a new paradigm for unregistered RGB-T\npedestrian detection, which predicts two separate pedestrian locations in the\nRGB and thermal images, respectively. Specifically, we propose a cross-modality\nproposal-guided feature mining (CPFM) mechanism to extract the two precise\nfusion features for representing the pedestrian in the two modalities, even if\nthe RGB-T image pair is unaligned. It enables us to effectively exploit the\ncomplementation between the two modalities. With the CPFM mechanism, we build a\ntwo-stream dense detector; it predicts the two pedestrian locations in the two\nmodalities based on the corresponding fusion feature mined by the CPFM\nmechanism. Besides, we design a data augmentation method, named Homography, to\nsimulate the discrepancy in scales and views between images. We also\ninvestigate two non-maximum suppression (NMS) methods for post-processing.\nFavorable experimental results demonstrate the effectiveness and robustness of\nour method in dealing with unregistered pedestrians with different shifts.\n","authors":["Chao Tian","Zikun Zhou","Yuqing Huang","Gaojun Li","Zhenyu He"],"pdf_url":"https://arxiv.org/pdf/2308.12111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02051v2","updated":"2023-08-23T12:45:27Z","published":"2023-04-04T18:03:04Z","title":"Multimodal Garment Designer: Human-Centric Latent Diffusion Models for\n Fashion Image Editing","summary":" Fashion illustration is used by designers to communicate their vision and to\nbring the design idea from conceptualization to realization, showing how\nclothes interact with the human body. In this context, computer vision can thus\nbe used to improve the fashion design process. Differently from previous works\nthat mainly focused on the virtual try-on of garments, we propose the task of\nmultimodal-conditioned fashion image editing, guiding the generation of\nhuman-centric fashion images by following multimodal prompts, such as text,\nhuman body poses, and garment sketches. We tackle this problem by proposing a\nnew architecture based on latent diffusion models, an approach that has not\nbeen used before in the fashion domain. Given the lack of existing datasets\nsuitable for the task, we also extend two existing fashion datasets, namely\nDress Code and VITON-HD, with multimodal annotations collected in a\nsemi-automatic manner. Experimental results on these new datasets demonstrate\nthe effectiveness of our proposal, both in terms of realism and coherence with\nthe given multimodal inputs. 
Source code and collected multimodal annotations\nare publicly available at:\nhttps://github.com/aimagelab/multimodal-garment-designer.\n","authors":["Alberto Baldrati","Davide Morelli","Giuseppe Cartella","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2304.02051v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12084v1","updated":"2023-08-23T12:07:39Z","published":"2023-08-23T12:07:39Z","title":"DISGAN: Wavelet-informed Discriminator Guides GAN to MRI\n Super-resolution with Noise Cleaning","summary":" MRI super-resolution (SR) and denoising tasks are fundamental challenges in\nthe field of deep learning, which have traditionally been treated as distinct\ntasks with separate paired training data. In this paper, we propose an\ninnovative method that addresses both tasks simultaneously using a single deep\nlearning model, eliminating the need for explicitly paired noisy and clean\nimages during training. Our proposed model is primarily trained for SR, but\nalso exhibits remarkable noise-cleaning capabilities in the super-resolved\nimages. Instead of conventional approaches that introduce frequency-related\noperations into the generative process, our novel approach involves the use of\na GAN model guided by a frequency-informed discriminator. To achieve this, we\nharness the power of the 3D Discrete Wavelet Transform (DWT) operation as a\nfrequency constraint within the GAN framework for the SR task on magnetic\nresonance imaging (MRI) data. Specifically, our contributions include: 1) a 3D\ngenerator based on residual-in-residual connected blocks; 2) the integration of\nthe 3D DWT with $1\\times 1$ convolution into a DWT+conv unit within a 3D Unet\nfor the discriminator; 3) the use of the trained model for high-quality image\nSR, accompanied by an intrinsic denoising process. We dub the model \"Denoising\nInduced Super-resolution GAN (DISGAN)\" due to its dual effects of SR image\ngeneration and simultaneous denoising. Departing from the traditional approach\nof training SR and denoising tasks as separate models, our proposed DISGAN is\ntrained only on the SR task, but also achieves exceptional performance in\ndenoising. The model is trained on 3D MRI data from dozens of subjects from the\nHuman Connectome Project (HCP) and further evaluated on previously unseen MRI\ndata from subjects with brain tumours and epilepsy to assess its denoising and\nSR performance.\n","authors":["Qi Wang","Lucas Mahler","Julius Steiglechner","Florian Birk","Klaus Scheffler","Gabriele Lohmann"],"pdf_url":"https://arxiv.org/pdf/2308.12084v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2306.14538v3","updated":"2023-08-23T12:03:04Z","published":"2023-06-26T09:21:13Z","title":"Learnable Differencing Center for Nighttime Depth Perception","summary":" Depth completion is the task of recovering dense depth maps from sparse ones,\nusually with the help of color images. Existing image-guided methods perform\nwell on daytime depth perception self-driving benchmarks, but struggle in\nnighttime scenarios with poor visibility and complex illumination. To address\nthese challenges, we propose a simple yet effective framework called LDCNet.\nOur key idea is to use Recurrent Inter-Convolution Differencing (RICD) and\nIllumination-Affinitive Intra-Convolution Differencing (IAICD) to enhance the\nnighttime color images and reduce the negative effects of the varying\nillumination, respectively. 
RICD explicitly estimates global illumination by\ndifferencing two convolutions with different kernels, treating the\nsmall-kernel-convolution feature as the center of the large-kernel-convolution\nfeature in a new perspective. IAICD softly alleviates local relative light\nintensity by differencing a single convolution, where the center is dynamically\naggregated based on neighboring pixels and the estimated illumination map in\nRICD. On both nighttime depth completion and depth estimation tasks, extensive\nexperiments demonstrate the effectiveness of our LDCNet, reaching the state of\nthe art.\n","authors":["Zhiqiang Yan","Yupeng Zheng","Chongyi Li","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2306.14538v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2303.11225v2","updated":"2023-08-23T11:46:57Z","published":"2023-03-20T16:07:02Z","title":"HiFace: High-Fidelity 3D Face Reconstruction by Learning Static and\n Dynamic Details","summary":" 3D Morphable Models (3DMMs) demonstrate great potential for reconstructing\nfaithful and animatable 3D facial surfaces from a single image. The facial\nsurface is influenced by the coarse shape, as well as the static detail (e,g.,\nperson-specific appearance) and dynamic detail (e.g., expression-driven\nwrinkles). Previous work struggles to decouple the static and dynamic details\nthrough image-level supervision, leading to reconstructions that are not\nrealistic. In this paper, we aim at high-fidelity 3D face reconstruction and\npropose HiFace to explicitly model the static and dynamic details.\nSpecifically, the static detail is modeled as the linear combination of a\ndisplacement basis, while the dynamic detail is modeled as the linear\ninterpolation of two displacement maps with polarized expressions. We exploit\nseveral loss functions to jointly learn the coarse shape and fine details with\nboth synthetic and real-world datasets, which enable HiFace to reconstruct\nhigh-fidelity 3D shapes with animatable details. Extensive quantitative and\nqualitative experiments demonstrate that HiFace presents state-of-the-art\nreconstruction quality and faithfully recovers both the static and dynamic\ndetails. Our project page can be found at https://project-hiface.github.io.\n","authors":["Zenghao Chai","Tianke Zhang","Tianyu He","Xu Tan","Tadas Baltrušaitis","HsiangTao Wu","Runnan Li","Sheng Zhao","Chun Yuan","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2303.11225v2.pdf","comment":"Accepted to ICCV 2023, camera-ready version; Project page:\n https://project-hiface.github.io/"},{"id":"http://arxiv.org/abs/2308.12067v1","updated":"2023-08-23T11:27:30Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models acquire their instruction-following\ncapabilities through a two-stage training process: pre-training on image-text\npairs and fine-tuning on supervised vision-language instruction data. Recent\nstudies have shown that large language models can achieve satisfactory results\neven with a limited amount of high-quality instruction-following data. In this\npaper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset\ncomprising only 200 examples, amounting to approximately 6% of the\ninstruction-following data used in the alignment dataset for MiniGPT-4. We\nfirst propose several metrics to access the quality of multimodal instruction\ndata. 
Based on these metrics, we present a simple and effective data selector\nto automatically identify and filter low-quality vision-language data. By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations (e.g., visual question answering, GPT-4 preference).\nOverall, our findings demonstrate that less but high-quality instruction tuning\ndata is efficient to enable multimodal large language models to generate better\noutput.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10762v3","updated":"2023-08-23T11:21:29Z","published":"2023-03-19T20:31:38Z","title":"Deep Image Fingerprint: Towards Low Budget Synthetic Image Detection and\n Model Lineage Analysis","summary":" The generation of high-quality images has become widely accessible and is a\nrapidly evolving process. As a result, anyone can generate images that are\nindistinguishable from real ones. This leads to a wide range of applications,\nincluding malicious usage with deceptive intentions. Despite advances in\ndetection techniques for generated images, a robust detection method still\neludes us. Furthermore, model personalization techniques might affect the\ndetection capabilities of existing methods. In this work, we utilize the\narchitectural properties of convolutional neural networks (CNNs) to develop a\nnew detection method. Our method can detect images from a known generative\nmodel and enable us to establish relationships between fine-tuned generative\nmodels. We tested the method on images produced by both Generative Adversarial\nNetworks (GANs) and recent large text-to-image models (LTIMs) that rely on\nDiffusion Models. Our approach outperforms others trained under identical\nconditions and achieves comparable performance to state-of-the-art pre-trained\ndetection methods on images generated by Stable Diffusion and MidJourney, with\nsignificantly fewer required train samples.\n","authors":["Sergey Sinitsa","Ohad Fried"],"pdf_url":"https://arxiv.org/pdf/2303.10762v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12064v1","updated":"2023-08-23T11:16:36Z","published":"2023-08-23T11:16:36Z","title":"SILT: Shadow-aware Iterative Label Tuning for Learning to Detect Shadows\n from Noisy Labels","summary":" Existing shadow detection datasets often contain missing or mislabeled\nshadows, which can hinder the performance of deep learning models trained\ndirectly on such data. To address this issue, we propose SILT, the Shadow-aware\nIterative Label Tuning framework, which explicitly considers noise in shadow\nlabels and trains the deep model in a self-training manner. Specifically, we\nincorporate strong data augmentations with shadow counterfeiting to help the\nnetwork better recognize non-shadow regions and alleviate overfitting. We also\ndevise a simple yet effective label tuning strategy with global-local fusion\nand shadow-aware filtering to encourage the network to make significant\nrefinements on the noisy labels. We evaluate the performance of SILT by\nrelabeling the test set of the SBU dataset and conducting various experiments.\nOur results show that even a simple U-Net trained with SILT can outperform all\nstate-of-the-art methods by a large margin. 
When trained on SBU / UCF / ISTD,\nour network can successfully reduce the Balanced Error Rate by 25.2% / 36.9% /\n21.3% over the best state-of-the-art method.\n","authors":["Han Yang","Tianyu Wang","Xiaowei Hu","Chi-Wing Fu"],"pdf_url":"https://arxiv.org/pdf/2308.12064v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12061v1","updated":"2023-08-23T11:03:28Z","published":"2023-08-23T11:03:28Z","title":"HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using\n Harvest Piles and Remote Sensing","summary":" Small farms contribute to a large share of the productive land in developing\ncountries. In regions such as sub-Saharan Africa, where 80% of farms are small\n(under 2 ha in size), the task of mapping smallholder cropland is an important\npart of tracking sustainability measures such as crop productivity. However,\nthe visually diverse and nuanced appearance of small farms has limited the\neffectiveness of traditional approaches to cropland mapping. Here we introduce\na new approach based on the detection of harvest piles characteristic of many\nsmallholder systems throughout the world. We present HarvestNet, a dataset for\nmapping the presence of farms in the Ethiopian regions of Tigray and Amhara\nduring 2020-2023, collected using expert knowledge and satellite images,\ntotaling 7k hand-labeled images and 2k ground collected labels. We also\nbenchmark a set of baselines including SOTA models in remote sensing with our\nbest models having around 80% classification performance on hand labelled data\nand 90%, 98% accuracy on ground truth data for Tigray, Amhara respectively. We\nalso perform a visual comparison with a widely used pre-existing coverage map\nand show that our model detects an extra 56,621 hectares of cropland in Tigray.\nWe conclude that remote sensing of harvest piles can contribute to more timely\nand accurate cropland assessments in food insecure region.\n","authors":["Jonathan Xu","Amna Elmustafa","Liya Weldegebriel","Emnet Negash","Richard Lee","Chenlin Meng","Stefano Ermon","David Lobell"],"pdf_url":"https://arxiv.org/pdf/2308.12061v1.pdf","comment":"18 pages, 22 figures"},{"id":"http://arxiv.org/abs/2308.12059v1","updated":"2023-08-23T10:59:41Z","published":"2023-08-23T10:59:41Z","title":"Manipulating Embeddings of Stable Diffusion Prompts","summary":" Generative text-to-image models such as Stable Diffusion allow users to\ngenerate images based on a textual description, the prompt. Changing the prompt\nis still the primary means for the user to change a generated image as desired.\nHowever, changing the image by reformulating the prompt remains a difficult\nprocess of trial and error, which has led to the emergence of prompt\nengineering as a new field of research. We propose and analyze methods to\nchange the embedding of a prompt directly instead of the prompt text. It allows\nfor more fine-grained and targeted control that takes into account user\nintentions. Our approach treats the generative text-to-image model as a\ncontinuous function and passes gradients between the image space and the prompt\nembedding space. By addressing different user interaction problems, we can\napply this idea in three scenarios: (1) Optimization of a metric defined in\nimage space that could measure, for example, image style. (2) Assistance of\nusers in creative tasks by enabling them to navigate the image space along a\nselection of directions of \"near\" prompt embeddings. 
(3) Changing the embedding\nof the prompt to include information that the user has seen in a particular\nseed but finds difficult to describe in the prompt. Our experiments demonstrate\nthe feasibility of the described methods.\n","authors":["Niklas Deckers","Julia Peters","Martin Potthast"],"pdf_url":"https://arxiv.org/pdf/2308.12059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12058v1","updated":"2023-08-23T10:59:20Z","published":"2023-08-23T10:59:20Z","title":"DR-Tune: Improving Fine-tuning of Pretrained Visual Models by\n Distribution Regularization with Semantic Calibration","summary":" The visual models pretrained on large-scale benchmarks encode general\nknowledge and prove effective in building more powerful representations for\ndownstream tasks. Most existing approaches follow the fine-tuning paradigm,\neither by initializing or regularizing the downstream model based on the\npretrained one. The former fails to retain the knowledge in the successive\nfine-tuning phase, thereby prone to be over-fitting, and the latter imposes\nstrong constraints to the weights or feature maps of the downstream model\nwithout considering semantic drift, often incurring insufficient optimization.\nTo deal with these issues, we propose a novel fine-tuning framework, namely\ndistribution regularization with semantic calibration (DR-Tune). It employs\ndistribution regularization by enforcing the downstream task head to decrease\nits classification error on the pretrained feature distribution, which prevents\nit from over-fitting while enabling sufficient training of downstream encoders.\nFurthermore, to alleviate the interference by semantic drift, we develop the\nsemantic calibration (SC) module to align the global shape and class centers of\nthe pretrained and downstream feature distributions. Extensive experiments on\nwidely used image classification datasets show that DR-Tune consistently\nimproves the performance when combing with various backbones under different\npretraining strategies. Code is available at:\nhttps://github.com/weeknan/DR-Tune.\n","authors":["Nan Zhou","Jiaxin Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2308.12058v1.pdf","comment":"Accepted by ICCV'2023"},{"id":"http://arxiv.org/abs/2307.12907v3","updated":"2023-08-23T10:37:21Z","published":"2023-07-24T16:02:42Z","title":"GridMM: Grid Memory Map for Vision-and-Language Navigation","summary":" Vision-and-language navigation (VLN) enables the agent to navigate to a\nremote location following the natural language instruction in 3D environments.\nTo represent the previously visited environment, most approaches for VLN\nimplement memory using recurrent states, topological maps, or top-down semantic\nmaps. In contrast to these approaches, we build the top-down egocentric and\ndynamically growing Grid Memory Map (i.e., GridMM) to structure the visited\nenvironment. From a global perspective, historical observations are projected\ninto a unified grid map in a top-down view, which can better represent the\nspatial relations of the environment. From a local perspective, we further\npropose an instruction relevance aggregation method to capture fine-grained\nvisual clues in each grid region. 
Extensive experiments are conducted on both\nthe REVERIE, R2R, SOON datasets in the discrete environments, and the R2R-CE\ndataset in the continuous environments, showing the superiority of our proposed\nmethod.\n","authors":["Zihan Wang","Xiangyang Li","Jiahao Yang","Yeqi Liu","Shuqiang Jiang"],"pdf_url":"https://arxiv.org/pdf/2307.12907v3.pdf","comment":"Accepted by ICCV 2023. The code is available at\n https://github.com/MrZihan/GridMM"},{"id":"http://arxiv.org/abs/2308.12049v1","updated":"2023-08-23T10:35:37Z","published":"2023-08-23T10:35:37Z","title":"Towards Privacy-Supporting Fall Detection via Deep Unsupervised\n RGB2Depth Adaptation","summary":" Fall detection is a vital task in health monitoring, as it allows the system\nto trigger an alert and therefore enabling faster interventions when a person\nexperiences a fall. Although most previous approaches rely on standard RGB\nvideo data, such detailed appearance-aware monitoring poses significant privacy\nconcerns. Depth sensors, on the other hand, are better at preserving privacy as\nthey merely capture the distance of objects from the sensor or camera, omitting\ncolor and texture information. In this paper, we introduce a privacy-supporting\nsolution that makes the RGB-trained model applicable in depth domain and\nutilizes depth data at test time for fall detection. To achieve cross-modal\nfall detection, we present an unsupervised RGB to Depth (RGB2Depth) cross-modal\ndomain adaptation approach that leverages labelled RGB data and unlabelled\ndepth data during training. Our proposed pipeline incorporates an intermediate\ndomain module for feature bridging, modality adversarial loss for modality\ndiscrimination, classification loss for pseudo-labeled depth data and labeled\nsource data, triplet loss that considers both source and target domains, and a\nnovel adaptive loss weight adjustment method for improved coordination among\nvarious losses. Our approach achieves state-of-the-art results in the\nunsupervised RGB2Depth domain adaptation task for fall detection. Code is\navailable at https://github.com/1015206533/privacy_supporting_fall_detection.\n","authors":["Hejun Xiao","Kunyu Peng","Xiangsheng Huang","Alina Roitberg1","Hao Li","Zhaohui Wang","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2308.12049v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12048v1","updated":"2023-08-23T10:29:25Z","published":"2023-08-23T10:29:25Z","title":"Head-Tail Cooperative Learning Network for Unbiased Scene Graph\n Generation","summary":" Scene Graph Generation (SGG) as a critical task in image understanding,\nfacing the challenge of head-biased prediction caused by the long-tail\ndistribution of predicates. However, current unbiased SGG methods can easily\nprioritize improving the prediction of tail predicates while ignoring the\nsubstantial sacrifice in the prediction of head predicates, leading to a shift\nfrom head bias to tail bias. To address this issue, we propose a model-agnostic\nHead-Tail Collaborative Learning (HTCL) network that includes head-prefer and\ntail-prefer feature representation branches that collaborate to achieve\naccurate recognition of both head and tail predicates. We also propose a\nself-supervised learning approach to enhance the prediction ability of the\ntail-prefer feature representation branch by constraining tail-prefer predicate\nfeatures. 
Specifically, self-supervised learning converges head predicate\nfeatures to their class centers while dispersing tail predicate features as\nmuch as possible through contrast learning and head center loss. We demonstrate\nthe effectiveness of our HTCL by applying it to various SGG models on VG150,\nOpen Images V6 and GQA200 datasets. The results show that our method achieves\nhigher mean Recall with a minimal sacrifice in Recall and achieves a new\nstate-of-the-art overall performance. Our code is available at\nhttps://github.com/wanglei0618/HTCL.\n","authors":["Lei Wang","Zejian Yuan","Yao Lu","Badong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12048v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.12045v1","updated":"2023-08-23T10:25:37Z","published":"2023-08-23T10:25:37Z","title":"CgT-GAN: CLIP-guided Text GAN for Image Captioning","summary":" The large-scale visual-language pre-trained model, Contrastive Language-Image\nPre-training (CLIP), has significantly improved image captioning for scenarios\nwithout human-annotated image-caption pairs. Recent advanced CLIP-based image\ncaptioning without human annotations follows a text-only training paradigm,\ni.e., reconstructing text from shared embedding space. Nevertheless, these\napproaches are limited by the training/inference gap or huge storage\nrequirements for text embeddings. Given that it is trivial to obtain images in\nthe real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates\nimages into the training process to enable the model to \"see\" real visual\nmodality. Particularly, we use adversarial training to teach CgT-GAN to mimic\nthe phrases of an external text corpus and CLIP-based reward to provide\nsemantic guidance. The caption generator is jointly rewarded based on the\ncaption naturalness to human language calculated from the GAN's discriminator\nand the semantic guidance reward computed by the CLIP-based reward module. In\naddition to the cosine similarity as the semantic guidance reward (i.e.,\nCLIP-cos), we further introduce a novel semantic guidance reward called\nCLIP-agg, which aligns the generated caption with a weighted text embedding by\nattentively aggregating the entire corpus. Experimental results on three\nsubtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms\nstate-of-the-art methods significantly across all metrics. Code is available at\nhttps://github.com/Lihr747/CgtGAN.\n","authors":["Jiarui Yu","Haoran Li","Yanbin Hao","Bin Zhu","Tong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2308.12045v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2306.15782v3","updated":"2023-08-23T10:02:15Z","published":"2023-06-27T20:09:56Z","title":"UTRNet: High-Resolution Urdu Text Recognition In Printed Documents","summary":" In this paper, we propose a novel approach to address the challenges of\nprinted Urdu text recognition using high-resolution, multi-scale semantic\nfeature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model,\ndemonstrates state-of-the-art performance on benchmark datasets. 
To address the\nlimitations of previous works, which struggle to generalize to the intricacies\nof the Urdu script and the lack of sufficient annotated real-world data, we\nhave introduced the UTRSet-Real, a large-scale annotated real-world dataset\ncomprising over 11,000 lines and UTRSet-Synth, a synthetic dataset with 20,000\nlines closely resembling real-world and made corrections to the ground truth of\nthe existing IIITH dataset, making it a more reliable resource for future\nresearch. We also provide UrduDoc, a benchmark dataset for Urdu text line\ndetection in scanned documents. Additionally, we have developed an online tool\nfor end-to-end Urdu OCR from printed documents by integrating UTRNet with a\ntext detection model. Our work not only addresses the current limitations of\nUrdu OCR but also paves the way for future research in this area and\nfacilitates the continued advancement of Urdu OCR technology. The project page\nwith source code, datasets, annotations, trained models, and online tool is\navailable at abdur75648.github.io/UTRNet.\n","authors":["Abdur Rahman","Arjun Ghosh","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2306.15782v3.pdf","comment":"Accepted at The 17th International Conference on Document Analysis\n and Recognition (ICDAR 2023)"},{"id":"http://arxiv.org/abs/2308.12038v1","updated":"2023-08-23T09:55:41Z","published":"2023-08-23T09:55:41Z","title":"Large Multilingual Models Pivot Zero-Shot Multimodal Learning across\n Languages","summary":" Recently there has been a significant surge in multimodal learning in terms\nof both image-to-text and text-to-image generation. However, the success is\ntypically limited to English, leaving other languages largely behind. Building\na competitive counterpart in other languages is highly challenging due to the\nlow-resource nature of non-English multimodal data (i.e., lack of large-scale,\nhigh-quality image-text data). In this work, we propose MPM, an effective\ntraining paradigm for training large multimodal models in low-resource\nlanguages. MPM demonstrates that Multilingual language models can Pivot\nzero-shot Multimodal learning across languages. Specifically, based on a strong\nmultilingual large language model, multimodal models pretrained on English-only\nimage-text data can well generalize to other languages in a zero-shot manner\nfor both image-to-text and text-to-image generation, even surpassing models\ntrained on image-text data in native languages. Taking Chinese as a practice of\nMPM, we build large multimodal models VisCPM in image-to-text and text-to-image\ngeneration, which achieve state-of-the-art (open-source) performance in\nChinese. To facilitate future research, we open-source codes and model weights\nat https://github.com/OpenBMB/VisCPM.git.\n","authors":["Jinyi Hu","Yuan Yao","Chongyi Wang","Shan Wang","Yinxu Pan","Qianyu Chen","Tianyu Yu","Hanghao Wu","Yue Zhao","Haoye Zhang","Xu Han","Yankai Lin","Jiao Xue","Dahai Li","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12038v1.pdf","comment":"https://github.com/OpenBMB/VisCPM.git"},{"id":"http://arxiv.org/abs/2308.12035v1","updated":"2023-08-23T09:49:20Z","published":"2023-08-23T09:49:20Z","title":"RefEgo: Referring Expression Comprehension Dataset from First-Person\n Perception of Ego4D","summary":" Grounding textual expressions on scene objects from first-person views is a\ntruly demanding capability in developing agents that are aware of their\nsurroundings and behave following intuitive text instructions. 
Such capability\nis of necessity for glass-devices or autonomous robots to localize referred\nobjects in the real-world. In the conventional referring expression\ncomprehension tasks of images, however, datasets are mostly constructed based\non the web-crawled data and don't reflect diverse real-world structures on the\ntask of grounding textual expressions in diverse objects in the real world.\nRecently, a massive-scale egocentric video dataset of Ego4D was proposed. Ego4D\ncovers around the world diverse real-world scenes including numerous indoor and\noutdoor situations such as shopping, cooking, walking, talking, manufacturing,\netc. Based on egocentric videos of Ego4D, we constructed a broad coverage of\nthe video-based referring expression comprehension dataset: RefEgo. Our dataset\nincludes more than 12k video clips and 41 hours for video-based referring\nexpression comprehension annotation. In experiments, we combine the\nstate-of-the-art 2D referring expression comprehension models with the object\ntracking algorithm, achieving the video-wise referred object tracking even in\ndifficult conditions: the referred object becomes out-of-frame in the middle of\nthe video or multiple similar objects are presented in the video.\n","authors":["Shuhei Kurita","Naoki Katsura","Eri Onami"],"pdf_url":"https://arxiv.org/pdf/2308.12035v1.pdf","comment":"15 pages, 11 figures. ICCV2023"},{"id":"http://arxiv.org/abs/2209.08996v2","updated":"2023-08-23T09:31:26Z","published":"2022-09-19T13:20:19Z","title":"EDO-Net: Learning Elastic Properties of Deformable Objects from Graph\n Dynamics","summary":" We study the problem of learning graph dynamics of deformable objects that\ngeneralizes to unknown physical properties. Our key insight is to leverage a\nlatent representation of elastic physical properties of cloth-like deformable\nobjects that can be extracted, for example, from a pulling interaction. In this\npaper we propose EDO-Net (Elastic Deformable Object - Net), a model of graph\ndynamics trained on a large variety of samples with different elastic\nproperties that does not rely on ground-truth labels of the properties. EDO-Net\njointly learns an adaptation module, and a forward-dynamics module. The former\nis responsible for extracting a latent representation of the physical\nproperties of the object, while the latter leverages the latent representation\nto predict future states of cloth-like objects represented as graphs. We\nevaluate EDO-Net both in simulation and real world, assessing its capabilities\nof: 1) generalizing to unknown physical properties, 2) transferring the learned\nrepresentation to new downstream tasks.\n","authors":["Alberta Longhini","Marco Moletta","Alfredo Reichlin","Michael C. Welle","David Held","Zackory Erickson","Danica Kragic"],"pdf_url":"https://arxiv.org/pdf/2209.08996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13991v2","updated":"2023-08-23T09:27:19Z","published":"2023-02-27T17:30:00Z","title":"Learning to Generalize towards Unseen Domains via a Content-Aware Style\n Invariant Model for Disease Detection from Chest X-rays","summary":" Performance degradation due to source domain mismatch is a longstanding\nchallenge in deep learning-based medical image analysis, particularly for chest\nX-rays (CXRs). Several methods (e.g., adversarial training, multi-domain\nmixups) have been proposed to extract domain-invariant high-level features to\naddress this domain shift. 
However, these methods do not explicitly regularize\nthe content and style characteristics of the extracted domain-invariant\nfeatures. Recent studies have demonstrated that CNN models exhibit a strong\nbias toward styles (e.g., uninformative textures) rather than content (e.g.,\nshape), in stark contrast to the human-vision system. Radiologists tend to\nlearn visual cues from CXRs and thus perform well across multiple domains.\nTherefore, in medical imaging for pathology diagnosis from CXR images, models\nshould extract domain-invariant features that are style-invariant and\ncontent-biased. Motivated by this, we employ the novel style randomization\nmodules (SRMs) at both image and feature levels that work together\nhierarchically to create rich style perturbed features on the fly while keeping\nthe content intact. In addition, we leverage consistency regularizations\nbetween global semantic features and predicted probability distributions,\nrespectively, for with and without style perturbed versions of the same CXR\nimage to tweak the model's sensitivity toward content markers for accurate\npredictions. Extensive experiments with three large-scale thoracic disease\ndatasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed\nframework is more robust in the presence of domain shift and achieves\nstate-of-the-art performance.\n","authors":["Mohammad Zunaed","Md. Aynal Haque","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2302.13991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12017v1","updated":"2023-08-23T09:20:05Z","published":"2023-08-23T09:20:05Z","title":"Distribution-Aware Calibration for Object Detection with Noisy Bounding\n Boxes","summary":" Large-scale well-annotated datasets are of great importance for training an\neffective object detector. However, obtaining accurate bounding box annotations\nis laborious and demanding. Unfortunately, the resultant noisy bounding boxes\ncould cause corrupt supervision signals and thus diminish detection\nperformance. Motivated by the observation that the real ground-truth is usually\nsituated in the aggregation region of the proposals assigned to a noisy\nground-truth, we propose DIStribution-aware CalibratiOn (DISCO) to model the\nspatial distribution of proposals for calibrating supervision signals. In\nDISCO, spatial distribution modeling is performed to statistically extract the\npotential locations of objects. Based on the modeled distribution, three\ndistribution-aware techniques, i.e., distribution-aware proposal augmentation\n(DA-Aug), distribution-aware box refinement (DA-Ref), and distribution-aware\nconfidence estimation (DA-Est), are developed to improve classification,\nlocalization, and interpretability, respectively. 
Extensive experiments on\nlarge-scale noisy image datasets (i.e., Pascal VOC and MS-COCO) demonstrate\nthat DISCO can achieve state-of-the-art detection performance, especially at\nhigh noise levels.\n","authors":["Donghao Zhou","Jialin Li","Jinpeng Li","Jiancheng Huang","Qiang Nie","Yong Liu","Bin-Bin Gao","Qiong Wang","Pheng-Ann Heng","Guangyong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.12017v1.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2308.08210v2","updated":"2023-08-23T09:15:20Z","published":"2023-08-16T08:28:01Z","title":"Neural Spherical Harmonics for structurally coherent continuous\n representation of diffusion MRI signal","summary":" We present a novel way to model diffusion magnetic resonance imaging (dMRI)\ndatasets, that benefits from the structural coherence of the human brain while\nonly using data from a single subject. Current methods model the dMRI signal in\nindividual voxels, disregarding the intervoxel coherence that is present. We\nuse a neural network to parameterize a spherical harmonics series (NeSH) to\nrepresent the dMRI signal of a single subject from the Human Connectome Project\ndataset, continuous in both the angular and spatial domain. The reconstructed\ndMRI signal using this method shows a more structurally coherent representation\nof the data. Noise in gradient images is removed and the fiber orientation\ndistribution functions show a smooth change in direction along a fiber tract.\nWe showcase how the reconstruction can be used to calculate mean diffusivity,\nfractional anisotropy, and total apparent fiber density. These results can be\nachieved with a single model architecture, tuning only one hyperparameter. In\nthis paper we also demonstrate how upsampling in both the angular and spatial\ndomain yields reconstructions that are on par or better than existing methods.\n","authors":["Tom Hendriks","Anna Vilanova","Maxime Chamberland"],"pdf_url":"https://arxiv.org/pdf/2308.08210v2.pdf","comment":"12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023\n Updated to fix typo in author name (Villanova -> Vilanova)"},{"id":"http://arxiv.org/abs/2308.12009v1","updated":"2023-08-23T09:02:01Z","published":"2023-08-23T09:02:01Z","title":"StofNet: Super-resolution Time of Flight Network","summary":" Time of Flight (ToF) is a prevalent depth sensing technology in the fields of\nrobotics, medical imaging, and non-destructive testing. Yet, ToF sensing faces\nchallenges from complex ambient conditions making an inverse modelling from the\nsparse temporal information intractable. This paper highlights the potential of\nmodern super-resolution techniques to learn varying surroundings for a reliable\nand accurate ToF detection. Unlike existing models, we tailor an architecture\nfor sub-sample precise semi-global signal localization by combining\nsuper-resolution with an efficient residual contraction block to balance\nbetween fine signal details and large scale contextual information. We\nconsolidate research on ToF by conducting a benchmark comparison against six\nstate-of-the-art methods for which we employ two publicly available datasets.\nThis includes the release of our SToF-Chirp dataset captured by an airborne\nultrasound transducer. Results showcase the superior performance of our\nproposed StofNet in terms of precision, reliability and model complexity. 
Our\ncode is available at https://github.com/hahnec/stofnet.\n","authors":["Christopher Hahne","Michel Hayoz","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2308.12009v1.pdf","comment":"pre-print"},{"id":"http://arxiv.org/abs/2308.10522v3","updated":"2023-08-23T08:49:54Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v3.pdf","comment":"This paper is accepted by the jourcal of Neural Networks (Elsevier)\n by 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2308.12006v1","updated":"2023-08-23T08:49:43Z","published":"2023-08-23T08:49:43Z","title":"Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action\n and Gesture Recognition","summary":" RGB-D action and gesture recognition remain an interesting topic in\nhuman-centered scene understanding, primarily due to the multiple granularities\nand large variation in human motion. Although many RGB-D based action and\ngesture recognition approaches have demonstrated remarkable results by\nutilizing highly integrated spatio-temporal representations across multiple\nmodalities (i.e., RGB and depth data), they still encounter several challenges.\nFirstly, vanilla 3D convolution makes it hard to capture fine-grained motion\ndifferences between local clips under different modalities. Secondly, the\nintricate nature of highly integrated spatio-temporal modeling can lead to\noptimization difficulties. Thirdly, duplicate and unnecessary information can\nadd complexity and complicate entangled spatio-temporal modeling. To address\nthe above issues, we propose an innovative heuristic architecture called\nMulti-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture\nrecognition. 
The proposed MFST model comprises a 3D Central Difference\nConvolution Stem (CDC-Stem) module and multiple factorized spatio-temporal\nstages. The CDC-Stem enriches fine-grained temporal perception, and the\nmultiple hierarchical spatio-temporal stages construct dimension-independent\nhigher-order semantic primitives. Specifically, the CDC-Stem module captures\nbottom-level spatio-temporal features and passes them successively to the\nfollowing spatio-temporal factored stages to capture the hierarchical spatial\nand temporal features through the Multi- Scale Convolution and Transformer\n(MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans)\nblock. The seamless integration of these innovative designs results in a robust\nspatio-temporal representation that outperforms state-of-the-art approaches on\nRGB-D action and gesture recognition datasets.\n","authors":["Yujun Ma","Benjia Zhou","Ruili Wang","Pichao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12006v1.pdf","comment":"ACM MM'23 has accepted this paper"},{"id":"http://arxiv.org/abs/2308.12001v1","updated":"2023-08-23T08:41:21Z","published":"2023-08-23T08:41:21Z","title":"Local Distortion Aware Efficient Transformer Adaptation for Image\n Quality Assessment","summary":" Image Quality Assessment (IQA) constitutes a fundamental task within the\nfield of computer vision, yet it remains an unresolved challenge, owing to the\nintricate distortion conditions, diverse image contents, and limited\navailability of data. Recently, the community has witnessed the emergence of\nnumerous large-scale pretrained foundation models, which greatly benefit from\ndramatically increased data and parameter capacities. However, it remains an\nopen problem whether the scaling law in high-level tasks is also applicable to\nIQA task which is closely related to low-level clues. In this paper, we\ndemonstrate that with proper injection of local distortion features, a larger\npretrained and fixed foundation model performs better in IQA tasks.\nSpecifically, for the lack of local distortion structure and inductive bias of\nvision transformer (ViT), alongside the large-scale pretrained ViT, we use\nanother pretrained convolution neural network (CNN), which is well known for\ncapturing the local structure, to extract multi-scale image features. Further,\nwe propose a local distortion extractor to obtain local distortion features\nfrom the pretrained CNN and a local distortion injector to inject the local\ndistortion features into ViT. By only training the extractor and injector, our\nmethod can benefit from the rich knowledge in the powerful foundation models\nand achieve state-of-the-art performance on popular IQA datasets, indicating\nthat IQA is not only a low-level problem but also benefits from stronger\nhigh-level features drawn from large-scale pretrained models.\n","authors":["Kangmin Xu","Liang Liao","Jing Xiao","Chaofeng Chen","Haoning Wu","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2308.12001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11994v1","updated":"2023-08-23T08:29:10Z","published":"2023-08-23T08:29:10Z","title":"Progressive Feature Mining and External Knowledge-Assisted\n Text-Pedestrian Image Retrieval","summary":" Text-Pedestrian Image Retrieval aims to use the text describing pedestrian\nappearance to retrieve the corresponding pedestrian image. This task involves\nnot only modality discrepancy, but also the challenge of the textual diversity\nof pedestrians with the same identity. 
At present, although existing research\nprogress has been made in text-pedestrian image retrieval, these methods do not\ncomprehensively consider the above-mentioned problems. Considering these, this\npaper proposes a progressive feature mining and external knowledge-assisted\nfeature purification method. Specifically, we use a progressive mining mode to\nenable the model to mine discriminative features from neglected information,\nthereby avoiding the loss of discriminative information and improving the\nexpression ability of features. In addition, to further reduce the negative\nimpact of modal discrepancy and text diversity on cross-modal matching, we\npropose to use other sample knowledge of the same modality, i.e., external\nknowledge to enhance identity-consistent features and weaken\nidentity-inconsistent features. This process purifies features and alleviates\nthe interference caused by textual diversity and negative sample correlation\nfeatures of the same modal. Extensive experiments on three challenging datasets\ndemonstrate the effectiveness and superiority of the proposed method, and the\nretrieval performance even surpasses that of the large-scale model-based method\non large-scale datasets.\n","authors":["Huafeng Li","Shedan Yang","Yafei Zhang","Dapeng Tao","Zhengtao Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11990v1","updated":"2023-08-23T08:25:30Z","published":"2023-08-23T08:25:30Z","title":"RankMixup: Ranking-Based Mixup Training for Network Calibration","summary":" Network calibration aims to accurately estimate the level of confidences,\nwhich is particularly important for employing deep neural networks in\nreal-world systems. Recent approaches leverage mixup to calibrate the network's\npredictions during training. However, they do not consider the problem that\nmixtures of labels in mixup may not accurately represent the actual\ndistribution of augmented samples. In this paper, we present RankMixup, a novel\nmixup-based framework alleviating the problem of the mixture of labels for\nnetwork calibration. To this end, we propose to use an ordinal ranking\nrelationship between raw and mixup-augmented samples as an alternative\nsupervisory signal to the label mixtures for network calibration. We\nhypothesize that the network should estimate a higher level of confidence for\nthe raw samples than the augmented ones (Fig.1). To implement this idea, we\nintroduce a mixup-based ranking loss (MRL) that encourages lower confidences\nfor augmented samples compared to raw ones, maintaining the ranking\nrelationship. We also propose to leverage the ranking relationship among\nmultiple mixup-augmented samples to further improve the calibration capability.\nAugmented samples with larger mixing coefficients are expected to have higher\nconfidences and vice versa (Fig.1). That is, the order of confidences should be\naligned with that of mixing coefficients. To this end, we introduce a novel\nloss, M-NDCG, in order to reduce the number of misaligned pairs of the\ncoefficients and confidences. 
Extensive experimental results on standard\nbenchmarks for network calibration demonstrate the effectiveness of RankMixup.\n","authors":["Jongyoun Noh","Hyekang Park","Junghyup Lee","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11990v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11983v1","updated":"2023-08-23T08:15:15Z","published":"2023-08-23T08:15:15Z","title":"Multi-Modal Multi-Task (3MT) Road Segmentation","summary":" Multi-modal systems have the capacity of producing more reliable results than\nsystems with a single modality in road detection due to perceiving different\naspects of the scene. We focus on using raw sensor inputs instead of, as it is\ntypically done in many SOTA works, leveraging architectures that require high\npre-processing costs such as surface normals or dense depth predictions. By\nusing raw sensor inputs, we aim to utilize a low-cost model thatminimizes both\nthe pre-processing andmodel computation costs. This study presents a\ncost-effective and highly accurate solution for road segmentation by\nintegrating data from multiple sensorswithin a multi-task learning\narchitecture.Afusion architecture is proposed in which RGB and LiDAR depth\nimages constitute the inputs of the network. Another contribution of this study\nis to use IMU/GNSS (inertial measurement unit/global navigation satellite\nsystem) inertial navigation system whose data is collected synchronously and\ncalibrated with a LiDAR-camera to compute aggregated dense LiDAR depth images.\nIt has been demonstrated by experiments on the KITTI dataset that the proposed\nmethod offers fast and high-performance solutions. We have also shown the\nperformance of our method on Cityscapes where raw LiDAR data is not available.\nThe segmentation results obtained for both full and half resolution images are\ncompetitive with existing methods. Therefore, we conclude that our method is\nnot dependent only on raw LiDAR data; rather, it can be used with different\nsensor modalities. The inference times obtained in all experiments are very\npromising for real-time experiments.\n","authors":["Erkan Milli","Özgür Erkent","Asım Egemen Yılmaz"],"pdf_url":"https://arxiv.org/pdf/2308.11983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11979v1","updated":"2023-08-23T07:58:20Z","published":"2023-08-23T07:58:20Z","title":"Rotation-Invariant Completion Network","summary":" Real-world point clouds usually suffer from incompleteness and display\ndifferent poses. While current point cloud completion methods excel in\nreproducing complete point clouds with consistent poses as seen in the training\nset, their performance tends to be unsatisfactory when handling point clouds\nwith diverse poses. We propose a network named Rotation-Invariant Completion\nNetwork (RICNet), which consists of two parts: a Dual Pipeline Completion\nNetwork (DPCNet) and an enhancing module. Firstly, DPCNet generates a coarse\ncomplete point cloud. The feature extraction module of DPCNet can extract\nconsistent features, no matter if the input point cloud has undergone rotation\nor translation. Subsequently, the enhancing module refines the fine-grained\ndetails of the final generated point cloud. RICNet achieves better rotation\ninvariance in feature extraction and incorporates structural relationships in\nman-made objects. 
To assess the performance of RICNet and existing methods on\npoint clouds with various poses, we applied random transformations to the point\nclouds in the MVP dataset and conducted experiments on them. Our experiments\ndemonstrate that RICNet exhibits superior completion performance compared to\nexisting methods.\n","authors":["Yu Chen","Pengcheng Shi"],"pdf_url":"https://arxiv.org/pdf/2308.11979v1.pdf","comment":"12 pages, accepted to PRCV 2023 (The 6th Chinese Conference on\n Pattern Recognition and Computer Vision)"},{"id":"http://arxiv.org/abs/2308.11974v1","updated":"2023-08-23T07:46:44Z","published":"2023-08-23T07:46:44Z","title":"Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields","summary":" Text-driven localized editing of 3D objects is particularly difficult as\nlocally mixing the original 3D object with the intended new object and style\neffects without distorting the object's form is not a straightforward process.\nTo address this issue, we propose a novel NeRF-based model, Blending-NeRF,\nwhich consists of two NeRF networks: pretrained NeRF and editable NeRF.\nAdditionally, we introduce new blending operations that allow Blending-NeRF to\nproperly edit target regions which are localized by text. By using a pretrained\nvision-language aligned model, CLIP, we guide Blending-NeRF to add new objects\nwith varying colors and densities, modify textures, and remove parts of the\noriginal object. Our extensive experiments demonstrate that Blending-NeRF\nproduces naturally and locally edited 3D objects from various text prompts.\n","authors":["Hyeonseop Song","Seokhun Choi","Hoseok Do","Chul Lee","Taehyeong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11974v1.pdf","comment":"Accepted to ICCV 2023. The first two authors contributed equally to\n this work"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. 
Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11969v1","updated":"2023-08-23T07:30:16Z","published":"2023-08-23T07:30:16Z","title":"Anisotropic Hybrid Networks for liver tumor segmentation with\n uncertainty quantification","summary":" The burden of liver tumors is important, ranking as the fourth leading cause\nof cancer mortality. In case of hepatocellular carcinoma (HCC), the delineation\nof liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is\nperformed to guide the treatment strategy. As this task is time-consuming,\nneeds high expertise and could be subject to inter-observer variability there\nis a strong need for automatic tools. However, challenges arise from the lack\nof available training data, as well as the high variability in terms of image\nresolution and MRI sequence. In this work we propose to compare two different\npipelines based on anisotropic models to obtain the segmentation of the liver\nand tumors. The first pipeline corresponds to a baseline multi-class model that\nperforms the simultaneous segmentation of the liver and tumor classes. In the\nsecond approach, we train two distinct binary models, one segmenting the liver\nonly and the other the tumors. Our results show that both pipelines exhibit\ndifferent strengths and weaknesses. Moreover we propose an uncertainty\nquantification strategy allowing the identification of potential false positive\ntumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge\nregarding liver and tumor segmentation.\n","authors":["Benjamin Lambert","Pauline Roca","Florence Forbes","Senan Doyle","Michel Dojat"],"pdf_url":"https://arxiv.org/pdf/2308.11969v1.pdf","comment":"Accepted for presentation at MICCAI Workshop on 2nd\n Resource-Efficient Medical Image Analysis (REMIA)"},{"id":"http://arxiv.org/abs/2301.09091v2","updated":"2023-08-23T07:23:17Z","published":"2023-01-22T10:17:02Z","title":"BallGAN: 3D-aware Image Synthesis with a Spherical Background","summary":" 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be\nrendered in arbitrary perspectives to produce images. Although previous methods\nproduce realistic images, they suffer from unstable training or degenerate\nsolutions where the 3D geometry is unnatural. We hypothesize that the 3D\ngeometry is underdetermined due to the insufficient constraint, i.e., being\nclassified as real image to the discriminator is not enough. To solve this\nproblem, we propose to approximate the background as a spherical surface and\nrepresent a scene as a union of the foreground placed in the sphere and the\nthin spherical background. It reduces the degree of freedom in the background\nfield. Accordingly, we modify the volume rendering equation and incorporate\ndedicated constraints to design a novel 3D-aware GAN framework named BallGAN.\nBallGAN has multiple advantages as follows. 1) It produces more reasonable 3D\ngeometry; the images of a scene across different viewpoints have better\nphotometric consistency and fidelity than the state-of-the-art methods. 2) The\ntraining becomes much more stable. 
3) The foreground can be separately rendered\non top of different arbitrary backgrounds.\n","authors":["Minjung Shin","Yunji Seo","Jeongmin Bae","Young Sun Choi","Hyunsu Kim","Hyeran Byun","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2301.09091v2.pdf","comment":"ICCV 2023, Project Page: https://minjung-s.github.io/ballgan"},{"id":"http://arxiv.org/abs/2308.11951v1","updated":"2023-08-23T06:49:07Z","published":"2023-08-23T06:49:07Z","title":"Pose Modulated Avatars from Video","summary":" It is now possible to reconstruct dynamic human motion and shape from a\nsparse set of cameras using Neural Radiance Fields (NeRF) driven by an\nunderlying skeleton. However, a challenge remains to model the deformation of\ncloth and skin in relation to skeleton pose. Unlike existing avatar models that\nare learned implicitly or rely on a proxy surface, our approach is motivated by\nthe observation that different poses necessitate unique frequency assignments.\nNeglecting this distinction yields noisy artifacts in smooth areas or blurs\nfine-grained texture and shape details in sharp regions. We develop a\ntwo-branch neural network that is adaptive and explicit in the frequency\ndomain. The first branch is a graph neural network that models correlations\namong body parts locally, taking skeleton pose as input. The second branch\ncombines these correlation features to a set of global frequencies and then\nmodulates the feature encoding. Our experiments demonstrate that our network\noutperforms state-of-the-art methods in terms of preserving details and\ngeneralization capabilities.\n","authors":["Chunjin Song","Bastian Wandt","Helge Rhodin"],"pdf_url":"https://arxiv.org/pdf/2308.11951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11949v1","updated":"2023-08-23T06:45:11Z","published":"2023-08-23T06:45:11Z","title":"High-quality Image Dehazing with Diffusion Model","summary":" Image dehazing is quite challenging in dense-haze scenarios, where quite less\noriginal information remains in the hazy image. Though previous methods have\nmade marvelous progress, they still suffer from information loss in content and\ncolor in dense-haze scenarios. The recently emerged Denoising Diffusion\nProbabilistic Model (DDPM) exhibits strong generation ability, showing\npotential for solving this problem. However, DDPM fails to consider the physics\nproperty of dehazing task, limiting its information completion capacity. In\nthis work, we propose DehazeDDPM: A DDPM-based and physics-aware image dehazing\nframework that applies to complex hazy scenarios. Specifically, DehazeDDPM\nworks in two stages. The former stage physically models the dehazing task with\nthe Atmospheric Scattering Model (ASM), pulling the distribution closer to the\nclear data and endowing DehazeDDPM with fog-aware ability. The latter stage\nexploits the strong generation ability of DDPM to compensate for the\nhaze-induced huge information loss, by working in conjunction with the physical\nmodelling. 
Extensive experiments demonstrate that our method attains\nstate-of-the-art performance on both synthetic and real-world hazy datasets.\n","authors":["Hu Yu","Jie Huang","Kaiwen Zheng","Man Zhou","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11948v1","updated":"2023-08-23T06:44:44Z","published":"2023-08-23T06:44:44Z","title":"Efficient Transfer Learning in Diffusion Models via Adversarial Noise","summary":" Diffusion Probabilistic Models (DPMs) have demonstrated substantial promise\nin image generation tasks but heavily rely on the availability of large amounts\nof training data. Previous works, like GANs, have tackled the limited data\nproblem by transferring pre-trained models learned with sufficient data.\nHowever, those methods are hard to utilize in DPMs because of the distinct\ndifferences between DPM-based and GAN-based methods, namely the unique\niterative denoising process and the need for many timesteps with\nnon-targeted noise in DPMs. In this paper, we propose a novel DPM-based\ntransfer learning method, TAN, to address the limited data problem. It includes\ntwo strategies: similarity-guided training, which boosts transfer with a\nclassifier, and adversarial noise selection, which adaptively chooses targeted\nnoise based on the input image. Extensive experiments in the context of\nfew-shot image generation tasks demonstrate that our method is not only\nefficient but also excels in terms of image quality and diversity when compared\nto existing GAN-based and DDPM-based methods.\n","authors":["Xiyu Wang","Baijiong Lin","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10632v2","updated":"2023-08-23T06:41:42Z","published":"2023-08-21T11:07:27Z","title":"Foundation Model-oriented Robustness: Robust Image Model Evaluation with\n Pretrained Models","summary":" Machine learning has demonstrated remarkable performance over finite\ndatasets, yet whether the scores over the fixed benchmarks can sufficiently\nindicate the model's performance in the real world is still under discussion. In\nreality, an ideal robust model will probably behave similarly to the oracle\n(e.g., the human users), thus a good evaluation protocol is probably to\nevaluate the models' behaviors in comparison to the oracle. In this paper, we\nintroduce a new robustness measurement that directly measures the image\nclassification model's performance compared with a surrogate oracle (i.e., a\nfoundation model). Besides, we design a simple method that can accomplish the\nevaluation beyond the scope of the benchmarks. Our method extends the image\ndatasets with new samples that are sufficiently perturbed to be distinct from\nthe ones in the original sets, but are still bounded within the same\nimage-label structure the original test image represents, constrained by a\nfoundation model pretrained with a large amount of samples. As a result, our\nnew method will offer us a new way to evaluate the models' robustness\nperformance, free of limitations of fixed benchmarks or constrained\nperturbations, although scoped by the power of the oracle. 
In addition to the\nevaluation results, we also leverage our generated data to understand the\nbehaviors of the model and our new evaluation strategies.\n","authors":["Peiyan Zhang","Haoyang Liu","Chaozhuo Li","Xing Xie","Sunghun Kim","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11945v1","updated":"2023-08-23T06:37:41Z","published":"2023-08-23T06:37:41Z","title":"LongDanceDiff: Long-term Dance Generation with Conditional Diffusion\n Model","summary":" Dancing with music is always an essential human art form to express emotion.\nDue to the high temporal-spacial complexity, long-term 3D realist dance\ngeneration synchronized with music is challenging. Existing methods suffer from\nthe freezing problem when generating long-term dances due to error accumulation\nand training-inference discrepancy. To address this, we design a conditional\ndiffusion model, LongDanceDiff, for this sequence-to-sequence long-term dance\ngeneration, addressing the challenges of temporal coherency and spatial\nconstraint. LongDanceDiff contains a transformer-based diffusion model, where\nthe input is a concatenation of music, past motions, and noised future motions.\nThis partial noising strategy leverages the full-attention mechanism and learns\nthe dependencies among music and past motions. To enhance the diversity of\ngenerated dance motions and mitigate the freezing problem, we introduce a\nmutual information minimization objective that regularizes the dependency\nbetween past and future motions. We also address common visual quality issues\nin dance generation, such as foot sliding and unsmooth motion, by incorporating\nspatial constraints through a Global-Trajectory Modulation (GTM) layer and\nmotion perceptual losses, thereby improving the smoothness and naturalness of\nmotion generation. Extensive experiments demonstrate a significant improvement\nin our approach over the existing state-of-the-art methods. We plan to release\nour codes and models soon.\n","authors":["Siqi Yang","Zejun Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01095v2","updated":"2023-08-23T06:26:56Z","published":"2023-08-02T11:58:43Z","title":"AutoPoster: A Highly Automatic and Content-aware Design System for\n Advertising Poster Generation","summary":" Advertising posters, a form of information presentation, combine visual and\nlinguistic modalities. Creating a poster involves multiple steps and\nnecessitates design experience and creativity. This paper introduces\nAutoPoster, a highly automatic and content-aware system for generating\nadvertising posters. With only product images and titles as inputs, AutoPoster\ncan automatically produce posters of varying sizes through four key stages:\nimage cleaning and retargeting, layout generation, tagline generation, and\nstyle attribute prediction. To ensure visual harmony of posters, two\ncontent-aware models are incorporated for layout and tagline generation.\nMoreover, we propose a novel multi-task Style Attribute Predictor (SAP) to\njointly predict visual style attributes. Meanwhile, to our knowledge, we\npropose the first poster generation dataset that includes visual attribute\nannotations for over 76k posters. 
Qualitative and quantitative outcomes from\nuser studies and experiments substantiate the efficacy of our system and the\naesthetic superiority of the generated posters compared to other poster\ngeneration methods.\n","authors":["Jinpeng Lin","Min Zhou","Ye Ma","Yifan Gao","Chenxi Fei","Yangjian Chen","Zhang Yu","Tiezheng Ge"],"pdf_url":"https://arxiv.org/pdf/2308.01095v2.pdf","comment":"Accepted for ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11941v1","updated":"2023-08-23T06:22:02Z","published":"2023-08-23T06:22:02Z","title":"Boosting Diffusion Models with an Adaptive Momentum Sampler","summary":" Diffusion probabilistic models (DPMs) have been shown to generate\nhigh-quality images without the need for delicate adversarial training.\nHowever, the current sampling process in DPMs is prone to violent shaking. In\nthis paper, we present a novel reverse sampler for DPMs inspired by the\nwidely-used Adam optimizer. Our proposed sampler can be readily applied to a\npre-trained diffusion model, utilizing momentum mechanisms and adaptive\nupdating to smooth the reverse sampling process and ensure stable generation,\nresulting in outputs of enhanced quality. By implicitly reusing update\ndirections from early steps, our proposed sampler achieves a better balance\nbetween high-level semantics and low-level details. Additionally, this sampler\nis flexible and can be easily integrated into pre-trained DPMs regardless of\nthe sampler used during training. Our experimental results on multiple\nbenchmarks demonstrate that our proposed reverse sampler yields remarkable\nimprovements over different baselines. We will make the source code available.\n","authors":["Xiyu Wang","Anh-Dung Dinh","Daochang Liu","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09917v2","updated":"2023-08-23T06:19:28Z","published":"2023-08-19T05:49:13Z","title":"Learning Multiscale Consistency for Self-supervised Electron Microscopy\n Instance Segmentation","summary":" Instance segmentation in electron microscopy (EM) volumes poses a significant\nchallenge due to the complex morphology of instances and insufficient\nannotations. Self-supervised learning has recently emerged as a promising\nsolution, enabling the acquisition of prior knowledge of cellular tissue\nstructures that are essential for EM instance segmentation. However, existing\npretraining methods often lack the ability to capture complex visual patterns\nand relationships between voxels, which results in the acquired prior knowledge\nbeing insufficient for downstream EM analysis tasks. In this paper, we propose\na novel pretraining framework that leverages multiscale visual representations\nto capture both voxel-level and feature-level consistency in EM volumes.\nSpecifically, our framework enforces voxel-level consistency between the\noutputs of a Siamese network by a reconstruction function, and incorporates a\ncross-attention mechanism for soft feature matching to achieve fine-grained\nfeature-level consistency. Moreover, we propose a contrastive learning scheme\non the feature pyramid to extract discriminative features across multiple\nscales. 
We extensively pretrain our method on four large-scale EM datasets,\nachieving promising performance improvements in representative tasks of neuron\nand mitochondria instance segmentation.\n","authors":["Yinda Chen","Wei Huang","Xiaoyu Liu","Qi Chen","Zhiwei Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.09917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11937v1","updated":"2023-08-23T06:07:56Z","published":"2023-08-23T06:07:56Z","title":"Learning Bottleneck Transformer for Event Image-Voxel Feature Fusion\n based Classification","summary":" Recognizing target objects using an event-based camera draws more and more\nattention in recent years. Existing works usually represent the event streams\ninto point-cloud, voxel, image, etc, and learn the feature representations\nusing various deep neural networks. Their final results may be limited by the\nfollowing factors: monotonous modal expressions and the design of the network\nstructure. To address the aforementioned challenges, this paper proposes a\nnovel dual-stream framework for event representation, extraction, and fusion.\nThis framework simultaneously models two common representations: event images\nand event voxels. By utilizing Transformer and Structured Graph Neural Network\n(GNN) architectures, spatial information and three-dimensional stereo\ninformation can be learned separately. Additionally, a bottleneck Transformer\nis introduced to facilitate the fusion of the dual-stream information.\nExtensive experiments demonstrate that our proposed framework achieves\nstate-of-the-art performance on two widely used event-based classification\ndatasets. The source code of this work is available at:\n\\url{https://github.com/Event-AHU/EFV_event_classification}\n","authors":["Chengguo Yuan","Yu Jin","Zongzhen Wu","Fanting Wei","Yangzirui Wang","Lan Chen","Xiao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11937v1.pdf","comment":"Accepted by PRCV-2023"},{"id":"http://arxiv.org/abs/2303.15749v2","updated":"2023-08-23T06:04:56Z","published":"2023-03-28T06:12:53Z","title":"Iteratively Coupled Multiple Instance Learning from Instance to Bag\n Classifier for Whole Slide Image Classification","summary":" Whole Slide Image (WSI) classification remains a challenge due to their\nextremely high resolution and the absence of fine-grained labels. Presently,\nWSI classification is usually regarded as a Multiple Instance Learning (MIL)\nproblem when only slide-level labels are available. MIL methods involve a patch\nembedding module and a bag-level classification module, but they are\nprohibitively expensive to be trained in an end-to-end manner. Therefore,\nexisting methods usually train them separately, or directly skip the training\nof the embedder. Such schemes hinder the patch embedder's access to slide-level\nsemantic labels, resulting in inconsistency within the entire MIL pipeline. To\novercome this issue, we propose a novel framework called Iteratively Coupled\nMIL (ICMIL), which bridges the loss back-propagation process from the bag-level\nclassifier to the patch embedder. In ICMIL, we use category information in the\nbag-level classifier to guide the patch-level fine-tuning of the patch feature\nextractor. The refined embedder then generates better instance representations\nfor achieving a more accurate bag-level classifier. By coupling the patch\nembedder and bag classifier at a low cost, our proposed framework enables\ninformation exchange between the two modules, benefiting the entire MIL\nclassification model. 
We tested our framework on two datasets using three\ndifferent backbones, and our experimental results demonstrate consistent\nperformance improvements over state-of-the-art MIL methods. The code is\navailable at: https://github.com/Dootmaan/ICMIL.\n","authors":["Hongyi Wang","Luyang Luo","Fang Wang","Ruofeng Tong","Yen-Wei Chen","Hongjie Hu","Lanfen Lin","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.15749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11561v2","updated":"2023-08-23T05:53:43Z","published":"2023-08-22T16:45:35Z","title":"Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog\n Navigation","summary":" This report details the method of the winning entry of the AVDN Challenge in\nICCV 2023. The competition addresses the Aerial Navigation from Dialog History\n(ANDH) task, which requires a drone agent to associate dialog history with\naerial observations to reach the destination. For better cross-modal grounding\nabilities of the drone agent, we propose a Target-Grounded Graph-Aware\nTransformer (TG-GAT) framework. Concretely, TG-GAT first leverages a\ngraph-aware transformer to capture spatiotemporal dependency, which benefits\nnavigation state tracking and robust action planning. In addition, an auxiliary\nvisual grounding task is devised to boost the agent's awareness of referred\nlandmarks. Moreover, a hybrid augmentation strategy based on large language\nmodels is utilized to mitigate data scarcity limitations. Our TG-GAT framework\nwon the AVDN Challenge 2023, with 2.2% and 3.0% absolute improvements over the\nbaseline on SPL and SR metrics, respectively. The code is available at\nhttps://github.com/yifeisu/avdn-challenge.\n","authors":["Yifei Su","Dong An","Yuan Xu","Kehan Chen","Yan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.11561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11298v2","updated":"2023-08-23T05:44:57Z","published":"2023-08-22T09:20:55Z","title":"BHSD: A 3D Multi-Class Brain Hemorrhage Segmentation Dataset","summary":" Intracranial hemorrhage (ICH) is a pathological condition characterized by\nbleeding inside the skull or brain, which can be attributed to various factors.\nIdentifying, localizing and quantifying ICH has important clinical\nimplications, in a bleed-dependent manner. While deep learning techniques are\nwidely used in medical image segmentation and have been applied to the ICH\nsegmentation task, existing public ICH datasets do not support the multi-class\nsegmentation problem. To address this, we develop the Brain Hemorrhage\nSegmentation Dataset (BHSD), which provides a 3D multi-class ICH dataset\ncontaining 192 volumes with pixel-level annotations and 2200 volumes with\nslice-level annotations across five categories of ICH. To demonstrate the\nutility of the dataset, we formulate a series of supervised and semi-supervised\nICH segmentation tasks. 
We provide experimental results with state-of-the-art\nmodels as reference benchmarks for further model developments and evaluations\non this dataset.\n","authors":["Biao Wu","Yutong Xie","Zeyu Zhang","Jinchao Ge","Kaspar Yaxley","Suzan Bahadir","Qi Wu","Yifan Liu","Minh-Son To"],"pdf_url":"https://arxiv.org/pdf/2308.11298v2.pdf","comment":"Accepted by MLMI 2023"},{"id":"http://arxiv.org/abs/2308.11932v1","updated":"2023-08-23T05:40:55Z","published":"2023-08-23T05:40:55Z","title":"Synergistic Multiscale Detail Refinement via Intrinsic Supervision for\n Underwater Image Enhancement","summary":" Visual restoration of underwater scenes is crucial for visual tasks, and\navoiding interference from underwater media has become a prominent concern. In\nthis work, we present a synergistic multiscale detail refinement via intrinsic\nsupervision (SMDR-IS) framework to recover underwater scene details. The low-degradation\nstage provides multiscale detail for the original stage, achieving synergistic\nmultiscale detail refinement through feature propagation via the adaptive\nselective intrinsic supervised feature module (ASISF). ASISF is developed using intrinsic\nsupervision to precisely control and guide feature transmission in the\nmulti-degradation stages. ASISF improves the multiscale detail refinement while\nreducing interference from irrelevant scene information from the\nlow-degradation stage. Additionally, within the multi-degradation\nencoder-decoder of SMDR-IS, we introduce a bifocal intrinsic-context attention\nmodule (BICA). This module is designed to effectively leverage multi-scale\nscene information found in images, using intrinsic supervision principles as\nits foundation. BICA facilitates the guidance of higher-resolution spaces by\nleveraging lower-resolution spaces, considering the significant dependency of\nunderwater image restoration on spatial contextual relationships. During the\ntraining process, the network gains advantages from the integration of a\nmulti-degradation loss function. This function serves as a constraint, enabling\nthe network to effectively exploit information across various scales. When\ncompared with state-of-the-art methods, SMDR-IS demonstrates its outstanding\nperformance. Code will be made publicly available.\n","authors":["Dehuan Zhang","Jingchun Zhou","Weishi Zhang","ChunLe Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.11932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16016v2","updated":"2023-08-23T05:39:05Z","published":"2023-06-28T08:44:00Z","title":"Positive Label Is All You Need for Multi-Label Classification","summary":" Multi-label classification (MLC) suffers from the inevitable label noise in\ntraining data due to the difficulty in annotating various semantic labels in\neach image. To mitigate the influence of noisy labels, existing methods mainly\nfocus on identifying and correcting the label mistakes via a trained MLC\nmodel. However, these methods still involve annoying noisy labels in training,\nwhich can result in imprecise recognition of noisy labels and weaken the\nperformance. In this paper, considering that the negative labels are\nsubstantially more numerous than positive labels, and most noisy labels are from the\nnegative labels, we directly discard all the negative labels in the dataset,\nand propose a new method dubbed positive and unlabeled multi-label\nclassification (PU-MLC). 
By extending positive-unlabeled learning to the MLC\ntask, our method trains the model with only positive labels and unlabeled data, and\nintroduces an adaptive re-balance factor and an adaptive temperature coefficient in\nthe loss function to alleviate the catastrophic imbalance in label distribution\nand over-smoothing of probabilities in training. Furthermore, to capture both\nlocal and global dependencies in the image, we also introduce a local-global\nconvolution module, which supplements global information into existing\nconvolution layers with no retraining of the backbone required. Our PU-MLC is\nsimple and effective, and it is applicable to both MLC and MLC with partial\nlabels (MLC-PL) tasks. Extensive experiments on MS-COCO and PASCAL VOC datasets\ndemonstrate that our PU-MLC achieves significant improvements in both MLC and\nMLC-PL settings with even fewer annotations. Code will be released.\n","authors":["Zhixiang Yuan","Kaixin Zhang","Tao Huang"],"pdf_url":"https://arxiv.org/pdf/2306.16016v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11928v1","updated":"2023-08-23T05:32:24Z","published":"2023-08-23T05:32:24Z","title":"OFVL-MS: Once for Visual Localization across Multiple Indoor Scenes","summary":" In this work, we seek to predict camera poses across scenes in a multi-task\nlearning manner, where we view the localization of each scene as a new task. We\npropose OFVL-MS, a unified framework that dispenses with the traditional\npractice of training a model for each individual scene and relieves gradient\nconflict induced by optimizing multiple scenes collectively, enabling efficient\nstorage yet precise visual localization for all scenes. Technically, in the\nforward pass of OFVL-MS, we design a layer-adaptive sharing policy with a\nlearnable score for each layer to automatically determine whether the layer is\nshared or not. Such a sharing policy empowers us to acquire task-shared\nparameters for a reduction of storage cost and task-specific parameters for\nlearning scene-related features to alleviate gradient conflict. In the backward\npass of OFVL-MS, we introduce a gradient normalization algorithm that\nhomogenizes the gradient magnitude of the task-shared parameters so that all\ntasks converge at the same pace. Furthermore, a sparse penalty loss is applied\non the learnable scores to facilitate parameter sharing for all tasks without\nperformance degradation. We conduct comprehensive experiments on multiple\nbenchmarks and our newly released indoor dataset LIVL, showing that OFVL-MS\nfamilies significantly outperform the state of the art with fewer parameters.\nWe also verify that OFVL-MS can generalize to a new scene with much fewer\nparameters while gaining superior localization performance.\n","authors":["Tao Xie","Kun Dai","Siyi Lu","Ke Wang","Zhiqiang Jiang","Jinghan Gao","Dedong Liu","Jie Xu","Lijun Zhao","Ruifeng Li"],"pdf_url":"https://arxiv.org/pdf/2308.11928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11927v1","updated":"2023-08-23T05:26:27Z","published":"2023-08-23T05:26:27Z","title":"Recovering a Molecule's 3D Dynamics from Liquid-phase Electron\n Microscopy Movies","summary":" The dynamics of biomolecules are crucial for our understanding of their\nfunctioning in living systems. However, current 3D imaging techniques, such as\ncryogenic electron microscopy (cryo-EM), require freezing the sample, which\nlimits the observation of their conformational changes in real time. 
The\ninnovative liquid-phase electron microscopy (liquid-phase EM) technique allows\nmolecules to be placed in the native liquid environment, providing a unique\nopportunity to observe their dynamics. In this paper, we propose TEMPOR, a\nTemporal Electron MicroscoPy Object Reconstruction algorithm for liquid-phase\nEM that leverages an implicit neural representation (INR) and a dynamical\nvariational auto-encoder (DVAE) to recover time series of molecular structures.\nWe demonstrate its advantages in recovering different motion dynamics from two\nsimulated datasets, 7bcq and Cas9. To our knowledge, our work is the first\nattempt to directly recover 3D structures of a temporally-varying particle from\nliquid-phase EM movies. It provides a promising new approach for studying\nmolecules' 3D dynamics in structural biology.\n","authors":["Enze Ye","Yuhang Wang","Hong Zhang","Yiqin Gao","Huan Wang","He Sun"],"pdf_url":"https://arxiv.org/pdf/2308.11927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09991v2","updated":"2023-08-23T05:19:03Z","published":"2023-08-19T11:52:12Z","title":"AltDiffusion: A Multilingual Text-to-Image Diffusion Model","summary":" Large Text-to-Image(T2I) diffusion models have shown a remarkable capability\nto produce photorealistic and diverse images based on text inputs. However,\nexisting works only support limited language input, e.g., English, Chinese, and\nJapanese, leaving users beyond these languages underserved and blocking the\nglobal expansion of T2I models. Therefore, this paper presents AltDiffusion, a\nnovel multilingual T2I diffusion model that supports eighteen different\nlanguages. Specifically, we first train a multilingual text encoder based on\nthe knowledge distillation. Then we plug it into a pretrained English-only\ndiffusion model and train the model with a two-stage schema to enhance the\nmultilingual capability, including concept alignment and quality improvement\nstage on a large-scale multilingual dataset. Furthermore, we introduce a new\nbenchmark, which includes Multilingual-General-18(MG-18) and\nMultilingual-Cultural-18(MC-18) datasets, to evaluate the capabilities of T2I\ndiffusion models for generating high-quality images and capturing\nculture-specific concepts in different languages. Experimental results on both\nMG-18 and MC-18 demonstrate that AltDiffusion outperforms current\nstate-of-the-art T2I models, e.g., Stable Diffusion in multilingual\nunderstanding, especially with respect to culture-specific concepts, while\nstill having comparable capability for generating high-quality images. All\nsource code and checkpoints could be found in\nhttps://github.com/superhero-7/AltDiffuson.\n","authors":["Fulong Ye","Guang Liu","Xinya Wu","Ledell Wu"],"pdf_url":"https://arxiv.org/pdf/2308.09991v2.pdf","comment":"15 pages; 17 figures"},{"id":"http://arxiv.org/abs/2306.14122v3","updated":"2023-08-23T05:04:58Z","published":"2023-06-25T04:33:56Z","title":"Chain-of-Thought Prompt Distillation for Multimodal Named Entity\n Recognition and Multimodal Relation Extraction","summary":" Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction\n(MRE) necessitate the fundamental reasoning capacity for intricate linguistic\nand multimodal comprehension. In this study, we explore distilling the\nreasoning ability of large language models (LLMs) into a more compact student\nmodel by generating a \\textit{chain of thought} (CoT) -- a sequence of\nintermediate reasoning steps. 
Specifically, we commence by exemplifying the\nelicitation of such reasoning ability from LLMs through CoT prompts covering\nmulti-grain (noun, sentence, multimodality) and data-augmentation (style,\nentity, image) dimensions. Subsequently, we present a novel conditional prompt\ndistillation method to assimilate the commonsense reasoning ability from LLMs,\nthereby enhancing the utility of the student model in addressing text-only\ninputs without the requisite addition of image and CoT knowledge. Extensive\nexperiments reveal that our approach attains state-of-the-art accuracy and\nmanifests a plethora of advantages concerning interpretability, data\nefficiency, and cross-domain generalization on MNER and MRE datasets.\n","authors":["Feng Chen","Yujian Feng"],"pdf_url":"https://arxiv.org/pdf/2306.14122v3.pdf","comment":"modification"},{"id":"http://arxiv.org/abs/2308.11920v1","updated":"2023-08-23T05:04:01Z","published":"2023-08-23T05:04:01Z","title":"Concept Bottleneck with Visual Concept Filtering for Explainable Medical\n Image Classification","summary":" Interpretability is a crucial factor in building reliable models for various\nmedical applications. Concept Bottleneck Models (CBMs) enable interpretable\nimage classification by utilizing human-understandable concepts as intermediate\ntargets. Unlike conventional methods that require extensive human labor to\nconstruct the concept set, recent works leveraging Large Language Models (LLMs)\nfor generating concepts made automatic concept generation possible. However,\nthose methods do not consider whether a concept is visually relevant or not,\nwhich is an important factor in computing meaningful concept scores. Therefore,\nwe propose a visual activation score that measures whether the concept contains\nvisual cues or not, which can be easily computed with unlabeled image data.\nComputed visual activation scores are then used to filter out the less visible\nconcepts, thus resulting in a final concept set with visually meaningful\nconcepts. Our experimental results show that adopting the proposed visual\nactivation score for concept filtering consistently boosts performance compared\nto the baseline. Moreover, qualitative analyses also validate that visually\nrelevant concepts are successfully selected with the visual activation score.\n","authors":["Injae Kim","Jongha Kim","Joonmyung Choi","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11920v1.pdf","comment":"Accepted to MedAGI Workshop at MICCAI 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2308.11918v1","updated":"2023-08-23T05:03:45Z","published":"2023-08-23T05:03:45Z","title":"AMSP-UOD: When Vortex Convolution and Stochastic Perturbation Meet\n Underwater Object Detection","summary":" In this paper, we present a novel Amplitude-Modulated Stochastic Perturbation\nand Vortex Convolutional Network, AMSP-UOD, designed for underwater object\ndetection. AMSP-UOD specifically addresses the impact of non-ideal imaging\nfactors on detection accuracy in complex underwater environments. To mitigate\nthe influence of noise on object detection performance, we propose AMSP Vortex\nConvolution (AMSP-VConv) to disrupt the noise distribution, enhance feature\nextraction capabilities, effectively reduce parameters, and improve network\nrobustness. 
We design the Feature Association Decoupling Cross Stage Partial\n(FAD-CSP) module, which strengthens the association of long and short-range\nfeatures, improving the network performance in complex underwater environments.\nAdditionally, our sophisticated post-processing method, based on non-maximum\nsuppression with aspect-ratio similarity thresholds, optimizes detection in\ndense scenes, such as waterweed and schools of fish, improving object detection\naccuracy. Extensive experiments on the URPC and RUOD datasets demonstrate that\nour method outperforms existing state-of-the-art methods in terms of accuracy\nand noise immunity. AMSP-UOD proposes an innovative solution with the potential\nfor real-world applications. Code will be made publicly available.\n","authors":["Jingchun Zhou","Zongxin He","Kin-Man Lam","Yudong Wang","Weishi Zhang","ChunLe Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.11918v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11917v1","updated":"2023-08-23T05:03:06Z","published":"2023-08-23T05:03:06Z","title":"LFS-GAN: Lifelong Few-Shot Image Generation","summary":" We address a challenging lifelong few-shot image generation task for the\nfirst time. In this situation, a generative model learns a sequence of tasks\nusing only a few samples per task. Consequently, the learned model encounters\nboth catastrophic forgetting and overfitting problems at a time. Existing\nstudies on lifelong GANs have proposed modulation-based methods to prevent\ncatastrophic forgetting. However, they require considerable additional\nparameters and cannot generate high-fidelity and diverse images from limited\ndata. On the other hand, the existing few-shot GANs suffer from severe\ncatastrophic forgetting when learning multiple tasks. To alleviate these\nissues, we propose a framework called Lifelong Few-Shot GAN (LFS-GAN) that can\ngenerate high-quality and diverse images in lifelong few-shot image generation\ntask. Our proposed framework learns each task using an efficient task-specific\nmodulator - Learnable Factorized Tensor (LeFT). LeFT is rank-constrained and\nhas a rich representation ability due to its unique reconstruction technique.\nFurthermore, we propose a novel mode seeking loss to improve the diversity of\nour model in low-data circumstances. Extensive experiments demonstrate that the\nproposed LFS-GAN can generate high-fidelity and diverse images without any\nforgetting and mode collapse in various domains, achieving state-of-the-art in\nlifelong few-shot image generation task. Surprisingly, we find that our LFS-GAN\neven outperforms the existing few-shot GANs in the few-shot image generation\ntask. The code is available at Github.\n","authors":["Juwon Seo","Ji-Su Kang","Gyeong-Moon Park"],"pdf_url":"https://arxiv.org/pdf/2308.11917v1.pdf","comment":"20 pages, 19 figures, 14 tables, ICCV 2023 Poster"},{"id":"http://arxiv.org/abs/2308.11916v1","updated":"2023-08-23T05:02:17Z","published":"2023-08-23T05:02:17Z","title":"Semantic-Aware Implicit Template Learning via Part Deformation\n Consistency","summary":" Learning implicit templates as neural fields has recently shown impressive\nperformance in unsupervised shape correspondence. Despite the success, we\nobserve current approaches, which solely rely on geometric information, often\nlearn suboptimal deformation across generic object shapes, which have high\nstructural variability. 
In this paper, we highlight the importance of part\ndeformation consistency and propose a semantic-aware implicit template learning\nframework to enable semantically plausible deformation. By leveraging semantic\nprior from a self-supervised feature extractor, we suggest local conditioning\nwith novel semantic-aware deformation code and deformation consistency\nregularizations regarding part deformation, global deformation, and global\nscaling. Our extensive experiments demonstrate the superiority of the proposed\nmethod over baselines in various tasks: keypoint transfer, part label transfer,\nand texture transfer. More interestingly, our framework shows a larger\nperformance gain under more challenging settings. We also provide qualitative\nanalyses to validate the effectiveness of semantic-aware deformation. The code\nis available at https://github.com/mlvlab/PDC.\n","authors":["Sihyeon Kim","Minseok Joo","Jaewon Lee","Juyeon Ko","Juhan Cha","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11916v1.pdf","comment":"ICCV camera-ready version"},{"id":"http://arxiv.org/abs/2308.11911v1","updated":"2023-08-23T04:52:48Z","published":"2023-08-23T04:52:48Z","title":"ACLS: Adaptive and Conditional Label Smoothing for Network Calibration","summary":" We address the problem of network calibration adjusting miscalibrated\nconfidences of deep neural networks. Many approaches to network calibration\nadopt a regularization-based method that exploits a regularization term to\nsmooth the miscalibrated confidences. Although these approaches have shown the\neffectiveness on calibrating the networks, there is still a lack of\nunderstanding on the underlying principles of regularization in terms of\nnetwork calibration. We present in this paper an in-depth analysis of existing\nregularization-based methods, providing a better understanding on how they\naffect to network calibration. Specifically, we have observed that 1) the\nregularization-based methods can be interpreted as variants of label smoothing,\nand 2) they do not always behave desirably. Based on the analysis, we introduce\na novel loss function, dubbed ACLS, that unifies the merits of existing\nregularization methods, while avoiding the limitations. We show extensive\nexperimental results for image classification and semantic segmentation on\nstandard benchmarks, including CIFAR10, Tiny-ImageNet, ImageNet, and PASCAL\nVOC, demonstrating the effectiveness of our loss function.\n","authors":["Hyekang Park","Jongyoun Noh","Youngmin Oh","Donghyeon Baek","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11911v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02237v2","updated":"2023-08-23T04:40:45Z","published":"2023-08-04T10:26:59Z","title":"MSECNet: Accurate and Robust Normal Estimation for 3D Point Clouds by\n Multi-Scale Edge Conditioning","summary":" Estimating surface normals from 3D point clouds is critical for various\napplications, including surface reconstruction and rendering. While existing\nmethods for normal estimation perform well in regions where normals change\nslowly, they tend to fail where normals vary rapidly. To address this issue, we\npropose a novel approach called MSECNet, which improves estimation in normal\nvarying regions by treating normal variation modeling as an edge detection\nproblem. MSECNet consists of a backbone network and a multi-scale edge\nconditioning (MSEC) stream. The MSEC stream achieves robust edge detection\nthrough multi-scale feature fusion and adaptive edge detection. 
The detected\nedges are then combined with the output of the backbone network using the edge\nconditioning module to produce edge-aware representations. Extensive\nexperiments show that MSECNet outperforms existing methods on both synthetic\n(PCPNet) and real-world (SceneNN) datasets while running significantly faster.\nWe also conduct various analyses to investigate the contribution of each\ncomponent in the MSEC stream. Finally, we demonstrate the effectiveness of our\napproach in surface reconstruction.\n","authors":["Haoyi Xiu","Xin Liu","Weimin Wang","Kyoung-Sook Kim","Masashi Matsuoka"],"pdf_url":"https://arxiv.org/pdf/2308.02237v2.pdf","comment":"Accepted for ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11909v1","updated":"2023-08-23T04:29:40Z","published":"2023-08-23T04:29:40Z","title":"Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data","summary":" Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial\ndependence between different brain regions, and the graph pooling operator in\nGCNs is key to enhancing the representation learning capability and acquiring\nabnormal brain maps. However, the majority of existing research designs graph\npooling operators only from the perspective of nodes while disregarding the\noriginal edge features, in a way that not only confines graph pooling\napplication scenarios, but also diminishes its ability to capture critical\nsubstructures. In this study, a clustering graph pooling method that first\nsupports multidimensional edge features, called Edge-aware hard clustering\ngraph pooling (EHCPool), is developed. EHCPool proposes the first\n'Edge-to-node' score evaluation criterion based on edge features to assess node\nfeature significance. To more effectively capture the critical subgraphs, a\nnovel Iteration n-top strategy is further designed to adaptively learn sparse\nhard clustering assignments for graphs. Subsequently, an innovative N-E\nAggregation strategy is presented to aggregate node and edge feature\ninformation in each independent subgraph. The proposed model was evaluated on\nmulti-site brain imaging public datasets and yielded state-of-the-art\nperformance. We believe this method is the first deep learning tool with the\npotential to probe different types of abnormal functional brain networks from\ndata-driven perspective.\n","authors":["Cheng Zhu","Jiayi Zhu","Lijuan Zhang","Xi Wu","Shuqi Yang","Ping Liang","Honghan Chen","Ying Tan"],"pdf_url":"https://arxiv.org/pdf/2308.11909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08942v2","updated":"2023-08-23T04:12:26Z","published":"2023-03-15T21:22:21Z","title":"Spherical Space Feature Decomposition for Guided Depth Map\n Super-Resolution","summary":" Guided depth map super-resolution (GDSR), as a hot topic in multi-modal image\nprocessing, aims to upsample low-resolution (LR) depth maps with additional\ninformation involved in high-resolution (HR) RGB images from the same scene.\nThe critical step of this task is to effectively extract domain-shared and\ndomain-private RGB/depth features. In addition, three detailed issues, namely\nblurry edges, noisy surfaces, and over-transferred RGB texture, need to be\naddressed. In this paper, we propose the Spherical Space feature Decomposition\nNetwork (SSDNet) to solve the above issues. To better model cross-modality\nfeatures, Restormer block-based RGB/depth encoders are employed for extracting\nlocal-global features. 
Then, the extracted features are mapped to the spherical\nspace to complete the separation of private features and the alignment of\nshared features. Shared features of RGB are fused with the depth features to\ncomplete the GDSR task. Subsequently, a spherical contrast refinement (SCR)\nmodule is proposed to further address the detail issues. Patches that are\nclassified according to imperfect categories are input into the SCR module,\nwhere the patch features are pulled closer to the ground truth and pushed away\nfrom the corresponding imperfect samples in the spherical feature space via\ncontrastive learning. Extensive experiments demonstrate that our method can\nachieve state-of-the-art results on four test datasets, as well as successfully\ngeneralize to real-world scenes. The code is available at\n\\url{https://github.com/Zhaozixiang1228/GDSR-SSDNet}.\n","authors":["Zixiang Zhao","Jiangshe Zhang","Xiang Gu","Chengli Tan","Shuang Xu","Yulun Zhang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2303.08942v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11903v1","updated":"2023-08-23T04:08:53Z","published":"2023-08-23T04:08:53Z","title":"Rethinking Data Perturbation and Model Stabilization for Semi-supervised\n Medical Image Segmentation","summary":" Studies on semi-supervised medical image segmentation (SSMIS) have seen fast\nprogress recently. Due to the limited labelled data, SSMIS methods mainly focus\non effectively leveraging unlabeled data to enhance the segmentation\nperformance. However, despite their promising performance, current\nstate-of-the-art methods often prioritize integrating complex techniques and\nloss terms rather than addressing the core challenges of semi-supervised\nscenarios directly. We argue that the key to SSMIS lies in generating\nsubstantial and appropriate prediction disagreement on unlabeled data. To this\nend, we emphasize the crucial role of data perturbation and model stabilization\nin semi-supervised segmentation, and propose a simple yet effective approach to\nboost SSMIS performance significantly, dubbed DPMS. Specifically, we first\nrevisit SSMIS from three distinct perspectives: the data, the model, and the\nloss, and conduct a comprehensive study of corresponding strategies to examine\ntheir effectiveness. Based on these examinations, we then propose DPMS, which\nadopts a plain teacher-student framework with a standard supervised loss and an\nunsupervised consistency loss. To produce appropriate prediction disagreements,\nDPMS perturbs the unlabeled data via strong augmentations to enlarge prediction\ndisagreements considerably. On the other hand, using an EMA teacher when strong\naugmentation is applied does not necessarily improve performance. DPMS further\nutilizes forwarding-twice and momentum updating strategies for normalization\nstatistics to stabilize the training on unlabeled data effectively. Despite its\nsimplicity, DPMS can obtain new state-of-the-art performance on the public 2D\nACDC and 3D LA datasets across various semi-supervised settings, e.g. 
obtaining\na remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels.\n","authors":["Zhen Zhao","Ye Liu","Meng Zhao","Di Yin","Yixuan Yuan","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11903v1.pdf","comment":"Code and logs are available at https://github.com/ZhenZHAO/DPMS"},{"id":"http://arxiv.org/abs/2308.11901v1","updated":"2023-08-23T04:01:56Z","published":"2023-08-23T04:01:56Z","title":"Camera-Driven Representation Learning for Unsupervised Domain Adaptive\n Person Re-identification","summary":" We present a novel unsupervised domain adaption method for person\nre-identification (reID) that generalizes a model trained on a labeled source\ndomain to an unlabeled target domain. We introduce a camera-driven curriculum\nlearning (CaCL) framework that leverages camera labels of person images to\ntransfer knowledge from source to target domains progressively. To this end, we\ndivide target domain dataset into multiple subsets based on the camera labels,\nand initially train our model with a single subset (i.e., images captured by a\nsingle camera). We then gradually exploit more subsets for training, according\nto a curriculum sequence obtained with a camera-driven scheduling rule. The\nscheduler considers maximum mean discrepancies (MMD) between each subset and\nthe source domain dataset, such that the subset closer to the source domain is\nexploited earlier within the curriculum. For each curriculum sequence, we\ngenerate pseudo labels of person images in a target domain to train a reID\nmodel in a supervised way. We have observed that the pseudo labels are highly\nbiased toward cameras, suggesting that person images obtained from the same\ncamera are likely to have the same pseudo labels, even for different IDs. To\naddress the camera bias problem, we also introduce a camera-diversity (CD) loss\nencouraging person images of the same pseudo label, but captured across various\ncameras, to involve more for discriminative feature learning, providing person\nrepresentations robust to inter-camera variations. Experimental results on\nstandard benchmarks, including real-to-real and synthetic-to-real scenarios,\ndemonstrate the effectiveness of our framework.\n","authors":["Geon Lee","Sanghoon Lee","Dohyung Kim","Younghoon Shin","Yongsang Yoon","Bumsub Ham"],"pdf_url":"https://arxiv.org/pdf/2308.11901v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11900v1","updated":"2023-08-23T04:01:54Z","published":"2023-08-23T04:01:54Z","title":"HashReID: Dynamic Network with Binary Codes for Efficient Person\n Re-identification","summary":" Biometric applications, such as person re-identification (ReID), are often\ndeployed on energy constrained devices. While recent ReID methods prioritize\nhigh retrieval performance, they often come with large computational costs and\nhigh search time, rendering them less practical in real-world settings. In this\nwork, we propose an input-adaptive network with multiple exit blocks, that can\nterminate computation early if the retrieval is straightforward or noisy,\nsaving a lot of computation. To assess the complexity of the input, we\nintroduce a temporal-based classifier driven by a new training strategy.\nFurthermore, we adopt a binary hash code generation approach instead of relying\non continuous-valued features, which significantly improves the search process\nby a factor of 20. 
To ensure similarity preservation, we utilize a new ranking\nregularizer that bridges the gap between continuous and binary features.\nExtensive analysis of our proposed method is conducted on three datasets:\nMarket1501, MSMT17 (Multi-Scene Multi-Time), and the BGC1 (BRIAR Government\nCollection). Using our approach, more than 70% of the samples with compact hash\ncodes exit early on the Market1501 dataset, saving 80% of the networks\ncomputational cost and improving over other hash-based methods by 60%. These\nresults demonstrate a significant improvement over dynamic networks and\nshowcase comparable accuracy performance to conventional ReID methods. Code\nwill be made available.\n","authors":["Kshitij Nikhal","Yujunrong Ma","Shuvra S. Bhattacharyya","Benjamin S. Riggan"],"pdf_url":"https://arxiv.org/pdf/2308.11900v1.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2302.06845v2","updated":"2023-08-23T03:56:24Z","published":"2023-02-14T05:47:45Z","title":"SEAM: Searching Transferable Mixed-Precision Quantization Policy through\n Large Margin Regularization","summary":" Mixed-precision quantization (MPQ) suffers from the time-consuming process of\nsearching the optimal bit-width allocation i.e., the policy) for each layer,\nespecially when using large-scale datasets such as ISLVRC-2012. This limits the\npracticality of MPQ in real-world deployment scenarios. To address this issue,\nthis paper proposes a novel method for efficiently searching for effective MPQ\npolicies using a small proxy dataset instead of the large-scale dataset used\nfor training the model. Deviating from the established norm of employing a\nconsistent dataset for both model training and MPQ policy search stages, our\napproach, therefore, yields a substantial enhancement in the efficiency of MPQ\nexploration. Nonetheless, using discrepant datasets poses challenges in\nsearching for a transferable MPQ policy. Driven by the observation that\nquantization noise of sub-optimal policy exerts a detrimental influence on the\ndiscriminability of feature representations -- manifesting as diminished class\nmargins and ambiguous decision boundaries -- our method aims to identify\npolicies that uphold the discriminative nature of feature representations,\ni.e., intra-class compactness and inter-class separation. This general and\ndataset-independent property makes us search for the MPQ policy over a rather\nsmall-scale proxy dataset and then the policy can be directly used to quantize\nthe model trained on a large-scale dataset. Our method offers several\nadvantages, including high proxy data utilization, no excessive hyper-parameter\ntuning, and high searching efficiency. We search high-quality MPQ policies with\nthe proxy dataset that has only 4% of the data scale compared to the\nlarge-scale target dataset, achieving the same accuracy as searching directly\non the latter, improving MPQ searching efficiency by up to 300 times.\n","authors":["Chen Tang","Kai Ouyang","Zenghao Chai","Yunpeng Bai","Yuan Meng","Zhi Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.06845v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11898v1","updated":"2023-08-23T03:46:04Z","published":"2023-08-23T03:46:04Z","title":"Exploring the Optimization Objective of One-Class Classification for\n Anomaly Detection","summary":" One-class classification (OCC) is a longstanding method for anomaly\ndetection. 
With the powerful representation capability of the pre-trained\nbackbone, OCC methods have witnessed significant performance improvements.\nTypically, most of these OCC methods employ transfer learning to enhance the\ndiscriminative nature of the pre-trained backbone's features, thus achieving\nremarkable efficacy. While most current approaches emphasize feature transfer\nstrategies, we argue that the optimization objective space within OCC methods\ncould also be an underlying critical factor influencing performance. In this\nwork, we conducted a thorough investigation into the optimization objective of\nOCC. Through rigorous theoretical analysis and derivation, we unveil a key\ninsight: any space with a suitable norm can serve as an equivalent\nsubstitute for the hypersphere center, without relying on the distribution\nassumption of training samples. Further, we provide guidelines for determining\nthe feasible domain of norms for the OCC optimization objective. This novel\ninsight sparks a simple and data-agnostic deep one-class classification method.\nOur method is straightforward, with a single 1x1 convolutional layer as a\ntrainable projector and any space with a suitable norm as the optimization\nobjective. Extensive experiments validate the reliability and efficacy of our\nfindings and the corresponding methodology, resulting in state-of-the-art\nperformance in both one-class classification and industrial vision anomaly\ndetection and segmentation tasks.\n","authors":["Han Gao","Huiyuan Luo","Fei Shen","Zhengtao Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11898v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.11896v1","updated":"2023-08-23T03:43:34Z","published":"2023-08-23T03:43:34Z","title":"Age Prediction From Face Images Via Contrastive Learning","summary":" This paper presents a novel approach for accurately estimating age from face\nimages, which overcomes the challenge of collecting a large dataset of\nindividuals with the same identity at different ages. Instead, we leverage\nreadily available face datasets of different people at different ages and aim\nto extract age-related features using contrastive learning. Our method\nemphasizes these relevant features while suppressing identity-related features\nusing a combination of cosine similarity and triplet margin losses. We\ndemonstrate the effectiveness of our proposed approach by achieving\nstate-of-the-art performance on two public datasets, FG-NET and MORPH-II.\n","authors":["Yeongnam Chae","Poulami Raha","Mijung Kim","Bjorn Stenger"],"pdf_url":"https://arxiv.org/pdf/2308.11896v1.pdf","comment":"MVA2023"},{"id":"http://arxiv.org/abs/2308.11894v1","updated":"2023-08-23T03:40:47Z","published":"2023-08-23T03:40:47Z","title":"Does Physical Adversarial Example Really Matter to Autonomous Driving?\n Towards System-Level Effect of Adversarial Object Evasion Attack","summary":" In autonomous driving (AD), accurate perception is indispensable to achieving\nsafe and secure driving. Due to its safety-criticality, the security of AD\nperception has been widely studied. Among different attacks on AD perception,\nthe physical adversarial object evasion attacks are especially severe. However,\nwe find that all existing literature only evaluates their attack effect at the\ntargeted AI component level but not at the system level, i.e., with the entire\nsystem semantics and context such as the full AD pipeline. 
Thereby, this raises\na critical research question: can these existing researches effectively achieve\nsystem-level attack effects (e.g., traffic rule violations) in the real-world\nAD context? In this work, we conduct the first measurement study on whether and\nhow effectively the existing designs can lead to system-level effects,\nespecially for the STOP sign-evasion attacks due to their popularity and\nseverity. Our evaluation results show that all the representative prior works\ncannot achieve any system-level effects. We observe two design limitations in\nthe prior works: 1) physical model-inconsistent object size distribution in\npixel sampling and 2) lack of vehicle plant model and AD system model\nconsideration. Then, we propose SysAdv, a novel system-driven attack design in\nthe AD context and our evaluation results show that the system-level effects\ncan be significantly improved, i.e., the violation rate increases by around\n70%.\n","authors":["Ningfei Wang","Yunpeng Luo","Takami Sato","Kaidi Xu","Qi Alfred Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11894v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.16680v4","updated":"2023-08-23T03:28:30Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v4.pdf","comment":"Draft Version"},{"id":"http://arxiv.org/abs/2210.01055v3","updated":"2023-08-23T03:24:13Z","published":"2022-10-03T16:13:14Z","title":"CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth\n Pre-training","summary":" Pre-training across 3D vision and language remains under development because\nof limited training data. Recent works attempt to transfer vision-language\npre-training models to 3D vision. PointCLIP converts point cloud data to\nmulti-view depth maps, adopting CLIP for shape classification. However, its\nperformance is restricted by the domain gap between rendered depth maps and\nimages, as well as the diversity of depth distributions. To address this issue,\nwe propose CLIP2Point, an image-depth pre-training method by contrastive\nlearning to transfer CLIP to the 3D domain, and adapt it to point cloud\nclassification. 
We introduce a new depth rendering setting that forms a better\nvisual effect, and then render 52,460 pairs of images and depth maps from\nShapeNet for pre-training. The pre-training scheme of CLIP2Point combines\ncross-modality learning to enforce the depth features for capturing expressive\nvisual and textual features and intra-modality learning to enhance the\ninvariance of depth aggregation. Additionally, we propose a novel Dual-Path\nAdapter (DPA) module, i.e., a dual-path structure with simplified adapters for\nfew-shot learning. The dual-path structure allows the joint use of CLIP and\nCLIP2Point, and the simplified adapter can well fit few-shot tasks without\npost-search. Experimental results show that CLIP2Point is effective in\ntransferring CLIP knowledge to 3D vision. Our CLIP2Point outperforms PointCLIP\nand other self-supervised 3D networks, achieving state-of-the-art results on\nzero-shot and few-shot classification.\n","authors":["Tianyu Huang","Bowen Dong","Yunhan Yang","Xiaoshui Huang","Rynson W. H. Lau","Wanli Ouyang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2210.01055v3.pdf","comment":"Accepted by ICCV2023"},{"id":"http://arxiv.org/abs/2308.11887v1","updated":"2023-08-23T03:20:31Z","published":"2023-08-23T03:20:31Z","title":"A Unified Framework for 3D Point Cloud Visual Grounding","summary":" 3D point cloud visual grounding plays a critical role in 3D scene\ncomprehension, encompassing 3D referring expression comprehension (3DREC) and\nsegmentation (3DRES). We argue that 3DREC and 3DRES should be unified in one\nframework, which is also a natural progression in the community. To explain,\n3DREC can help 3DRES locate the referent, while 3DRES can also facilitate 3DREC\nvia more finegrained language-visual alignment. To achieve this, this paper\ntakes the initiative step to integrate 3DREC and 3DRES into a unified\nframework, termed 3D Referring Transformer (3DRefTR). Its key idea is to build\nupon a mature 3DREC model and leverage ready query embeddings and visual tokens\nfrom the 3DREC model to construct a dedicated mask branch. Specially, we\npropose Superpoint Mask Branch, which serves a dual purpose: i) By leveraging\nthe heterogeneous CPU-GPU parallelism, while the GPU is occupied generating\nvisual tokens, the CPU concurrently produces superpoints, equivalently\naccomplishing the upsampling computation; ii) By harnessing on the inherent\nassociation between the superpoints and point cloud, it eliminates the heavy\ncomputational overhead on the high-resolution visual features for upsampling.\nThis elegant design enables 3DRefTR to achieve both well-performing 3DRES and\n3DREC capacities with only a 6% additional latency compared to the original\n3DREC model. 
Empirical evaluations affirm the superiority of 3DRefTR.\nSpecifically, on the ScanRefer dataset, 3DRefTR surpasses the state-of-the-art\n3DRES method by 12.43% in mIoU and improves upon the SOTA 3DREC method by 0.6%\nAcc@0.25IoU.\n","authors":["Haojia Lin","Yongdong Luo","Xiawu Zheng","Lijiang Li","Fei Chao","Taisong Jin","Donghao Luo","Chengjie Wang","Yan Wang","Liujuan Cao"],"pdf_url":"https://arxiv.org/pdf/2308.11887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11579v2","updated":"2023-08-23T03:07:49Z","published":"2023-03-21T04:00:47Z","title":"Diffusion-Based 3D Human Pose Estimation with Multi-Hypothesis\n Aggregation","summary":" In this paper, a novel Diffusion-based 3D Pose estimation (D3DP) method with\nJoint-wise reProjection-based Multi-hypothesis Aggregation (JPMA) is proposed\nfor probabilistic 3D human pose estimation. On the one hand, D3DP generates\nmultiple possible 3D pose hypotheses for a single 2D observation. It gradually\ndiffuses the ground truth 3D poses to a random distribution, and learns a\ndenoiser conditioned on 2D keypoints to recover the uncontaminated 3D poses.\nThe proposed D3DP is compatible with existing 3D pose estimators and supports\nusers to balance efficiency and accuracy during inference through two\ncustomizable parameters. On the other hand, JPMA is proposed to assemble\nmultiple hypotheses generated by D3DP into a single 3D pose for practical use.\nIt reprojects 3D pose hypotheses to the 2D camera plane, selects the best\nhypothesis joint-by-joint based on the reprojection errors, and combines the\nselected joints into the final pose. The proposed JPMA conducts aggregation at\nthe joint level and makes use of the 2D prior information, both of which have\nbeen overlooked by previous approaches. Extensive experiments on Human3.6M and\nMPI-INF-3DHP datasets show that our method outperforms the state-of-the-art\ndeterministic and probabilistic approaches by 1.5% and 8.9%, respectively. Code\nis available at https://github.com/paTRICK-swk/D3DP.\n","authors":["Wenkang Shan","Zhenhua Liu","Xinfeng Zhang","Zhao Wang","Kai Han","Shanshe Wang","Siwei Ma","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2303.11579v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.02989v2","updated":"2023-08-23T02:58:53Z","published":"2023-08-06T02:15:19Z","title":"Novel Class Discovery for Long-tailed Recognition","summary":" While the novel class discovery has recently made great progress, existing\nmethods typically focus on improving algorithms on class-balanced benchmarks.\nHowever, in real-world recognition tasks, the class distributions of their\ncorresponding datasets are often imbalanced, which leads to serious performance\ndegeneration of those methods. In this paper, we consider a more realistic\nsetting for novel class discovery where the distributions of novel and known\nclasses are long-tailed. One main challenge of this new problem is to discover\nimbalanced novel classes with the help of long-tailed known classes. To tackle\nthis problem, we propose an adaptive self-labeling strategy based on an\nequiangular prototype representation of classes. Our method infers high-quality\npseudo-labels for the novel classes by solving a relaxed optimal transport\nproblem and effectively mitigates the class biases in learning the known and\nnovel classes. We perform extensive experiments on CIFAR100, ImageNet100,\nHerbarium19 and large-scale iNaturalist18 datasets, and the results demonstrate\nthe superiority of our method. 
Our code is available at\nhttps://github.com/kleinzcy/NCDLR.\n","authors":["Zhang Chuyu","Xu Ruijie","He Xuming"],"pdf_url":"https://arxiv.org/pdf/2308.02989v2.pdf","comment":"TMLR2023, Final version"},{"id":"http://arxiv.org/abs/2308.11880v1","updated":"2023-08-23T02:57:58Z","published":"2023-08-23T02:57:58Z","title":"SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal\n Targets","summary":" Scene understanding using multi-modal data is necessary in many applications,\ne.g., autonomous navigation. To achieve this in a variety of situations,\nexisting models must be able to adapt to shifting data distributions without\narduous data annotation. Current approaches assume that the source data is\navailable during adaptation and that the source consists of paired multi-modal\ndata. Both these assumptions may be problematic for many applications. Source\ndata may not be available due to privacy, security, or economic concerns.\nAssuming the existence of paired multi-modal data for training also entails\nsignificant data collection costs and fails to take advantage of widely\navailable freely distributed pre-trained uni-modal models. In this work, we\nrelax both of these assumptions by addressing the problem of adapting a set of\nmodels trained independently on uni-modal data to a target domain consisting of\nunlabeled multi-modal data, without having access to the original source\ndataset. Our proposed approach solves this problem through a switching\nframework which automatically chooses between two complementary methods of\ncross-modal pseudo-label fusion -- agreement filtering and entropy weighting --\nbased on the estimated domain gap. We demonstrate our work on the semantic\nsegmentation problem. Experiments across seven challenging adaptation scenarios\nverify the efficacy of our approach, achieving results comparable to, and in\nsome cases outperforming, methods which assume access to source data. Our\nmethod achieves an improvement in mIoU of up to 12% over competing baselines.\nOur code is publicly available at https://github.com/csimo005/SUMMIT.\n","authors":["Cody Simons","Dripta S. Raychaudhuri","Sk Miraj Ahmed","Suya You","Konstantinos Karydis","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2308.11880v1.pdf","comment":"12 pages, 5 figures, 9 tables, ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11877v1","updated":"2023-08-23T02:49:22Z","published":"2023-08-23T02:49:22Z","title":"Integrated Image and Location Analysis for Wound Classification: A Deep\n Learning Approach","summary":" The global burden of acute and chronic wounds presents a compelling case for\nenhancing wound classification methods, a vital step in diagnosing and\ndetermining optimal treatments. Recognizing this need, we introduce an\ninnovative multi-modal network based on a deep convolutional neural network for\ncategorizing wounds into four categories: diabetic, pressure, surgical, and\nvenous ulcers. Our multi-modal network uses wound images and their\ncorresponding body locations for more precise classification. A unique aspect\nof our methodology is incorporating a body map system that facilitates accurate\nwound location tagging, improving upon traditional wound image classification\ntechniques. A distinctive feature of our approach is the integration of models\nsuch as VGG16, ResNet152, and EfficientNet within a novel architecture. 
This\narchitecture includes elements like spatial and channel-wise\nSqueeze-and-Excitation modules, Axial Attention, and an Adaptive Gated\nMulti-Layer Perceptron, providing a robust foundation for classification. Our\nmulti-modal network was trained and evaluated on two distinct datasets\ncomprising relevant images and corresponding location information. Notably, our\nproposed network outperformed traditional methods, reaching an accuracy range\nof 74.79% to 100% for Region of Interest (ROI) without location\nclassifications, 73.98% to 100% for ROI with location classifications, and\n78.10% to 100% for whole image classifications. This marks a significant\nenhancement over previously reported performance metrics in the literature. Our\nresults indicate the potential of our multi-modal network as an effective\ndecision-support tool for wound image classification, paving the way for its\napplication in various clinical contexts.\n","authors":["Yash Patel","Tirth Shah","Mrinal Kanti Dhar","Taiyu Zhang","Jeffrey Niezgoda","Sandeep Gopalkrishnan","Zeyun Yu"],"pdf_url":"https://arxiv.org/pdf/2308.11877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11875v1","updated":"2023-08-23T02:40:51Z","published":"2023-08-23T02:40:51Z","title":"Motion-to-Matching: A Mixed Paradigm for 3D Single Object Tracking","summary":" 3D single object tracking with LiDAR points is an important task in the\ncomputer vision field. Previous methods usually adopt the matching-based or\nmotion-centric paradigms to estimate the current target status. However, the\nformer is sensitive to the similar distractors and the sparseness of point\ncloud due to relying on appearance matching, while the latter usually focuses\non short-term motion clues (eg. two frames) and ignores the long-term motion\npattern of target. To address these issues, we propose a mixed paradigm with\ntwo stages, named MTM-Tracker, which combines motion modeling with feature\nmatching into a single network. Specifically, in the first stage, we exploit\nthe continuous historical boxes as motion prior and propose an encoder-decoder\nstructure to locate target coarsely. Then, in the second stage, we introduce a\nfeature interaction module to extract motion-aware features from consecutive\npoint clouds and match them to refine target movement as well as regress other\ntarget states. Extensive experiments validate that our paradigm achieves\ncompetitive performance on large-scale datasets (70.9% in KITTI and 51.70% in\nNuScenes). The code will be open soon at\nhttps://github.com/LeoZhiheng/MTM-Tracker.git.\n","authors":["Zhiheng Li","Yu Lin","Yubo Cui","Shuo Li","Zheng Fang"],"pdf_url":"https://arxiv.org/pdf/2308.11875v1.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.11874v1","updated":"2023-08-23T02:37:34Z","published":"2023-08-23T02:37:34Z","title":"Semi-Supervised Learning via Weight-aware Distillation under Class\n Distribution Mismatch","summary":" Semi-Supervised Learning (SSL) under class distribution mismatch aims to\ntackle a challenging problem wherein unlabeled data contain lots of unknown\ncategories unseen in the labeled ones. In such mismatch scenarios, traditional\nSSL suffers severe performance damage due to the harmful invasion of the\ninstances with unknown categories into the target classifier. 
In this study, by\nstrict mathematical reasoning, we reveal that the SSL error under class\ndistribution mismatch is composed of pseudo-labeling error and invasion error,\nboth of which jointly bound the SSL population risk. To alleviate the SSL\nerror, we propose a robust SSL framework called Weight-Aware Distillation (WAD)\nthat, by weights, selectively transfers knowledge beneficial to the target task\nfrom unsupervised contrastive representation to the target classifier.\nSpecifically, WAD captures adaptive weights and high-quality pseudo labels to\ntarget instances by exploring point mutual information (PMI) in representation\nspace to maximize the role of unlabeled data and filter unknown categories.\nTheoretically, we prove that WAD has a tight upper bound of population risk\nunder class distribution mismatch. Experimentally, extensive results\ndemonstrate that WAD outperforms five state-of-the-art SSL approaches and one\nstandard baseline on two benchmark datasets, CIFAR10 and CIFAR100, and an\nartificial cross-dataset. The code is available at\nhttps://github.com/RUC-DWBI-ML/research/tree/main/WAD-master.\n","authors":["Pan Du","Suyun Zhao","Zisen Sheng","Cuiping Li","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11874v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2305.19301v2","updated":"2023-08-23T02:18:51Z","published":"2023-05-30T14:24:40Z","title":"On the Choice of Perception Loss Function for Learned Video Compression","summary":" We study causal, low-latency, sequential video compression when the output is\nsubjected to both a mean squared-error (MSE) distortion loss as well as a\nperception loss to target realism. Motivated by prior approaches, we consider\ntwo different perception loss functions (PLFs). The first, PLF-JD, considers\nthe joint distribution (JD) of all the video frames up to the current one,\nwhile the second metric, PLF-FMD, considers the framewise marginal\ndistributions (FMD) between the source and reconstruction. Using information\ntheoretic analysis and deep-learning based experiments, we demonstrate that the\nchoice of PLF can have a significant effect on the reconstruction, especially\nat low-bit rates. In particular, while the reconstruction based on PLF-JD can\nbetter preserve the temporal correlation across frames, it also imposes a\nsignificant penalty in distortion compared to PLF-FMD and further makes it more\ndifficult to recover from errors made in the earlier output frames. Although\nthe choice of PLF decisively affects reconstruction quality, we also\ndemonstrate that it may not be essential to commit to a particular PLF during\nencoding and the choice of PLF can be delegated to the decoder. In particular,\nencoded representations generated by training a system to minimize the MSE\n(without requiring either PLF) can be {\\em near universal} and can generate\nclose to optimal reconstructions for either choice of PLF at the decoder. 
We\nvalidate our results using (one-shot) information-theoretic analysis, detailed\nstudy of the rate-distortion-perception tradeoff of the Gauss-Markov source\nmodel as well as deep-learning based experiments on moving MNIST and KTH\ndatasets.\n","authors":["Sadaf Salehkalaibar","Buu Phan","Jun Chen","Wei Yu","Ashish Khisti"],"pdf_url":"https://arxiv.org/pdf/2305.19301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11857v1","updated":"2023-08-23T01:19:58Z","published":"2023-08-23T01:19:58Z","title":"CoC-GAN: Employing Context Cluster for Unveiling a New Pathway in Image\n Generation","summary":" Image generation tasks are traditionally undertaken using Convolutional\nNeural Networks (CNN) or Transformer architectures for feature aggregating and\ndispatching. Despite the frequent application of convolution and attention\nstructures, these structures are not fundamentally required to solve the\nproblem of instability and the lack of interpretability in image generation. In\nthis paper, we propose a unique image generation process premised on the\nperspective of converting images into a set of point clouds. In other words, we\ninterpret an image as a set of points. As such, our methodology leverages\nsimple clustering methods named Context Clustering (CoC) to generate images\nfrom unordered point sets, which defies the convention of using convolution or\nattention mechanisms. Hence, we exclusively depend on this clustering\ntechnique, combined with the multi-layer perceptron (MLP) in a generative\nmodel. Furthermore, we implement the integration of a module termed the 'Point\nIncreaser' for the model. This module is just an MLP tasked with generating\nadditional points for clustering, which are subsequently integrated within the\nparadigm of the Generative Adversarial Network (GAN). We introduce this model\nwith the novel structure as the Context Clustering Generative Adversarial\nNetwork (CoC-GAN), which offers a distinctive viewpoint in the domain of\nfeature aggregating and dispatching. Empirical evaluations affirm that our\nCoC-GAN, devoid of convolution and attention mechanisms, exhibits outstanding\nperformance. Its interpretability, endowed by the CoC module, also allows for\nvisualization in our experiments. The promising results underscore the\nfeasibility of our method and thus warrant future investigations of applying\nContext Clustering to more novel and interpretable image generation.\n","authors":["Zihao Wang","Yiming Huang","Ziyu Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05184v2","updated":"2023-08-23T01:14:30Z","published":"2022-06-10T15:25:00Z","title":"SERE: Exploring Feature Self-relation for Self-supervised Transformer","summary":" Learning representations with self-supervision for convolutional networks\n(CNN) has been validated to be effective for vision tasks. As an alternative to\nCNN, vision transformers (ViT) have strong representation ability with spatial\nself-attention and channel-level feedforward networks. Recent works reveal that\nself-supervised learning helps unleash the great potential of ViT. Still, most\nworks follow self-supervised strategies designed for CNN, e.g., instance-level\ndiscrimination of samples, but they ignore the properties of ViT. We observe\nthat relational modeling on spatial and channel dimensions distinguishes ViT\nfrom other networks. To enforce this property, we explore the feature\nSElf-RElation (SERE) for training self-supervised ViT. 
Specifically, instead of\nconducting self-supervised learning solely on feature embeddings from multiple\nviews, we utilize the feature self-relations, i.e., spatial/channel\nself-relations, for self-supervised learning. Self-relation based learning\nfurther enhances the relation modeling ability of ViT, resulting in stronger\nrepresentations that stably improve performance on multiple downstream tasks.\nOur source code will be made publicly available.\n","authors":["Zhong-Yu Li","Shanghua Gao","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2206.05184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08345v3","updated":"2023-08-23T01:10:43Z","published":"2023-08-16T13:10:32Z","title":"GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel\n Image Segmentation","summary":" Vessel image segmentation plays a pivotal role in medical diagnostics, aiding\nin the early detection and treatment of vascular diseases. While segmentation\nbased on deep learning has shown promising results, effectively segmenting\nsmall structures and maintaining connectivity between them remains challenging.\nTo address these limitations, we propose GAEI-UNet, a novel model that combines\nglobal attention and elastic interaction-based techniques. GAEI-UNet leverages\nglobal spatial and channel context information to enhance high-level semantic\nunderstanding within the U-Net architecture, enabling precise segmentation of\nsmall vessels. Additionally, we adopt an elastic interaction-based loss\nfunction to improve connectivity among these fine structures. By capturing the\nforces generated by misalignment between target and predicted shapes, our model\neffectively learns to preserve the correct topology of vessel networks.\nEvaluation on retinal vessel dataset -- DRIVE demonstrates the superior\nperformance of GAEI-UNet in terms of SE and connectivity of small structures,\nwithout significantly increasing computational complexity. This research aims\nto advance the field of vessel image segmentation, providing more accurate and\nreliable diagnostic tools for the medical community. The implementation code is\navailable on Code.\n","authors":["Ruiqiang Xiao","Zhuoyue Wan"],"pdf_url":"https://arxiv.org/pdf/2308.08345v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2004.03696 by other authors"},{"id":"http://arxiv.org/abs/2308.11840v1","updated":"2023-08-23T00:17:50Z","published":"2023-08-23T00:17:50Z","title":"Compressed Models Decompress Race Biases: What Quantized Models Forget\n for Fair Face Recognition","summary":" With the ever-growing complexity of deep learning models for face\nrecognition, it becomes hard to deploy these systems in real life. Researchers\nhave two options: 1) use smaller models; 2) compress their current models.\nSince the usage of smaller models might lead to concerning biases, compression\ngains relevance. However, compressing might be also responsible for an increase\nin the bias of the final model. We investigate the overall performance, the\nperformance on each ethnicity subgroup and the racial bias of a\nState-of-the-Art quantization approach when used with synthetic and real data.\nThis analysis provides a few more details on potential benefits of performing\nquantization with synthetic data, for instance, the reduction of biases on the\nmajority of test scenarios. We tested five distinct architectures and three\ndifferent training datasets. 
The models were evaluated on a fourth dataset\nwhich was collected to infer and compare the performance of face recognition\nmodels on different ethnicity.\n","authors":["Pedro C. Neto","Eduarda Caldeira","Jaime S. Cardoso","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2308.11840v1.pdf","comment":"Accepted for Oral at BIOSIG 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2308.12256v1","updated":"2023-08-23T17:16:07Z","published":"2023-08-23T17:16:07Z","title":"Learning from Negative User Feedback and Measuring Responsiveness for\n Sequential Recommenders","summary":" Sequential recommenders have been widely used in industry due to their\nstrength in modeling user preferences. While these models excel at learning a\nuser's positive interests, less attention has been paid to learning from\nnegative user feedback. Negative user feedback is an important lever of user\ncontrol, and comes with an expectation that recommenders should respond quickly\nand reduce similar recommendations to the user. However, negative feedback\nsignals are often ignored in the training objective of sequential retrieval\nmodels, which primarily aim at predicting positive user interactions. In this\nwork, we incorporate explicit and implicit negative user feedback into the\ntraining objective of sequential recommenders in the retrieval stage using a\n\"not-to-recommend\" loss function that optimizes for the log-likelihood of not\nrecommending items with negative feedback. We demonstrate the effectiveness of\nthis approach using live experiments on a large-scale industrial recommender\nsystem. Furthermore, we address a challenge in measuring recommender\nresponsiveness to negative feedback by developing a counterfactual simulation\nframework to compare recommender responses between different user actions,\nshowing improved responsiveness from the modeling change.\n","authors":["Yueqi Wang","Yoni Halpern","Shuo Chang","Jingchen Feng","Elaine Ya Le","Longfei Li","Xujian Liang","Min-Cheng Huang","Shane Li","Alex Beutel","Yaping Zhang","Shuchao Bi"],"pdf_url":"https://arxiv.org/pdf/2308.12256v1.pdf","comment":"RecSys 2023 Industry Track"},{"id":"http://arxiv.org/abs/2308.12241v1","updated":"2023-08-23T16:32:54Z","published":"2023-08-23T16:32:54Z","title":"LLMRec: Benchmarking Large Language Models on Recommendation Task","summary":" Recently, the fast development of Large Language Models (LLMs) such as\nChatGPT has significantly advanced NLP tasks by enhancing the capabilities of\nconversational models. However, the application of LLMs in the recommendation\ndomain has not been thoroughly investigated. To bridge this gap, we propose\nLLMRec, a LLM-based recommender system designed for benchmarking LLMs on\nvarious recommendation tasks. Specifically, we benchmark several popular\noff-the-shelf LLMs, such as ChatGPT, LLaMA, ChatGLM, on five recommendation\ntasks, including rating prediction, sequential recommendation, direct\nrecommendation, explanation generation, and review summarization. Furthermore,\nwe investigate the effectiveness of supervised finetuning to improve LLMs'\ninstruction compliance ability. The benchmark results indicate that LLMs\ndisplayed only moderate proficiency in accuracy-based tasks such as sequential\nand direct recommendation. However, they demonstrated comparable performance to\nstate-of-the-art methods in explainability-based tasks. 
We also conduct\nqualitative evaluations to further evaluate the quality of contents generated\nby different models, and the results show that LLMs can truly understand the\nprovided information and generate clearer and more reasonable results. We\naspire that this benchmark will serve as an inspiration for researchers to\ndelve deeper into the potential of LLMs in enhancing recommendation\nperformance. Our codes, processed data and benchmark results are available at\nhttps://github.com/williamliujl/LLMRec.\n","authors":["Junling Liu","Chao Liu","Peilin Zhou","Qichen Ye","Dading Chong","Kang Zhou","Yueqi Xie","Yuwei Cao","Shoujin Wang","Chenyu You","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12083v1","updated":"2023-08-23T12:05:48Z","published":"2023-08-23T12:05:48Z","title":"Counterfactual Graph Augmentation for Consumer Unfairness Mitigation in\n Recommender Systems","summary":" In recommendation literature, explainability and fairness are becoming two\nprominent perspectives to consider. However, prior works have mostly addressed\nthem separately, for instance by explaining to consumers why a certain item was\nrecommended or mitigating disparate impacts in recommendation utility. None of\nthem has leveraged explainability techniques to inform unfairness mitigation.\nIn this paper, we propose an approach that relies on counterfactual\nexplanations to augment the set of user-item interactions, such that using them\nwhile inferring recommendations leads to fairer outcomes. Modeling user-item\ninteractions as a bipartite graph, our approach augments the latter by\nidentifying new user-item edges that not only can explain the original\nunfairness by design, but can also mitigate it. Experiments on two public data\nsets show that our approach effectively leads to a better trade-off between\nfairness and recommendation utility compared with state-of-the-art mitigation\nprocedures. We further analyze the characteristics of added edges to highlight\nkey unfairness patterns. Source code available at\nhttps://github.com/jackmedda/RS-BGExplainer/tree/cikm2023.\n","authors":["Ludovico Boratto","Francesco Fabbri","Gianni Fenu","Mirko Marras","Giacomo Medda"],"pdf_url":"https://arxiv.org/pdf/2308.12083v1.pdf","comment":"Accepted as a short paper at CIKM 2023"},{"id":"http://arxiv.org/abs/2308.12039v1","updated":"2023-08-23T09:56:59Z","published":"2023-08-23T09:56:59Z","title":"Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep\n Learning Track","summary":" Large-scale text retrieval technology has been widely used in various\npractical business scenarios. This paper presents our systems for the TREC 2022\nDeep Learning Track. We explain the hybrid text retrieval and multi-stage text\nranking method adopted in our solution. The retrieval stage combined the two\nstructures of traditional sparse retrieval and neural dense retrieval. In the\nranking stage, in addition to the full interaction-based ranking model built on\nlarge pre-trained language model, we also proposes a lightweight sub-ranking\nmodule to further enhance the final text ranking performance. Evaluation\nresults demonstrate the effectiveness of our proposed approach. 
Our models\nachieve the 1st and 4th rank on the test set of passage ranking and document\nranking respectively.\n","authors":["Guangwei Xu","Yangzhao Zhang","Longhui Zhang","Dingkun Long","Pengjun Xie","Ruijie Guo"],"pdf_url":"https://arxiv.org/pdf/2308.12039v1.pdf","comment":"TREC 2022 Deep Learning Track"},{"id":"http://arxiv.org/abs/2308.12028v1","updated":"2023-08-23T09:39:18Z","published":"2023-08-23T09:39:18Z","title":"LKPNR: LLM and KG for Personalized News Recommendation Framework","summary":" Accurately recommending candidate news articles to users is a basic challenge\nfaced by personalized news recommendation systems. Traditional methods are\nusually difficult to grasp the complex semantic information in news texts,\nresulting in unsatisfactory recommendation results. Besides, these traditional\nmethods are more friendly to active users with rich historical behaviors.\nHowever, they can not effectively solve the \"long tail problem\" of inactive\nusers. To address these issues, this research presents a novel general\nframework that combines Large Language Models (LLM) and Knowledge Graphs (KG)\ninto semantic representations of traditional methods. In order to improve\nsemantic understanding in complex news texts, we use LLMs' powerful text\nunderstanding ability to generate news representations containing rich semantic\ninformation. In addition, our method combines the information about news\nentities and mines high-order structural information through multiple hops in\nKG, thus alleviating the challenge of long tail distribution. Experimental\nresults demonstrate that compared with various traditional models, the\nframework significantly improves the recommendation effect. The successful\nintegration of LLM and KG in our framework has established a feasible path for\nachieving more accurate personalized recommendations in the news field. Our\ncode is available at https://github.com/Xuan-ZW/LKPNR.\n","authors":["Chen hao","Xie Runfeng","Cui Xiangyang","Yan Zhou","Wang Xin","Xuan Zhanwei","Zhang Kai"],"pdf_url":"https://arxiv.org/pdf/2308.12028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11998v1","updated":"2023-08-23T08:35:59Z","published":"2023-08-23T08:35:59Z","title":"Economic Recommender Systems -- A Systematic Review","summary":" Many of today's online services provide personalized recommendations to their\nusers. Such recommendations are typically designed to serve certain user needs,\ne.g., to quickly find relevant content in situations of information overload.\nCorrespondingly, the academic literature in the field largely focuses on the\nvalue of recommender systems for the end user. In this context, one underlying\nassumption is that the improved service that is achieved through the\nrecommendations will in turn positively impact the organization's goals, e.g.,\nin the form of higher customer retention or loyalty. However, in reality,\nrecommender systems can be used to target organizational economic goals more\ndirectly by incorporating monetary considerations such as price awareness and\nprofitability aspects into the underlying recommendation models. In this work,\nwe survey the existing literature on what we call Economic Recommender Systems\nbased on a systematic review approach that helped us identify 133 relevant\npapers. 
We first categorize existing works along different dimensions and then\nreview the most important technical approaches from the literature.\nFurthermore, we discuss common methodologies to evaluate such systems and\nfinally outline the limitations of today's research and future directions.\n","authors":["Alvise De Biasio","Nicolò Navarin","Dietmar Jannach"],"pdf_url":"https://arxiv.org/pdf/2308.11998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01792v3","updated":"2023-08-23T07:43:03Z","published":"2023-06-01T08:10:03Z","title":"Task Relation-aware Continual User Representation Learning","summary":" User modeling, which learns to represent users into a low-dimensional\nrepresentation space based on their past behaviors, got a surge of interest\nfrom the industry for providing personalized services to users. Previous\nefforts in user modeling mainly focus on learning a task-specific user\nrepresentation that is designed for a single task. However, since learning\ntask-specific user representations for every task is infeasible, recent studies\nintroduce the concept of universal user representation, which is a more\ngeneralized representation of a user that is relevant to a variety of tasks.\nDespite their effectiveness, existing approaches for learning universal user\nrepresentations are impractical in real-world applications due to the data\nrequirement, catastrophic forgetting and the limited learning capability for\ncontinually added tasks. In this paper, we propose a novel continual user\nrepresentation learning method, called TERACON, whose learning capability is\nnot limited as the number of learned tasks increases while capturing the\nrelationship between the tasks. The main idea is to introduce an embedding for\neach task, i.e., task embedding, which is utilized to generate task-specific\nsoft masks that not only allow the entire model parameters to be updated until\nthe end of training sequence, but also facilitate the relationship between the\ntasks to be captured. Moreover, we introduce a novel knowledge retention module\nwith pseudo-labeling strategy that successfully alleviates the long-standing\nproblem of continual learning, i.e., catastrophic forgetting. Extensive\nexperiments on public and proprietary real-world datasets demonstrate the\nsuperiority and practicality of TERACON. Our code is available at\nhttps://github.com/Sein-Kim/TERACON.\n","authors":["Sein Kim","Namkyeong Lee","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2306.01792v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2211.06924v3","updated":"2023-08-23T04:02:28Z","published":"2022-11-13T15:11:03Z","title":"A Tale of Two Graphs: Freezing and Denoising Graph Structures for\n Multimodal Recommendation","summary":" Multimodal recommender systems utilizing multimodal features (e.g., images\nand textual descriptions) typically show better recommendation accuracy than\ngeneral recommendation models based solely on user-item interactions.\nGenerally, prior work fuses multimodal features into item ID embeddings to\nenrich item representations, thus failing to capture the latent semantic\nitem-item structures. In this context, LATTICE proposes to learn the latent\nstructure between items explicitly and achieves state-of-the-art performance\nfor multimodal recommendations. However, we argue the latent graph structure\nlearning of LATTICE is both inefficient and unnecessary. 
Experimentally, we\ndemonstrate that freezing its item-item structure before training can also\nachieve competitive performance. Based on this finding, we propose a simple yet\neffective model, dubbed as FREEDOM, that FREEzes the item-item graph and\nDenOises the user-item interaction graph simultaneously for Multimodal\nrecommendation. Theoretically, we examine the design of FREEDOM through a graph\nspectral perspective and demonstrate that it possesses a tighter upper bound on\nthe graph spectrum. In denoising the user-item interaction graph, we devise a\ndegree-sensitive edge pruning method, which rejects possibly noisy edges with a\nhigh probability when sampling the graph. We evaluate the proposed model on\nthree real-world datasets and show that FREEDOM can significantly outperform\ncurrent strongest baselines. Compared with LATTICE, FREEDOM achieves an average\nimprovement of 19.07% in recommendation accuracy while reducing its memory cost\nup to 6$\\times$ on large graphs. The source code is available at:\nhttps://github.com/enoche/FREEDOM.\n","authors":["Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2211.06924v3.pdf","comment":"Accepted to ACM Multimedia (MM) 2023"},{"id":"http://arxiv.org/abs/2308.11884v1","updated":"2023-08-23T03:03:14Z","published":"2023-08-23T03:03:14Z","title":"Integrating the Wikidata Taxonomy into YAGO","summary":" Wikidata is one of the largest public general-purpose Knowledge Bases (KBs).\nYet, due to its collaborative nature, its schema and taxonomy have become\nconvoluted. For the YAGO 4 KB, we combined Wikidata with the ontology from\nSchema.org, which reduced and cleaned up the taxonomy and constraints and made\nit possible to run automated reasoners on the data. However, it also cut away\nlarge parts of the Wikidata taxonomy. In this paper, we present our effort to\nmerge the entire Wikidata taxonomy into the YAGO KB as much as possible. We pay\nparticular attention to logical constraints and a careful distinction of\nclasses and instances. Our work creates YAGO 4.5, which adds a rich layer of\ninformative classes to YAGO, while at the same time keeping the KB logically\nconsistent.\n","authors":["Fabian Suchanek","Mehwish Alam","Thomas Bonald","Pierre-Henri Paris","Jules Soria"],"pdf_url":"https://arxiv.org/pdf/2308.11884v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2308.12284v1","updated":"2023-08-23T17:58:14Z","published":"2023-08-23T17:58:14Z","title":"D4: Improving LLM Pretraining via Document De-Duplication and\n Diversification","summary":" Over recent years, an increasing amount of compute and data has been poured\ninto training large language models (LLMs), usually by doing one-pass learning\non as many tokens as possible randomly selected from large-scale web corpora.\nWhile training on ever-larger portions of the internet leads to consistent\nperformance improvements, the size of these improvements diminishes with scale,\nand there has been little work exploring the effect of data selection on\npre-training and downstream performance beyond simple de-duplication methods\nsuch as MinHash. Here, we show that careful data selection (on top of\nde-duplicated data) via pre-trained model embeddings can speed up training (20%\nefficiency gains) and improves average downstream accuracy on 16 NLP tasks (up\nto 2%) at the 6.7B model scale. Furthermore, we show that repeating data\nintelligently consistently outperforms baseline training (while repeating\nrandom data performs worse than baseline training). 
Our results indicate that\nclever data selection can significantly improve LLM pre-training, calls into\nquestion the common practice of training for a single epoch on as much data as\npossible, and demonstrates a path to keep improving our models past the limits\nof randomly sampling web data.\n","authors":["Kushal Tirumala","Daniel Simig","Armen Aghajanyan","Ari S. Morcos"],"pdf_url":"https://arxiv.org/pdf/2308.12284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12280v1","updated":"2023-08-23T17:50:57Z","published":"2023-08-23T17:50:57Z","title":"Extended Linear Regression: A Kalman Filter Approach for Minimizing Loss\n via Area Under the Curve","summary":" This research enhances linear regression models by integrating a Kalman\nfilter and analysing curve areas to minimize loss. The goal is to develop an\noptimal linear regression equation using stochastic gradient descent (SGD) for\nweight updating. Our approach involves a stepwise process, starting with\nuser-defined parameters. The linear regression model is trained using SGD,\ntracking weights and loss separately and zipping them finally. A Kalman filter\nis then trained based on weight and loss arrays to predict the next\nconsolidated weights. Predictions result from multiplying input averages with\nweights, evaluated for loss to form a weight-versus-loss curve. The curve's\nequation is derived using the two-point formula, and area under the curve is\ncalculated via integration. The linear regression equation with minimum area\nbecomes the optimal curve for prediction. Benefits include avoiding constant\nweight updates via gradient descent and working with partial datasets, unlike\nmethods needing the entire set. However, computational complexity should be\nconsidered. The Kalman filter's accuracy might diminish beyond a certain\nprediction range.\n","authors":["Gokulprasath R"],"pdf_url":"https://arxiv.org/pdf/2308.12280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12279v1","updated":"2023-08-23T17:50:50Z","published":"2023-08-23T17:50:50Z","title":"On-Manifold Projected Gradient Descent","summary":" This work provides a computable, direct, and mathematically rigorous\napproximation to the differential geometry of class manifolds for\nhigh-dimensional data, along with nonlinear projections from input space onto\nthese class manifolds. The tools are applied to the setting of neural network\nimage classifiers, where we generate novel, on-manifold data samples, and\nimplement a projected gradient descent algorithm for on-manifold adversarial\ntraining. The susceptibility of neural networks (NNs) to adversarial attack\nhighlights the brittle nature of NN decision boundaries in input space.\nIntroducing adversarial examples during training has been shown to reduce the\nsusceptibility of NNs to adversarial attack; however, it has also been shown to\nreduce the accuracy of the classifier if the examples are not valid examples\nfor that class. Realistic \"on-manifold\" examples have been previously generated\nfrom class manifolds in the latent of an autoencoder. Our work explores these\nphenomena in a geometric and computational setting that is much closer to the\nraw, high-dimensional input space than can be provided by VAE or other black\nbox dimensionality reductions. We employ conformally invariant diffusion maps\n(CIDM) to approximate class manifolds in diffusion coordinates, and develop the\nNystr\\\"{o}m projection to project novel points onto class manifolds in this\nsetting. 
On top of the manifold approximation, we leverage the spectral\nexterior calculus (SEC) to determine geometric quantities such as tangent\nvectors of the manifold. We use these tools to obtain adversarial examples that\nreside on a class manifold, yet fool a classifier. These misclassifications\nthen become explainable in terms of human-understandable manipulations within\nthe data, by expressing the on-manifold adversary in the semantic basis on the\nmanifold.\n","authors":["Aaron Mahler","Tyrus Berry","Tom Stephens","Harbir Antil","Michael Merritt","Jeanie Schreiber","Ioannis Kevrekidis"],"pdf_url":"https://arxiv.org/pdf/2308.12279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12270v1","updated":"2023-08-23T17:37:51Z","published":"2023-08-23T17:37:51Z","title":"Language Reward Modulation for Pretraining Reinforcement Learning","summary":" Using learned reward functions (LRFs) as a means to solve sparse-reward\nreinforcement learning (RL) tasks has yielded some steady progress in\ntask-complexity through the years. In this work, we question whether today's\nLRFs are best-suited as a direct replacement for task rewards. Instead, we\npropose leveraging the capabilities of LRFs as a pretraining signal for RL.\nConcretely, we propose $\\textbf{LA}$nguage Reward $\\textbf{M}$odulated\n$\\textbf{P}$retraining (LAMP) which leverages the zero-shot capabilities of\nVision-Language Models (VLMs) as a $\\textit{pretraining}$ utility for RL as\nopposed to a downstream task reward. LAMP uses a frozen, pretrained VLM to\nscalably generate noisy, albeit shaped exploration rewards by computing the\ncontrastive alignment between a highly diverse collection of language\ninstructions and the image observations of an agent in its pretraining\nenvironment. LAMP optimizes these rewards in conjunction with standard\nnovelty-seeking exploration rewards with reinforcement learning to acquire a\nlanguage-conditioned, pretrained policy. Our VLM pretraining approach, which is\na departure from previous attempts to use LRFs, can warmstart sample-efficient\nlearning on robot manipulation tasks in RLBench.\n","authors":["Ademi Adeniji","Amber Xie","Carmelo Sferrazza","Younggyo Seo","Stephen James","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2308.12270v1.pdf","comment":"Code available at https://github.com/ademiadeniji/lamp"},{"id":"http://arxiv.org/abs/2308.11601v2","updated":"2023-08-23T17:34:17Z","published":"2023-08-22T17:48:24Z","title":"Tryage: Real-time, intelligent Routing of User Prompts to Large Language\n Models","summary":" The introduction of the transformer architecture and the self-attention\nmechanism has led to an explosive production of language models trained on\nspecific downstream tasks and data domains. With over 200, 000 models in the\nHugging Face ecosystem, users grapple with selecting and optimizing models to\nsuit multifaceted workflows and data domains while addressing computational,\nsecurity, and recency concerns. There is an urgent need for machine learning\nframeworks that can eliminate the burden of model selection and customization\nand unleash the incredible power of the vast emerging model library for end\nusers. Here, we propose a context-aware routing system, Tryage, that leverages\na language model router for optimal selection of expert models from a model\nlibrary based on analysis of individual input prompts. 
Inspired by the thalamic\nrouter in the brain, Tryage employs a perceptive router to predict down-stream\nmodel performance on prompts and, then, makes a routing decision using an\nobjective function that integrates performance predictions with user goals and\nconstraints that are incorporated through flags (e.g., model size, model\nrecency). Tryage allows users to explore a Pareto front and automatically\ntrade-off between task accuracy and secondary goals including minimization of\nmodel size, recency, security, verbosity, and readability. Across heterogeneous\ndata sets that include code, text, clinical data, and patents, the Tryage\nframework surpasses Gorilla and GPT3.5 turbo in dynamic model selection\nidentifying the optimal model with an accuracy of 50.9% , compared to 23.6% by\nGPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how\nrouting models can be applied to program and control the behavior of\nmulti-model LLM systems to maximize efficient use of the expanding and evolving\nlanguage model ecosystem.\n","authors":["Surya Narayanan Hari","Matt Thomson"],"pdf_url":"https://arxiv.org/pdf/2308.11601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12264v1","updated":"2023-08-23T17:32:06Z","published":"2023-08-23T17:32:06Z","title":"FECoM: A Step towards Fine-Grained Energy Measurement for Deep Learning","summary":" With the increasing usage, scale, and complexity of Deep Learning (DL)\nmodels, their rapidly growing energy consumption has become a critical concern.\nPromoting green development and energy awareness at different granularities is\nthe need of the hour to limit carbon emissions of DL systems. However, the lack\nof standard and repeatable tools to accurately measure and optimize energy\nconsumption at a fine granularity (e.g., at method level) hinders progress in\nthis area. In this paper, we introduce FECoM (Fine-grained Energy Consumption\nMeter), a framework for fine-grained DL energy consumption measurement.\nSpecifically, FECoM provides researchers and developers a mechanism to profile\nDL APIs. FECoM addresses the challenges of measuring energy consumption at\nfine-grained level by using static instrumentation and considering various\nfactors, including computational load and temperature stability. We assess\nFECoM's capability to measure fine-grained energy consumption for one of the\nmost popular open-source DL frameworks, namely TensorFlow. Using FECoM, we also\ninvestigate the impact of parameter size and execution time on energy\nconsumption, enriching our understanding of TensorFlow APIs' energy profiles.\nFurthermore, we elaborate on the considerations, issues, and challenges that\none needs to consider while designing and implementing a fine-grained energy\nconsumption measurement tool. We hope this work will facilitate further\nadvances in DL energy measurement and the development of energy-aware practices\nfor DL systems.\n","authors":["Saurabhsingh Rajput","Tim Widmayer","Ziyuan Shang","Maria Kechagia","Federica Sarro","Tushar Sharma"],"pdf_url":"https://arxiv.org/pdf/2308.12264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.01575v3","updated":"2023-08-23T17:23:59Z","published":"2021-12-02T19:29:26Z","title":"Towards Interactive Reinforcement Learning with Intrinsic Feedback","summary":" Reinforcement learning (RL) and brain-computer interfaces (BCI) have\nexperienced significant growth over the past decade. 
With rising interest in\nhuman-in-the-loop (HITL), incorporating human input with RL algorithms has\ngiven rise to the sub-field of interactive RL. Adjacently, the field of BCI has\nlong been interested in extracting informative brain signals from neural\nactivity for use in human-computer interactions. A key link between these\nfields lies in the interpretation of neural activity as feedback such that\ninteractive RL approaches can be employed. We denote this new and emerging\nmedium of feedback as intrinsic feedback. Despite intrinsic feedback's ability\nto be conveyed automatically and even unconsciously, proper exploration\nsurrounding this key link has largely gone unaddressed by both communities.\nThus, to help facilitate a deeper understanding and a more effective\nutilization, we provide a tutorial-style review covering the motivations,\napproaches, and open problems of intrinsic feedback and its foundational\nconcepts.\n","authors":["Benjamin Poole","Minwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2112.01575v3.pdf","comment":"Name change and vast rewrites of the paper"},{"id":"http://arxiv.org/abs/2308.12256v1","updated":"2023-08-23T17:16:07Z","published":"2023-08-23T17:16:07Z","title":"Learning from Negative User Feedback and Measuring Responsiveness for\n Sequential Recommenders","summary":" Sequential recommenders have been widely used in industry due to their\nstrength in modeling user preferences. While these models excel at learning a\nuser's positive interests, less attention has been paid to learning from\nnegative user feedback. Negative user feedback is an important lever of user\ncontrol, and comes with an expectation that recommenders should respond quickly\nand reduce similar recommendations to the user. However, negative feedback\nsignals are often ignored in the training objective of sequential retrieval\nmodels, which primarily aim at predicting positive user interactions. In this\nwork, we incorporate explicit and implicit negative user feedback into the\ntraining objective of sequential recommenders in the retrieval stage using a\n\"not-to-recommend\" loss function that optimizes for the log-likelihood of not\nrecommending items with negative feedback. We demonstrate the effectiveness of\nthis approach using live experiments on a large-scale industrial recommender\nsystem. Furthermore, we address a challenge in measuring recommender\nresponsiveness to negative feedback by developing a counterfactual simulation\nframework to compare recommender responses between different user actions,\nshowing improved responsiveness from the modeling change.\n","authors":["Yueqi Wang","Yoni Halpern","Shuo Chang","Jingchen Feng","Elaine Ya Le","Longfei Li","Xujian Liang","Min-Cheng Huang","Shane Li","Alex Beutel","Yaping Zhang","Shuchao Bi"],"pdf_url":"https://arxiv.org/pdf/2308.12256v1.pdf","comment":"RecSys 2023 Industry Track"},{"id":"http://arxiv.org/abs/2308.12252v1","updated":"2023-08-23T17:01:53Z","published":"2023-08-23T17:01:53Z","title":"How Safe Am I Given What I See? Calibrated Prediction of Safety Chances\n for Image-Controlled Autonomy","summary":" End-to-end learning has emerged as a major paradigm for developing autonomous\nsystems. Unfortunately, with its performance and convenience comes an even\ngreater challenge of safety assurance. A key factor of this challenge is the\nabsence of the notion of a low-dimensional and interpretable dynamical state,\naround which traditional assurance methods revolve. 
Focusing on the online\nsafety prediction problem, this paper proposes a configurable family of\nlearning pipelines based on generative world models, which do not require\nlow-dimensional states. To implement these pipelines, we overcome the\nchallenges of learning safety-informed latent representations and missing\nsafety labels under prediction-induced distribution shift. These pipelines come\nwith statistical calibration guarantees on their safety chance predictions\nbased on conformal prediction. We perform an extensive evaluation of the\nproposed learning pipelines on two case studies of image-controlled systems: a\nracing car and a cartpole.\n","authors":["Zhenjiang Mao","Carson Sobolewski","Ivan Ruchkin"],"pdf_url":"https://arxiv.org/pdf/2308.12252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.05621v3","updated":"2023-08-23T16:54:03Z","published":"2021-03-09T18:46:01Z","title":"The Common Intuition to Transfer Learning Can Win or Lose: Case Studies\n for Linear Regression","summary":" We study a fundamental transfer learning process from source to target linear\nregression tasks, including overparameterized settings where there are more\nlearned parameters than data samples. The target task learning is addressed by\nusing its training data together with the parameters previously computed for\nthe source task. We define a transfer learning approach to the target task as a\nlinear regression optimization with a regularization on the distance between\nthe to-be-learned target parameters and the already-learned source parameters.\nWe analytically characterize the generalization performance of our transfer\nlearning approach and demonstrate its ability to resolve the peak in\ngeneralization errors in double descent phenomena of the minimum L2-norm\nsolution to linear regression. Moreover, we show that for sufficiently related\ntasks, the optimally tuned transfer learning approach can outperform the\noptimally tuned ridge regression method, even when the true parameter vector\nconforms to an isotropic Gaussian prior distribution. Namely, we demonstrate\nthat transfer learning can beat the minimum mean square error (MMSE) solution\nof the independent target task. Our results emphasize the ability of transfer\nlearning to extend the solution space to the target task and, by that, to have\nan improved MMSE solution. We formulate the linear MMSE solution to our\ntransfer learning setting and point out its key differences from the common\ndesign philosophy to transfer learning.\n","authors":["Yehuda Dar","Daniel LeJeune","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2103.05621v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12247v1","updated":"2023-08-23T16:48:04Z","published":"2023-08-23T16:48:04Z","title":"How to Protect Copyright Data in Optimization of Large Language Models?","summary":" Large language models (LLMs) and generative AI have played a transformative\nrole in computer research and applications. Controversy has arisen as to\nwhether these models output copyrighted data, which can occur if the data the\nmodels are trained on is copyrighted. LLMs are built on the transformer neural\nnetwork architecture, which in turn relies on a mathematical computation called\nAttention that uses the softmax function.\n In this paper, we show that large language model training and optimization\ncan be seen as a softmax regression problem. 
We then establish a method of\nefficiently performing softmax regression, in a way that prevents the\nregression function from generating copyright data. This establishes a\ntheoretical method of training large language models in a way that avoids\ngenerating copyright data.\n","authors":["Timothy Chu","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2308.12247v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03006v4","updated":"2023-08-23T16:47:25Z","published":"2021-10-06T18:25:50Z","title":"Unsupervised Selective Labeling for More Effective Semi-Supervised\n Learning","summary":" Given an unlabeled dataset and an annotation budget, we study how to\nselectively label a fixed number of instances so that semi-supervised learning\n(SSL) on such a partially labeled dataset is most effective. We focus on\nselecting the right data to label, in addition to usual SSL's propagating\nlabels from labeled data to the rest unlabeled data. This instance selection\ntask is challenging, as without any labeled data we do not know what the\nobjective of learning should be. Intuitively, no matter what the downstream\ntask is, instances to be labeled must be representative and diverse: The former\nwould facilitate label propagation to unlabeled data, whereas the latter would\nensure coverage of the entire dataset. We capture this idea by selecting\ncluster prototypes, either in a pretrained feature space, or along with feature\noptimization, both without labels. Our unsupervised selective labeling\nconsistently improves SSL methods over state-of-the-art active learning given\nlabeled data, by 8 to 25 times in label efficiency. For example, it boosts\nFixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%)\nlabeled data, demonstrating that small computation spent on selecting what data\nto label brings significant gain especially under a low annotation budget. Our\nwork sets a new standard for practical and efficient SSL.\n","authors":["Xudong Wang","Long Lian","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2110.03006v4.pdf","comment":"Accepted by ECCV 2022; Fixed a few typos"},{"id":"http://arxiv.org/abs/2209.01566v3","updated":"2023-08-23T16:44:27Z","published":"2022-09-04T08:35:16Z","title":"Towards Top-Down Automated Development in Limited Scopes: A\n Neuro-Symbolic Framework from Expressibles to Executables","summary":" Deep code generation is a topic of deep learning for software engineering\n(DL4SE), which adopts neural models to generate code for the intended\nfunctions. Since end-to-end neural methods lack domain knowledge and software\nhierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To\nsystematically explore the potential improvements of code generation, we let it\nparticipate in the whole top-down development from \\emph{expressibles} to\n\\emph{executables}, which is possible in limited scopes. In the process, it\nbenefits from massive samples, features, and knowledge. As the foundation, we\nsuggest building a taxonomy on code data, namely code taxonomy, leveraging the\ncategorization of code information. Moreover, we introduce a three-layer\nsemantic pyramid (SP) to associate text data and code data. It identifies the\ninformation of different abstraction levels, and thus introduces the domain\nknowledge on development and reveals the hierarchy of software. Furthermore, we\npropose a semantic pyramid framework (SPF) as the approach, focusing on\nsoftware of high modularity and low complexity. 
SPF divides the code generation\nprocess into stages and reserves spots for potential interactions. In addition,\nwe conceived preliminary applications in software development to confirm the\nneuro-symbolic framework.\n","authors":["Jian Gu","Harald C. Gall"],"pdf_url":"https://arxiv.org/pdf/2209.01566v3.pdf","comment":"5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the\n camera-ready version"},{"id":"http://arxiv.org/abs/2308.12243v1","updated":"2023-08-23T16:42:27Z","published":"2023-08-23T16:42:27Z","title":"Multi-Objective Optimization for Sparse Deep Neural Network Training","summary":" Different conflicting optimization criteria arise naturally in various Deep\nLearning scenarios. These can address different main tasks (i.e., in the\nsetting of Multi-Task Learning), but also main and secondary tasks such as loss\nminimization versus sparsity. The usual approach is a simple weighting of the\ncriteria, which formally only works in the convex setting. In this paper, we\npresent a Multi-Objective Optimization algorithm using a modified Weighted\nChebyshev scalarization for training Deep Neural Networks (DNNs) with respect\nto several tasks. By employing this scalarization technique, the algorithm can\nidentify all optimal solutions of the original problem while reducing its\ncomplexity to a sequence of single-objective problems. The simplified problems\nare then solved using an Augmented Lagrangian method, enabling the use of\npopular optimization techniques such as Adam and Stochastic Gradient Descent,\nwhile efficaciously handling constraints. Our work aims to address the\n(economical and also ecological) sustainability issue of DNN models, with a\nparticular focus on Deep Multi-Task models, which are typically designed with a\nvery large number of weights to perform equally well on multiple tasks. Through\nexperiments conducted on two Machine Learning datasets, we demonstrate the\npossibility of adaptively sparsifying the model during training without\nsignificantly impacting its performance, if we are willing to apply\ntask-specific adaptations to the network weights. Code is available at\nhttps://github.com/salomonhotegni/MDMTN.\n","authors":["S. S. Hotegni","S. Peitz","M. Berkemeier"],"pdf_url":"https://arxiv.org/pdf/2308.12243v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2206.02667v2","updated":"2023-08-23T16:18:03Z","published":"2022-06-06T15:12:56Z","title":"Emergent segmentation from participation dynamics and multi-learner\n retraining","summary":" The choice to participate in a data-driven service, often made on the basis\nof quality of that service, influences the ability of the service to learn and\nimprove. We study the participation and retraining dynamics that arise when\nboth the learners and sub-populations of users are \\emph{risk-reducing}, which\ncover a broad class of updates including gradient descent, multiplicative\nweights, etc. Suppose, for example, that individuals choose to spend their time\namongst social media platforms proportionally to how well each platform works\nfor them. Each platform also gathers data about its active users, which it uses\nto update parameters with a gradient step. For this example and for our general\nclass of dynamics, we show that the only asymptotically stable equilibria are\nsegmented, with sub-populations allocated to a single learner. Under mild\nassumptions, the utilitarian social optimum is a stable equilibrium. 
In\ncontrast to previous work, which shows that repeated risk minimization can\nresult in representation disparity and high overall loss for a single learner\n\\citep{hashimoto2018fairness,miller2021outside}, we find that repeated myopic\nupdates with multiple learners lead to better outcomes. We illustrate the\nphenomena via a simulated example initialized from real data.\n","authors":["Sarah Dean","Mihaela Curmei","Lillian J. Ratliff","Jamie Morgenstern","Maryam Fazel"],"pdf_url":"https://arxiv.org/pdf/2206.02667v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12221v1","updated":"2023-08-23T16:01:50Z","published":"2023-08-23T16:01:50Z","title":"Critical Learning Periods Emerge Even in Deep Linear Networks","summary":" Critical learning periods are periods early in development where temporary\nsensory deficits can have a permanent effect on behavior and learned\nrepresentations. Despite the radical differences between biological and\nartificial networks, critical learning periods have been empirically observed\nin both systems. This suggests that critical periods may be fundamental to\nlearning and not an accident of biology. Yet, why exactly critical periods\nemerge in deep networks is still an open question, and in particular it is\nunclear whether the critical periods observed in both systems depend on\nparticular architectural or optimization details. To isolate the key underlying\nfactors, we focus on deep linear network models, and show that, surprisingly,\nsuch networks also display much of the behavior seen in biology and artificial\nnetworks, while being amenable to analytical treatment. We show that critical\nperiods depend on the depth of the model and structure of the data\ndistribution. We also show analytically and in simulations that the learning of\nfeatures is tied to competition between sources. Finally, we extend our\nanalysis to multi-task learning to show that pre-training on certain tasks can\ndamage the transfer performance on new tasks, and show how this depends on the\nrelationship between tasks and the duration of the pre-training stage. To the\nbest of our knowledge, our work provides the first analytically tractable model\nthat sheds light into why critical learning periods emerge in biological and\nartificial networks.\n","authors":["Michael Kleinman","Alessandro Achille","Stefano Soatto"],"pdf_url":"https://arxiv.org/pdf/2308.12221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12219v1","updated":"2023-08-23T16:01:12Z","published":"2023-08-23T16:01:12Z","title":"Diffusion Language Models Can Perform Many Tasks with Scaling and\n Instruction-Finetuning","summary":" The recent surge of generative AI has been fueled by the generative power of\ndiffusion probabilistic models and the scalable capabilities of large language\nmodels. Despite their potential, it remains elusive whether diffusion language\nmodels can solve general language tasks comparable to their autoregressive\ncounterparts. This paper demonstrates that scaling diffusion models w.r.t.\ndata, sizes, and tasks can effectively make them strong language learners. We\nbuild competent diffusion language models at scale by first acquiring knowledge\nfrom massive data via masked language modeling pretraining thanks to their\nintrinsic connections. 
We then reprogram pretrained masked language models into\ndiffusion language models via diffusive adaptation, wherein task-specific\nfinetuning and instruction finetuning are explored to unlock their versatility\nin solving general language tasks. Experiments show that scaling diffusion\nlanguage models consistently improves performance across downstream language\ntasks. We further discover that instruction finetuning can elicit zero-shot and\nfew-shot in-context learning abilities that help tackle many unseen tasks by\nfollowing natural language instructions, and show promise in advanced and\nchallenging abilities such as reasoning\n","authors":["Jiasheng Ye","Zaixiang Zheng","Yu Bao","Lihua Qian","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2308.12219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12215v1","updated":"2023-08-23T15:52:20Z","published":"2023-08-23T15:52:20Z","title":"The Challenges of Machine Learning for Trust and Safety: A Case Study on\n Misinformation Detection","summary":" We examine the disconnect between scholarship and practice in applying\nmachine learning to trust and safety problems, using misinformation detection\nas a case study. We systematize literature on automated detection of\nmisinformation across a corpus of 270 well-cited papers in the field. We then\nexamine subsets of papers for data and code availability, design missteps,\nreproducibility, and generalizability. We find significant shortcomings in the\nliterature that call into question claimed performance and practicality.\nDetection tasks are often meaningfully distinct from the challenges that online\nservices actually face. Datasets and model evaluation are often\nnon-representative of real-world contexts, and evaluation frequently is not\nindependent of model training. Data and code availability is poor. Models do\nnot generalize well to out-of-domain data. Based on these results, we offer\nrecommendations for evaluating machine learning applications to trust and\nsafety problems. Our aim is for future work to avoid the pitfalls that we\nidentify.\n","authors":["Madelyne Xiao","Jonathan Mayer"],"pdf_url":"https://arxiv.org/pdf/2308.12215v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12212v1","updated":"2023-08-23T15:51:29Z","published":"2023-08-23T15:51:29Z","title":"Learning to Learn Financial Networks for Optimising Momentum Strategies","summary":" Network momentum provides a novel type of risk premium, which exploits the\ninterconnections among assets in a financial network to predict future returns.\nHowever, the current process of constructing financial networks relies heavily\non expensive databases and financial expertise, limiting accessibility for\nsmall-sized and academic institutions. Furthermore, the traditional approach\ntreats network construction and portfolio optimisation as separate tasks,\npotentially hindering optimal portfolio performance. To address these\nchallenges, we propose L2GMOM, an end-to-end machine learning framework that\nsimultaneously learns financial networks and optimises trading signals for\nnetwork momentum strategies. The model of L2GMOM is a neural network with a\nhighly interpretable forward propagation architecture, which is derived from\nalgorithm unrolling. The L2GMOM is flexible and can be trained with diverse\nloss functions for portfolio performance, e.g. 
the negative Sharpe ratio.\nBacktesting on 64 continuous future contracts demonstrates a significant\nimprovement in portfolio profitability and risk control, with a Sharpe ratio of\n1.74 across a 20-year period.\n","authors":[" Xingyue"," Pu","Stefan Zohren","Stephen Roberts","Xiaowen Dong"],"pdf_url":"https://arxiv.org/pdf/2308.12212v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.12210v1","updated":"2023-08-23T15:50:51Z","published":"2023-08-23T15:50:51Z","title":"ULDP-FL: Federated Learning with Across Silo User-Level Differential\n Privacy","summary":" Differentially Private Federated Learning (DP-FL) has garnered attention as a\ncollaborative machine learning approach that ensures formal privacy. Most DP-FL\napproaches ensure DP at the record-level within each silo for cross-silo FL.\nHowever, a single user's data may extend across multiple silos, and the desired\nuser-level DP guarantee for such a setting remains unknown. In this study, we\npresent ULDP-FL, a novel FL framework designed to guarantee user-level DP in\ncross-silo FL where a single user's data may belong to multiple silos. Our\nproposed algorithm directly ensures user-level DP through per-user weighted\nclipping, departing from group-privacy approaches. We provide a theoretical\nanalysis of the algorithm's privacy and utility. Additionally, we enhance the\nalgorithm's utility and showcase its private implementation using cryptographic\nbuilding blocks. Empirical experiments on real-world datasets show substantial\nimprovements in our methods in privacy-utility trade-offs under user-level DP\ncompared to baseline methods. To the best of our knowledge, our work is the\nfirst FL framework that effectively provides user-level DP in the general\ncross-silo FL setting.\n","authors":["Fumiyuki Kato","Li Xiong","Shun Takagi","Yang Cao","Masatoshi Yoshikawa"],"pdf_url":"https://arxiv.org/pdf/2308.12210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12202v1","updated":"2023-08-23T15:39:42Z","published":"2023-08-23T15:39:42Z","title":"Curriculum Learning with Adam: The Devil Is in the Wrong Details","summary":" Curriculum learning (CL) posits that machine learning models -- similar to\nhumans -- may learn more efficiently from data that match their current\nlearning progress. However, CL methods are still poorly understood and, in\nparticular for natural language processing (NLP), have achieved only limited\nsuccess. In this paper, we explore why. Starting from an attempt to replicate\nand extend a number of recent curriculum methods, we find that their results\nare surprisingly brittle when applied to NLP. A deep dive into the\n(in)effectiveness of the curricula in some scenarios shows us why: when\ncurricula are employed in combination with the popular Adam optimisation\nalgorithm, they oftentimes learn to adapt to suboptimally chosen optimisation\nparameters for this algorithm. We present a number of different case studies\nwith different common hand-crafted and automated CL approaches to illustrate\nthis phenomenon, and we find that none of them outperforms optimisation with\nonly Adam with well-chosen hyperparameters. 
As such, our results contribute to\nunderstanding why CL methods work, but at the same time urge caution when\nclaiming positive results.\n","authors":["Lucas Weber","Jaap Jumelet","Paul Michel","Elia Bruni","Dieuwke Hupkes"],"pdf_url":"https://arxiv.org/pdf/2308.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12193v1","updated":"2023-08-23T15:31:38Z","published":"2023-08-23T15:31:38Z","title":"Self-Supervised Knowledge-Driven Deep Learning for 3D Magnetic Inversion","summary":" The magnetic inversion method is one of the non-destructive geophysical\nmethods, which aims to estimate the subsurface susceptibility distribution from\nsurface magnetic anomaly data. Recently, supervised deep learning methods have\nbeen widely utilized in many geophysical fields including magnetic\ninversion. However, these methods rely heavily on synthetic training data, and\ntheir performance is limited because the synthetic data is not independently and\nidentically distributed with the field data. Thus, we propose to realize\nmagnetic inversion by self-supervised deep learning. The proposed\nself-supervised knowledge-driven 3D magnetic inversion method (SSKMI) learns on\nthe target field data by a closed loop of the inversion and forward models.\nGiven that the parameters of the forward model are preset, SSKMI can optimize\nthe inversion model by minimizing the mean absolute error between observed and\nre-estimated surface magnetic anomalies. Besides, there is a knowledge-driven\nmodule in the proposed inversion model, which makes the deep learning method\nmore explicable. Meanwhile, comparative experiments demonstrate that the\nknowledge-driven module can accelerate the training of the proposed method and\nachieve better results. Since magnetic inversion is an ill-posed task, SSKMI\nconstrains the inversion model with a guideline in the auxiliary loop.\nThe experimental results demonstrate that the proposed method is a reliable\nmagnetic inversion method with outstanding performance.\n","authors":["Yinshuo Li","Zhuo Jia","Wenkai Lu","Cao Song"],"pdf_url":"https://arxiv.org/pdf/2308.12193v1.pdf","comment":"11 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.12192v1","updated":"2023-08-23T15:30:44Z","published":"2023-08-23T15:30:44Z","title":"Robustness Analysis of Continuous-Depth Models with Lagrangian\n Techniques","summary":" This paper presents, in a unified fashion, deterministic as well as\nstatistical Lagrangian-verification techniques. They formally quantify the\nbehavioral robustness of any time-continuous process, formulated as a\ncontinuous-depth model. To this end, we review LRT-NG, SLR, and GoTube,\nalgorithms for constructing a tight reachtube, that is, an over-approximation\nof the set of states reachable within a given time-horizon, and provide\nguarantees for the reachtube bounds. We compare the usage of the variational\nequations, associated to the system equations, the mean value theorem, and the\nLipschitz constants, in achieving deterministic and statistical guarantees. In\nLRT-NG, the Lipschitz constant is used as a bloating factor of the initial\nperturbation, to compute the radius of an ellipsoid in an optimal metric, which\nover-approximates the set of reachable states. In SLR and GoTube, we get\nstatistical guarantees, by using the Lipschitz constants to compute local balls\naround samples. These are needed to calculate the probability of having found\nan upper bound of the true maximum perturbation at every timestep. 
Our\nexperiments demonstrate the superior performance of Lagrangian techniques, when\ncompared to LRT, Flow*, and CAPD, and illustrate their use in the robustness\nanalysis of various continuous-depth models.\n","authors":["Sophie A. Neubauer","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2308.12192v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2107.08467"},{"id":"http://arxiv.org/abs/2308.12188v1","updated":"2023-08-23T15:25:17Z","published":"2023-08-23T15:25:17Z","title":"Development and external validation of a lung cancer risk estimation\n tool using gradient-boosting","summary":" Lung cancer is a significant cause of mortality worldwide, emphasizing the\nimportance of early detection for improved survival rates. In this study, we\npropose a machine learning (ML) tool trained on data from the PLCO Cancer\nScreening Trial and validated on the NLST to estimate the likelihood of lung\ncancer occurrence within five years. The study utilized two datasets, the PLCO\n(n=55,161) and NLST (n=48,595), consisting of comprehensive information on risk\nfactors, clinical measurements, and outcomes related to lung cancer. Data\npreprocessing involved removing patients who were not current or former smokers\nand those who had died of causes unrelated to lung cancer. Additionally, a\nfocus was placed on mitigating bias caused by censored data. Feature selection,\nhyper-parameter optimization, and model calibration were performed using\nXGBoost, an ensemble learning algorithm that combines gradient boosting and\ndecision trees. The ML model was trained on the pre-processed PLCO dataset and\ntested on the NLST dataset. The model incorporated features such as age,\ngender, smoking history, medical diagnoses, and family history of lung cancer.\nThe model was well-calibrated (Brier score=0.044). ROC-AUC was 82% on the PLCO\ndataset and 70% on the NLST dataset. PR-AUC was 29% and 11% respectively. When\ncompared to the USPSTF guidelines for lung cancer screening, our model provided\nthe same recall with a precision of 13.1% vs. 9.3% on the PLCO dataset and 3.2%\nvs. 3.1% on the NLST dataset. The developed ML tool provides a freely available\nweb application for estimating the likelihood of developing lung cancer within\nfive years. By utilizing risk factors and clinical data, individuals can assess\ntheir risk and make informed decisions regarding lung cancer screening. This\nresearch contributes to the efforts in early detection and prevention\nstrategies, aiming to reduce lung cancer-related mortality rates.\n","authors":["Pierre-Louis Benveniste","Julie Alberge","Lei Xing","Jean-Emmanuel Bibault"],"pdf_url":"https://arxiv.org/pdf/2308.12188v1.pdf","comment":"14 pages, 4 figures, 4 tables, 1 Github repository, see\n http://github.com/plbenveniste/LungCancerRisk"},{"id":"http://arxiv.org/abs/2210.01860v4","updated":"2023-08-23T15:22:04Z","published":"2022-10-04T19:03:47Z","title":"ProtoBandit: Efficient Prototype Selection via Multi-Armed Bandits","summary":" In this work, we propose a multi-armed bandit-based framework for identifying\na compact set of informative data instances (i.e., the prototypes) from a\nsource dataset $S$ that best represents a given target set $T$. Prototypical\nexamples of a given dataset offer interpretable insights into the underlying\ndata distribution and assist in example-based reasoning, thereby influencing\nevery sphere of human decision-making. 
Current state-of-the-art prototype\nselection approaches require $O(|S||T|)$ similarity comparisons between source\nand target data points, which becomes prohibitively expensive for large-scale\nsettings. We propose to mitigate this limitation by employing stochastic greedy\nsearch in the space of prototypical examples and multi-armed bandits for\nreducing the number of similarity comparisons. Our randomized algorithm,\nProtoBandit, identifies a set of $k$ prototypes incurring $O(k^3|S|)$\nsimilarity comparisons, which is independent of the size of the target set. An\ninteresting outcome of our analysis is for the $k$-medoids clustering problem\n($T = S$ setting), in which we show that our algorithm ProtoBandit approximates\nthe BUILD step solution of the partitioning around medoids (PAM) method in\n$O(k^3|S|)$ complexity. Empirically, we observe that ProtoBandit reduces the\nnumber of similarity computation calls by several orders of magnitude\n($100-1000$ times) while obtaining solutions similar in quality to those from\nstate-of-the-art approaches.\n","authors":["Arghya Roy Chaudhuri","Pratik Jawanpuria","Bamdev Mishra"],"pdf_url":"https://arxiv.org/pdf/2210.01860v4.pdf","comment":"Erratum corrected"},{"id":"http://arxiv.org/abs/2305.01975v2","updated":"2023-08-23T15:12:01Z","published":"2023-05-03T08:41:37Z","title":"A Survey on Dataset Distillation: Approaches, Applications and Future\n Directions","summary":" Dataset distillation is attracting more attention in machine learning as\ntraining sets continue to grow and the cost of training state-of-the-art models\nbecomes increasingly high. By synthesizing datasets with high information\ndensity, dataset distillation offers a range of potential applications,\nincluding support for continual learning, neural architecture search, and\nprivacy protection. Despite recent advances, we lack a holistic understanding\nof the approaches and applications. Our survey aims to bridge this gap by first\nproposing a taxonomy of dataset distillation, characterizing existing\napproaches, and then systematically reviewing the data modalities, and related\napplications. In addition, we summarize the challenges and discuss future\ndirections for this field of research.\n","authors":["Jiahui Geng","Zongxiong Chen","Yuandou Wang","Herbert Woisetschlaeger","Sonja Schimmler","Ruben Mayer","Zhiming Zhao","Chunming Rong"],"pdf_url":"https://arxiv.org/pdf/2305.01975v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12175v1","updated":"2023-08-23T14:53:38Z","published":"2023-08-23T14:53:38Z","title":"Unsupervised anomalies detection in IIoT edge devices networks using\n federated learning","summary":" In a network of many IoT devices that each collect data, training a machine\nlearning model would normally involve transmitting the data to a central\nserver, which requires strict privacy rules. However, some owners are reluctant\nto make their data available outside the company due to data security concerns.\nFederated learning (FL), a distributed machine learning approach, performs\ntraining of a machine learning model on the device that gathered the data\nitself. In this scenario, data is not shared over the network for training\npurposes. Fedavg, one of the FL algorithms, permits a model to be copied to\nparticipating devices during a training session. The devices could be chosen at\nrandom, and a device can be aborted. The resulting models are sent to the\ncoordinating server, which then averages the models from the devices that finished\ntraining. 
The process is repeated until a desired model accuracy is achieved.\nBy doing this, the FL approach solves the privacy problem for IoT/IIoT devices\nthat hold sensitive data for their owners. In this paper, we leverage the\nbenefits of FL and implement the Fedavg algorithm on a recent dataset that\nrepresents modern IoT/IIoT device networks. The results were almost the\nsame as those of the centralized machine learning approach. We also evaluated some\nshortcomings of Fedavg, such as the unfairness that happens during training when\nstruggling devices do not participate in every stage of training. This\ninefficient training of the local or global model could lead to a high number of\nfalse alarms in intrusion detection systems for IoT/IIoT gadgets developed\nusing Fedavg. Hence, after evaluating the Fedavg deep autoencoder against the\ncentralized deep autoencoder ML, we further propose and design a Fair\nFedavg algorithm that will be evaluated in future work.\n","authors":["Niyomukiza Thamar","Hossam Samy Elsaid Sharara"],"pdf_url":"https://arxiv.org/pdf/2308.12175v1.pdf","comment":"Accepted for publication in machine learning journals"},{"id":"http://arxiv.org/abs/2209.11355v3","updated":"2023-08-23T14:51:47Z","published":"2022-09-23T00:35:22Z","title":"Learning Interpretable Dynamics from Images of a Freely Rotating 3D\n Rigid Body","summary":" In many real-world settings, image observations of freely rotating 3D rigid\nbodies, such as satellites, may be available when low-dimensional measurements\nare not. However, the high-dimensionality of image data precludes the use of\nclassical estimation techniques to learn the dynamics and a lack of\ninterpretability reduces the usefulness of standard deep learning methods. In\nthis work, we present a physics-informed neural network model to estimate and\npredict 3D rotational dynamics from image sequences. We achieve this using a\nmulti-stage prediction pipeline that maps individual images to a latent\nrepresentation homeomorphic to $\mathbf{SO}(3)$, computes angular velocities\nfrom latent pairs, and predicts future latent states using the Hamiltonian\nequations of motion with a learned representation of the Hamiltonian. We\ndemonstrate the efficacy of our approach on a new rotating rigid-body dataset\nwith sequences of rotating cubes and rectangular prisms with uniform and\nnon-uniform density.\n","authors":["Justice Mason","Christine Allen-Blanchette","Nicholas Zolman","Elizabeth Davison","Naomi Leonard"],"pdf_url":"https://arxiv.org/pdf/2209.11355v3.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.07221v5","updated":"2023-08-23T14:24:51Z","published":"2023-08-14T15:47:25Z","title":"AudioFormer: Audio Transformer learns audio feature representations from\n discrete acoustic codes","summary":" We propose a method named AudioFormer, which learns audio feature\nrepresentations through the acquisition of discrete acoustic codes and\nsubsequently fine-tunes them for audio classification tasks. Initially, we\nintroduce a novel perspective by considering the audio classification task as a\nform of natural language understanding (NLU). Leveraging an existing neural\naudio codec model, we generate discrete acoustic codes and utilize them to train\na masked language model (MLM), thereby obtaining audio feature representations.\nFurthermore, we pioneer the integration of a Multi-Positive sample Contrastive\n(MPC) learning approach. 
This method enables the learning of joint\nrepresentations among multiple discrete acoustic codes within the same audio\ninput. In our experiments, we treat discrete acoustic codes as textual data and\ntrain a masked language model using a cloze-like methodology, ultimately\nderiving high-quality audio representations. Notably, the MPC learning technique\neffectively captures collaborative representations among distinct positive\nsamples. Our research outcomes demonstrate that AudioFormer attains\nsignificantly improved performance compared to prevailing monomodal audio\nclassification models across multiple datasets, and even outperforms\naudio-visual multimodal classification models on select datasets.\nSpecifically, our approach achieves remarkable results on datasets including\nAudioSet (2M, 20K), and FSD50K, with performance scores of 53.9, 45.1, and\n65.6, respectively. We have openly shared both the code and models:\nhttps://github.com/LZH-0225/AudioFormer.git.\n","authors":["Zhaohui Li","Haitao Wang","Xinghua Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.07221v5.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2303.02206v2","updated":"2023-08-23T14:23:48Z","published":"2023-03-03T20:35:38Z","title":"Domain Specific Question Answering Over Knowledge Graphs Using Logical\n Programming and Large Language Models","summary":" Answering questions over domain-specific graphs requires a tailored approach\ndue to the limited number of relations and the specific nature of the domain.\nOur approach integrates classic logical programming languages into large\nlanguage models (LLMs), enabling the utilization of logical reasoning\ncapabilities to tackle the KGQA task. By representing the questions as Prolog\nqueries, which are readable and close to natural language in\nrepresentation, we facilitate the generation of programmatically derived\nanswers. To validate the effectiveness of our approach, we evaluate it using a\nwell-known benchmark dataset, MetaQA. Our experimental results demonstrate that\nour method achieves accurate identification of correct answer entities for all\ntest questions, even when trained on a small fraction of annotated data.\nOverall, our work presents a promising approach to addressing question\nanswering over domain-specific graphs, offering an explainable and robust\nsolution by incorporating logical programming languages.\n","authors":["Navid Madani","Rohini K. Srihari","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2303.02206v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12161v1","updated":"2023-08-23T14:23:26Z","published":"2023-08-23T14:23:26Z","title":"Data-driven decision-focused surrogate modeling","summary":" We introduce the concept of decision-focused surrogate modeling for solving\ncomputationally challenging nonlinear optimization problems in real-time\nsettings. The proposed data-driven framework seeks to learn a simpler, e.g.\nconvex, surrogate optimization model that is trained to minimize the decision\nprediction error, which is defined as the difference between the optimal\nsolutions of the original and the surrogate optimization models. The learning\nproblem, formulated as a bilevel program, can be viewed as a data-driven\ninverse optimization problem to which we apply a decomposition-based solution\nalgorithm from previous work. 
We validate our framework through numerical\nexperiments involving the optimization of common nonlinear chemical processes\nsuch as chemical reactors, heat exchanger networks, and material blending\nsystems. We also present a detailed comparison of decision-focused surrogate\nmodeling with standard data-driven surrogate modeling methods and demonstrate\nthat our approach is significantly more data-efficient while producing simple\nsurrogate models with high decision prediction accuracy.\n","authors":["Rishabh Gupta","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.12161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11702v4","updated":"2023-08-23T14:18:30Z","published":"2023-03-21T09:42:27Z","title":"On the link between generative semi-supervised learning and generative\n open-set recognition","summary":" This study investigates the relationship between semi-supervised learning\n(SSL, which is training on partially labelled datasets) and open-set\nrecognition (OSR, which is classification with simultaneous novelty detection)\nunder the context of generative adversarial networks (GANs). Although no\nprevious study has formally linked SSL and OSR, their respective methods share\nstriking similarities. Specifically, SSL-GANs and OSR-GANs require their\ngenerators to produce 'bad-looking' samples which are used to regularise their\nclassifier networks. We hypothesise that the definitions of bad-looking samples\nin SSL and OSR represent the same concept and realise the same goal. More\nformally, bad-looking samples lie in the complementary space, which is the area\nbetween and around the boundaries of the labelled categories within the\nclassifier's embedding space. By regularising a classifier with samples in the\ncomplementary space, classifiers achieve improved generalisation for SSL and\nalso generalise the open space for OSR. To test this hypothesis, we compare a\nfoundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR\nexperimental conditions. Our results find that SSL-GANs achieve near-identical\nresults to OSR-GANs, proving the SSL-OSR link. Subsequently, to further this\nnew research path, we compare several SSL-GANs under various SSL-OSR setups,\nproviding the first benchmark results for this setting. A combined framework of SSL-OSR certainly\nimproves the practicality and cost-efficiency of classifier training, and so\nfurther theoretical and application studies are also discussed.\n","authors":["Emile Reyn Engelbrecht","Johan du Preez"],"pdf_url":"https://arxiv.org/pdf/2303.11702v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12143v1","updated":"2023-08-23T14:00:58Z","published":"2023-08-23T14:00:58Z","title":"A Probabilistic Fluctuation based Membership Inference Attack for\n Generative Models","summary":" Membership Inference Attack (MIA) identifies whether a record exists in a\nmachine learning model's training set by querying the model. MIAs on the\nclassic classification models have been well-studied, and recent works have\nstarted to explore how to transplant MIA onto generative models. Our\ninvestigation indicates that existing MIAs designed for generative models\nmainly depend on the overfitting in target models. However, overfitting can be\navoided by employing various regularization techniques, whereas existing MIAs\ndemonstrate poor performance in practice. Unlike overfitting, memorization is\nessential for deep learning models to attain optimal performance, making it a\nmore prevalent phenomenon. 
Memorization in generative models leads to an\nincreasing trend in the probability distribution of generating records around\nthe member record. Therefore, we propose a Probabilistic Fluctuation Assessing\nMembership Inference Attack (PFAMI), a black-box MIA that infers memberships by\ndetecting these trends via analyzing the overall probabilistic fluctuations\naround given records. We conduct extensive experiments across multiple\ngenerative models and datasets, which demonstrate PFAMI can improve the attack\nsuccess rate (ASR) by about 27.9% when compared with the best baseline.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.12143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.15498v2","updated":"2023-08-23T13:44:06Z","published":"2022-11-28T16:17:47Z","title":"Physics-informed neural networks with unknown measurement noise","summary":" Physics-informed neural networks (PINNs) constitute a flexible approach to\nboth finding solutions and identifying parameters of partial differential\nequations. Most works on the topic assume noiseless data, or data contaminated\nby weak Gaussian noise. We show that the standard PINN framework breaks down in\ncase of non-Gaussian noise. We give a way of resolving this fundamental issue\nand we propose to jointly train an energy-based model (EBM) to learn the\ncorrect noise distribution. We illustrate the improved performance of our\napproach using multiple examples.\n","authors":["Philipp Pilar","Niklas Wahlström"],"pdf_url":"https://arxiv.org/pdf/2211.15498v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12127v1","updated":"2023-08-23T13:33:39Z","published":"2023-08-23T13:33:39Z","title":"Masking Strategies for Background Bias Removal in Computer Vision Models","summary":" Models for fine-grained image classification tasks, where the difference\nbetween some classes can be extremely subtle and the number of samples per\nclass tends to be low, are particularly prone to picking up background-related\nbiases and demand robust methods to handle potential examples with\nout-of-distribution (OOD) backgrounds. To gain deeper insights into this\ncritical problem, our research investigates the impact of background-induced\nbias on fine-grained image classification, evaluating standard backbone models\nsuch as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We\nexplore two masking strategies to mitigate background-induced bias: Early\nmasking, which removes background information at the (input) image level, and\nlate masking, which selectively masks high-level spatial features corresponding\nto the background. Extensive experiments assess the behavior of CNN and ViT\nmodels under different masking strategies, with a focus on their generalization\nto OOD backgrounds. The obtained findings demonstrate that both proposed\nstrategies enhance OOD performance compared to the baseline models, with early\nmasking consistently exhibiting the best OOD performance. Notably, a ViT\nvariant employing GAP-Pooled Patch token-based classification combined with\nearly masking achieves the highest OOD robustness.\n","authors":["Ananthu Aniraj","Cassio F. 
Dantas","Dino Ienco","Diego Marcos"],"pdf_url":"https://arxiv.org/pdf/2308.12127v1.pdf","comment":"Accepted at the 2023 IEEE/CVF International Conference on Computer\n Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer\n Vision (OOD-CV)"},{"id":"http://arxiv.org/abs/2308.12126v1","updated":"2023-08-23T13:32:31Z","published":"2023-08-23T13:32:31Z","title":"An Accelerated Block Proximal Framework with Adaptive Momentum for\n Nonconvex and Nonsmooth Optimization","summary":" We propose an accelerated block proximal linear framework with adaptive\nmomentum (ABPL$^+$) for nonconvex and nonsmooth optimization. We analyze the\npotential causes of the extrapolation step failing in some algorithms, and\nresolve this issue by enhancing the comparison process that evaluates the\ntrade-off between the proximal gradient step and the linear extrapolation step\nin our algorithm. Furthermore, we extend our algorithm to any scenario\ninvolving updating block variables with positive integers, allowing each cycle\nto randomly shuffle the update order of the variable blocks. Additionally,\nunder mild assumptions, we prove that ABPL$^+$ can monotonically decrease the\nfunction value without strictly restricting the extrapolation parameters and\nstep size, demonstrate the viability and effectiveness of updating these\nblocks in a random order, and also show more directly and intuitively that the\nderivative set of the sequence generated by our algorithm is a critical point\nset. Moreover, we demonstrate the global convergence as\nwell as the linear and sublinear convergence rates of our algorithm by\nutilizing the Kurdyka-Lojasiewicz (K{\L}) condition. To enhance the\neffectiveness and flexibility of our algorithm, we also expand the study to the\nimprecise version of our algorithm and construct an adaptive extrapolation\nparameter strategy, which improves its overall performance. We apply our\nalgorithm to multiple non-negative matrix factorization with the $\ell_0$ norm,\nnonnegative tensor decomposition with the $\ell_0$ norm, and perform extensive\nnumerical experiments to validate its effectiveness and efficiency.\n","authors":["Weifeng Yang","Wenwen Min"],"pdf_url":"https://arxiv.org/pdf/2308.12126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.14981v3","updated":"2023-08-23T13:25:16Z","published":"2022-06-30T02:17:11Z","title":"Randomized Coordinate Subgradient Method for Nonsmooth Composite\n Optimization","summary":" Coordinate-type subgradient methods for addressing nonsmooth optimization\nproblems are relatively underexplored due to the set-valued nature of the\nsubdifferential. In this work, our study focuses on nonsmooth composite\noptimization problems, encompassing a wide class of convex and weakly convex\n(nonconvex nonsmooth) problems. By utilizing the chain rule of the composite\nstructure properly, we introduce the Randomized Coordinate Subgradient method\n(RCS) for tackling this problem class. To the best of our knowledge, this is\nthe first coordinate subgradient method for solving general nonsmooth composite\noptimization problems. In theory, we consider the linearly bounded subgradients\nassumption for the objective function, which is more general than the\ntraditional Lipschitz continuity assumption, to account for practical\nscenarios. 
We then conduct convergence analysis for RCS in both convex and\nweakly convex cases based on this generalized Lipschitz-type assumption.\nSpecifically, we establish the $\\widetilde{\\mathcal{O}}$$(1/\\sqrt{k})$\nconvergence rate in expectation and the $\\tilde o(1/\\sqrt{k})$ almost sure\nasymptotic convergence rate in terms of the suboptimality gap when $f$ is\nconvex. For the case when $f$ is weakly convex and its subdifferential\nsatisfies the global metric subregularity property, we derive the\n$\\mathcal{O}(\\varepsilon^{-4})$ iteration complexity in expectation. We also\nestablish an asymptotic convergence result. To justify the global metric\nsubregularity property utilized in the analysis, we establish this error bound\ncondition for the concrete (real-valued) robust phase retrieval problem. We\nalso provide a convergence lemma and the relationship between the global metric\nsubregularity properties of a weakly convex function and its Moreau envelope.\nFinally, we conduct several experiments to demonstrate the possible superiority\nof RCS over the subgradient method.\n","authors":["Lei Zhao","Ding Chen","Daoli Zhu","Xiao Li"],"pdf_url":"https://arxiv.org/pdf/2206.14981v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13579v2","updated":"2023-08-23T13:20:44Z","published":"2022-11-24T13:08:43Z","title":"Knowledge-Aware Federated Active Learning with Non-IID Data","summary":" Federated learning enables multiple decentralized clients to learn\ncollaboratively without sharing the local training data. However, the expensive\nannotation cost to acquire data labels on local clients remains an obstacle in\nutilizing local data. In this paper, we propose a federated active learning\nparadigm to efficiently learn a global model with limited annotation budget\nwhile protecting data privacy in a decentralized learning way. The main\nchallenge faced by federated active learning is the mismatch between the active\nsampling goal of the global model on the server and that of the asynchronous\nlocal clients. This becomes even more significant when data is distributed\nnon-IID across local clients. To address the aforementioned challenge, we\npropose Knowledge-Aware Federated Active Learning (KAFAL), which consists of\nKnowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory\nFederated Update (KCFU). KSAS is a novel active sampling method tailored for\nthe federated active learning problem. It deals with the mismatch challenge by\nsampling actively based on the discrepancies between local and global models.\nKSAS intensifies specialized knowledge in local clients, ensuring the sampled\ndata to be informative for both the local clients and the global model. KCFU,\nin the meantime, deals with the client heterogeneity caused by limited data and\nnon-IID data distributions. It compensates for each client's ability in weak\nclasses by the assistance of the global model. 
Extensive experiments and\nanalyses are conducted to show the superiority of KSAS over the\nstate-of-the-art active learning methods and the efficiency of KCFU under the\nfederated active learning framework.\n","authors":["Yu-Tong Cao","Ye Shi","Baosheng Yu","Jingya Wang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2211.13579v2.pdf","comment":"14 pages, 12 figures, ICCV23"},{"id":"http://arxiv.org/abs/2308.12120v1","updated":"2023-08-23T13:16:31Z","published":"2023-08-23T13:16:31Z","title":"An Open-Source ML-Based Full-Stack Optimization Framework for Machine\n Learning Accelerators","summary":" Parameterizable machine learning (ML) accelerators are the product of recent\nbreakthroughs in ML. To fully enable their design space exploration (DSE), we\npropose a physical-design-driven, learning-based prediction framework for\nhardware-accelerated deep neural network (DNN) and non-DNN ML algorithms. It\nadopts a unified approach that combines backend power, performance, and area\n(PPA) analysis with frontend performance simulation, thereby achieving a\nrealistic estimation of both backend PPA and system metrics such as runtime and\nenergy. In addition, our framework includes a fully automated DSE technique,\nwhich optimizes backend and system metrics through an automated search of\narchitectural and backend parameters. Experimental studies show that our\napproach consistently predicts backend PPA and system metrics with an average\n7% or less prediction error for the ASIC implementation of two deep learning\naccelerator platforms, VTA and VeriGOOD-ML, in both a commercial 12 nm process\nand a research-oriented 45 nm process.\n","authors":["Hadi Esmaeilzadeh","Soroush Ghodrati","Andrew B. Kahng","Joon Kyung Kim","Sean Kinzer","Sayak Kundu","Rohan Mahapatra","Susmita Dey Manasi","Sachin Sapatnekar","Zhiang Wang","Ziqing Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.12120v1.pdf","comment":"This is an extended version of our work titled \"Physically Accurate\n Learning-based Performance Prediction of Hardware-accelerated ML Algorithms\"\n published in MLCAD 2022"},{"id":"http://arxiv.org/abs/2308.12114v1","updated":"2023-08-23T13:09:03Z","published":"2023-08-23T13:09:03Z","title":"Less is More -- Towards parsimonious multi-task models using structured\n sparsity","summary":" Group sparsity in Machine Learning (ML) encourages simpler, more\ninterpretable models with fewer active parameter groups. This work aims to\nincorporate structured group sparsity into the shared parameters of a\nMulti-Task Learning (MTL) framework, to develop parsimonious models that can\neffectively address multiple tasks with fewer parameters while maintaining\ncomparable or superior performance to a dense model. Sparsifying the model\nduring training helps decrease the model's memory footprint, computation\nrequirements, and prediction time during inference. We use channel-wise l1/l2\ngroup sparsity in the shared layers of the Convolutional Neural Network (CNN).\nThis approach not only facilitates the elimination of extraneous groups\n(channels) but also imposes a penalty on the weights, thereby enhancing the\nlearning of all tasks. We compare the outcomes of single-task and multi-task\nexperiments under group sparsity on two publicly available MTL datasets, NYU-v2\nand CelebAMask-HQ. 
We also investigate how changing the sparsification degree\nimpacts both the performance of the model and the sparsity of groups.\n","authors":["Richa Upadhyay","Ronald Phlypo","Rajkumar Saini","Marcus Liwicki"],"pdf_url":"https://arxiv.org/pdf/2308.12114v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2210.14598v2","updated":"2023-08-23T13:04:05Z","published":"2022-10-26T10:12:31Z","title":"Exact Manifold Gaussian Variational Bayes","summary":" We propose an optimization algorithm for Variational Inference (VI) in\ncomplex models. Our approach relies on natural gradient updates where the\nvariational space is a Riemann manifold. We develop an efficient algorithm for\nGaussian Variational Inference that implicitly satisfies the positive definite\nconstraint on the variational covariance matrix. Our Exact manifold Gaussian\nVariational Bayes (EMGVB) provides exact but simple update rules and is\nstraightforward to implement. Due to its black-box nature, EMGVB stands as a\nready-to-use solution for VI in complex models. Over five datasets, we\nempirically validate our feasible approach on different statistical,\neconometric, and deep learning models, discussing its performance with respect\nto baseline methods.\n","authors":["Martin Magris","Mostafa Shabani","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2210.14598v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12112v1","updated":"2023-08-23T13:02:52Z","published":"2023-08-23T13:02:52Z","title":"Generalized Continual Category Discovery","summary":" Most Continual Learning (CL) methods push the limit of supervised learning\nsettings, where an agent is expected to learn new labeled tasks and not forget\nprevious knowledge. However, these settings are not well aligned with real-life\nscenarios, where a learning agent has access to a vast amount of unlabeled data\nencompassing both novel (entirely unlabeled) classes and examples from known\nclasses. Drawing inspiration from Generalized Category Discovery (GCD), we\nintroduce a novel framework that relaxes this assumption. Precisely, in any\ntask, we allow for the existence of novel and known classes, and one must use a\ncontinual version of unsupervised learning methods to discover them. We call\nthis setting Generalized Continual Category Discovery (GCCD). It unifies CL and\nGCD, bridging the gap between synthetic benchmarks and real-life scenarios.\nWith a series of experiments, we show that existing methods fail to\naccumulate knowledge from subsequent tasks in which unlabeled samples of novel\nclasses are present. In light of these limitations, we propose a method that\nincorporates both supervised and unsupervised signals and mitigates the\nforgetting through the use of centroid adaptation. Our method surpasses strong\nCL methods adopted for GCD techniques and achieves superior representation\nlearning performance.\n","authors":["Daniel Marczak","Grzegorz Rypeść","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2308.12112v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.12108v1","updated":"2023-08-23T12:55:41Z","published":"2023-08-23T12:55:41Z","title":"Quantifying degeneracy in singular models via the learning coefficient","summary":" Deep neural networks (DNN) are singular statistical models which exhibit\ncomplex degeneracies. 
In this work, we illustrate how a quantity known as the\n\emph{learning coefficient} introduced in singular learning theory quantifies\nprecisely the degree of degeneracy in deep neural networks. Importantly, we\nwill demonstrate that degeneracy in DNNs cannot be accounted for by simply\ncounting the number of \"flat\" directions. We propose a computationally scalable\napproximation of a localized version of the learning coefficient using\nstochastic gradient Langevin dynamics. To validate our approach, we demonstrate\nits accuracy in low-dimensional models with known theoretical values.\nImportantly, the local learning coefficient can correctly recover the ordering\nof degeneracy between various parameter regions of interest. An experiment on\nMNIST shows the local learning coefficient can reveal the inductive bias of\nstochastic optimizers for more or less degenerate critical points.\n","authors":["Edmund Lau","Daniel Murfet","Susan Wei"],"pdf_url":"https://arxiv.org/pdf/2308.12108v1.pdf","comment":"22 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.12093v1","updated":"2023-08-23T12:27:55Z","published":"2023-08-23T12:27:55Z","title":"Cached Operator Reordering: A Unified View for Fast GNN Training","summary":" Graph Neural Networks (GNNs) are a powerful tool for handling structured\ngraph data and addressing tasks such as node classification, graph\nclassification, and clustering. However, the sparse nature of GNN computation\nposes new challenges for performance optimization compared to traditional deep\nneural networks. We address these challenges by providing a unified view of GNN\ncomputation, I/O, and memory. By analyzing the computational graphs of the\nGraph Convolutional Network (GCN) and Graph Attention (GAT) layers -- two\nwidely used GNN layers -- we propose alternative computation strategies. We\npresent adaptive operator reordering with caching, which achieves a speedup of\nup to 2.43x for GCN compared to the current state-of-the-art. Furthermore, an\nexploration of different caching schemes for GAT yields a speedup of up to\n1.94x. The proposed optimizations save memory, are easily implemented across\nvarious hardware platforms, and have the potential to alleviate performance\nbottlenecks in training large-scale GNN models.\n","authors":["Julia Bazinska","Andrei Ivanov","Tal Ben-Nun","Nikoli Dryden","Maciej Besta","Siyuan Shen","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.12093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10898v2","updated":"2023-08-23T12:27:50Z","published":"2023-02-09T20:46:46Z","title":"Estimating Driver Personality Traits from On-Road Driving Data","summary":" This paper focuses on the estimation of a driver's psychological\ncharacteristics using driving data for driving assistance systems. Driving\nassistance systems that support drivers by adapting to individual psychological\ncharacteristics can provide appropriate feedback and prevent traffic accidents.\nAs a first step toward implementing such adaptive assistance systems, this\nresearch aims to develop a model to estimate drivers' psychological\ncharacteristics, such as cognitive function, psychological driving style, and\nworkload sensitivity, from on-road driving behavioral data using machine\nlearning and deep learning techniques. We also investigated the relationship\nbetween driving behavior and various cognitive functions, including the Trail\nMaking Test (TMT) and Useful Field of View (UFOV) test, through regression\nmodeling. 
The proposed method focuses on road type information and captures\nvarious durations of time-series data observed from driving behaviors. First,\nwe segment the driving time-series data into two road types, namely, arterial\nroads and intersections, to consider driving situations. Second, we further\nsegment data into many sequences of various durations. Third, statistics are\ncalculated from each sequence. Finally, these statistics are used as input\nfeatures of machine learning models to estimate psychological characteristics.\nThe experimental results show that our model can estimate a driver's cognitive\nfunction, namely, the TMT~(B) and UFOV test scores, with Pearson correlation\ncoefficients $r$ of 0.579 and 0.708, respectively. Some characteristics, such\nas psychological driving style and workload sensitivity, are estimated with\nhigh accuracy, but whether various duration segmentation improves accuracy\ndepends on the characteristics, and it is not effective for all\ncharacteristics.\n","authors":["Ryusei Kimura","Takahiro Tanaka","Yuki Yoshihara","Kazuhiro Fujikake","Hitoshi Kanamori","Shogo Okada"],"pdf_url":"https://arxiv.org/pdf/2302.10898v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00009v2","updated":"2023-08-23T12:24:02Z","published":"2023-06-18T20:06:58Z","title":"Comparison of Machine Learning Methods for Assigning Software Issues to\n Team Members","summary":" Software issues contain units of work to fix, improve, or create new threads\nduring the development and facilitate communication among the team members.\nAssigning an issue to the most relevant team member and determining a category\nof an issue is a tedious and challenging task. Wrong classifications cause\ndelays and rework in the project and trouble among the team members. This paper\nproposes a set of carefully curated linguistic features for shallow machine\nlearning methods and compares the performance of shallow and ensemble methods\nwith deep language models. Unlike the state-of-the-art, we assign issues to\nfour roles (designer, developer, tester, and leader) rather than to specific\nindividuals or teams to contribute to the generality of our solution. We also\nconsider the level of experience of the developers to reflect the industrial\npractices in our solution formulation. We collect and annotate five industrial\ndata sets from one of the top three global television producers to evaluate our\nproposal and compare it with deep language models. Our data sets contain 5324\nissues in total. We show that an ensemble classifier of shallow techniques\nachieves 0.92 for issue assignment in accuracy which is statistically\ncomparable to the state-of-the-art deep language models. The contributions\ninclude the public sharing of five annotated industrial issue data sets, the\ndevelopment of a clear and comprehensive feature set, the introduction of a\nnovel label set, and the validation of the efficacy of an ensemble classifier\nof shallow machine learning techniques.\n","authors":["Büşra Tabak","Fatma Başak Aydemir"],"pdf_url":"https://arxiv.org/pdf/2307.00009v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.12423v3","updated":"2023-08-23T12:02:28Z","published":"2022-05-25T00:55:47Z","title":"Deletion and Insertion Tests in Regression Models","summary":" A basic task in explainable AI (XAI) is to identify the most important\nfeatures behind a prediction made by a black box function $f$. The insertion\nand deletion tests of Petsiuk et al. 
(2018) can be used to judge the quality of\nalgorithms that rank pixels from most to least important for a classification.\nMotivated by regression problems we establish a formula for their area under\nthe curve (AUC) criteria in terms of certain main effects and interactions in\nan anchored decomposition of $f$. We find an expression for the expected value\nof the AUC under a random ordering of inputs to $f$ and propose an alternative\narea above a straight line for the regression setting. We use this criterion to\ncompare feature importances computed by integrated gradients (IG) to those\ncomputed by Kernel SHAP (KS) as well as LIME, DeepLIFT, vanilla gradient and\ninput$\\times$gradient methods. KS has the best overall performance in two\ndatasets we consider but it is very expensive to compute. We find that IG is\nnearly as good as KS while being much faster. Our comparison problems include\nsome binary inputs that pose a challenge to IG because it must use values\nbetween the possible variable levels and so we consider ways to handle binary\nvariables in IG. We show that sorting variables by their Shapley value does not\nnecessarily give the optimal ordering for an insertion-deletion test. It will\nhowever do that for monotone functions of additive models, such as logistic\nregression.\n","authors":["Naofumi Hama","Masayoshi Mase","Art B. Owen"],"pdf_url":"https://arxiv.org/pdf/2205.12423v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12075v1","updated":"2023-08-23T11:48:35Z","published":"2023-08-23T11:48:35Z","title":"Stabilizing RNN Gradients through Pre-training","summary":" Numerous theories of learning suggest to prevent the gradient variance from\nexponential growth with depth or time, to stabilize and improve training.\nTypically, these analyses are conducted on feed-forward fully-connected neural\nnetworks or single-layer recurrent neural networks, given their mathematical\ntractability. In contrast, this study demonstrates that pre-training the\nnetwork to local stability can be effective whenever the architectures are too\ncomplex for an analytical initialization. Furthermore, we extend known\nstability theories to encompass a broader family of deep recurrent networks,\nrequiring minimal assumptions on data and parameter distribution, a theory that\nwe refer to as the Local Stability Condition (LSC). Our investigation reveals\nthat the classical Glorot, He, and Orthogonal initialization schemes satisfy\nthe LSC when applied to feed-forward fully-connected neural networks. However,\nanalysing deep recurrent networks, we identify a new additive source of\nexponential explosion that emerges from counting gradient paths in a\nrectangular grid in depth and time. We propose a new approach to mitigate this\nissue, that consists on giving a weight of a half to the time and depth\ncontributions to the gradient, instead of the classical weight of one. Our\nempirical results confirm that pre-training both feed-forward and recurrent\nnetworks to fulfill the LSC often results in improved final performance across\nmodels. This study contributes to the field by providing a means to stabilize\nnetworks of any complexity. 
Our approach can be implemented as an additional\nstep before pre-training on large augmented datasets, and as an alternative to\nfinding stable initializations analytically.\n","authors":["Luca Herranz-Celotti","Jean Rouat"],"pdf_url":"https://arxiv.org/pdf/2308.12075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12069v1","updated":"2023-08-23T11:31:50Z","published":"2023-08-23T11:31:50Z","title":"Identifying Reaction-Aware Driving Styles of Stochastic Model Predictive\n Controlled Vehicles by Inverse Reinforcement Learning","summary":" The driving style of an Autonomous Vehicle (AV) refers to how it behaves and\ninteracts with other AVs. In a multi-vehicle autonomous driving system, an AV\ncapable of identifying the driving styles of its nearby AVs can reliably\nevaluate the risk of collisions and make more reasonable driving decisions.\nHowever, there has not been a consistent definition of driving styles for an AV\nin the literature, although it is considered that the driving style is encoded\nin the AV's trajectories and can be identified using Maximum Entropy Inverse\nReinforcement Learning (ME-IRL) methods as a cost function. Nevertheless, an\nimportant indicator of the driving style, i.e., how an AV reacts to its nearby\nAVs, is not fully incorporated in the feature design of previous ME-IRL\nmethods. In this paper, we describe the driving style as a cost function of a\nseries of weighted features. We design additional novel features to capture the\nAV's reaction-aware characteristics. Then, we identify the driving styles from\nthe demonstration trajectories generated by the Stochastic Model Predictive\nControl (SMPC) using a modified ME-IRL method with our newly proposed features.\nThe proposed method is validated using MATLAB simulation and an off-the-shelf\nexperiment.\n","authors":["Ni Dang","Tao Shi","Zengjie Zhang","Wanxin Jin","Marion Leibold","Martin Buss"],"pdf_url":"https://arxiv.org/pdf/2308.12069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12067v1","updated":"2023-08-23T11:27:30Z","published":"2023-08-23T11:27:30Z","title":"InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4","summary":" Multimodal large language models acquire their instruction-following\ncapabilities through a two-stage training process: pre-training on image-text\npairs and fine-tuning on supervised vision-language instruction data. Recent\nstudies have shown that large language models can achieve satisfactory results\neven with a limited amount of high-quality instruction-following data. In this\npaper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset\ncomprising only 200 examples, amounting to approximately 6% of the\ninstruction-following data used in the alignment dataset for MiniGPT-4. We\nfirst propose several metrics to assess the quality of multimodal instruction\ndata. Based on these metrics, we present a simple and effective data selector\nto automatically identify and filter low-quality vision-language data. 
By\nemploying this method, InstructionGPT-4 outperforms the original MiniGPT-4 on\nvarious evaluations (e.g., visual question answering, GPT-4 preference).\nOverall, our findings demonstrate that less but high-quality instruction tuning\ndata is efficient to enable multimodal large language models to generate better\noutput.\n","authors":["Lai Wei","Zihao Jiang","Weiran Huang","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12066v1","updated":"2023-08-23T11:25:37Z","published":"2023-08-23T11:25:37Z","title":"Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable\n Mixture-of-Expert Inference","summary":" Large language models (LLMs) based on transformers have made significant\nstrides in recent years, the success of which is driven by scaling up their\nmodel size. Despite their high algorithmic performance, the computational and\nmemory requirements of LLMs present unprecedented challenges. To tackle the\nhigh compute requirements of LLMs, the Mixture-of-Experts (MoE) architecture\nwas introduced which is able to scale its model size without proportionally\nscaling up its computational requirements. Unfortunately, MoE's high memory\ndemands and dynamic activation of sparse experts restrict its applicability to\nreal-world problems. Previous solutions that offload MoE's memory-hungry expert\nparameters to CPU memory fall short because the latency to migrate activated\nexperts from CPU to GPU incurs high performance overhead. Our proposed\nPre-gated MoE system effectively tackles the compute and memory challenges of\nconventional MoE architectures using our algorithm-system co-design. Pre-gated\nMoE employs our novel pre-gating function which alleviates the dynamic nature\nof sparse expert activation, allowing our proposed system to address the large\nmemory footprint of MoEs while also achieving high performance. We demonstrate\nthat Pre-gated MoE is able to improve performance, reduce GPU memory\nconsumption, while also maintaining the same level of model quality. These\nfeatures allow our Pre-gated MoE system to cost-effectively deploy large-scale\nLLMs using just a single GPU with high performance.\n","authors":["Ranggi Hwang","Jianyu Wei","Shijie Cao","Changho Hwang","Xiaohu Tang","Ting Cao","Mao Yang","Minsoo Rhu"],"pdf_url":"https://arxiv.org/pdf/2308.12066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12065v1","updated":"2023-08-23T11:24:28Z","published":"2023-08-23T11:24:28Z","title":"Ensembling Uncertainty Measures to Improve Safety of Black-Box\n Classifiers","summary":" Machine Learning (ML) algorithms that perform classification may predict the\nwrong class, experiencing misclassifications. It is well-known that\nmisclassifications may have cascading effects on the encompassing system,\npossibly resulting in critical failures. This paper proposes SPROUT, a Safety\nwraPper thROugh ensembles of UncertainTy measures, which suspects\nmisclassifications by computing uncertainty measures on the inputs and outputs\nof a black-box classifier. If a misclassification is detected, SPROUT blocks\nthe propagation of the output of the classifier to the encompassing system. The\nresulting impact on safety is that SPROUT transforms erratic outputs\n(misclassifications) into data omission failures, which can be easily managed\nat the system level. SPROUT has a broad range of applications as it fits binary\nand multi-class classification, comprising image and tabular datasets. 
We\nexperimentally show that SPROUT always identifies a huge fraction of the\nmisclassifications of supervised classifiers, and it is able to detect all\nmisclassifications in specific cases. SPROUT implementation contains\npre-trained wrappers, it is publicly available and ready to be deployed with\nminimal effort.\n","authors":["Tommaso Zoppi","Andrea Ceccarelli","Andrea Bondavalli"],"pdf_url":"https://arxiv.org/pdf/2308.12065v1.pdf","comment":"To appear at ECAI23 in October23"},{"id":"http://arxiv.org/abs/2301.10137v2","updated":"2023-08-23T11:17:19Z","published":"2023-01-12T13:53:27Z","title":"Dirac signal processing of higher-order topological signals","summary":" Higher-order networks can sustain topological signals which are variables\nassociated not only to the nodes, but also to the links, to the triangles and\nin general to the higher dimensional simplices of simplicial complexes. These\ntopological signals can describe a large variety of real systems including\ncurrents in the ocean, synaptic currents between neurons and biological\ntransportation networks. In real scenarios topological signal data might be\nnoisy and an important task is to process these signals by improving their\nsignal to noise ratio. So far topological signals are typically processed\nindependently of each other. For instance, node signals are processed\nindependently of link signals, and algorithms that can enforce a consistent\nprocessing of topological signals across different dimensions are largely\nlacking. Here we propose Dirac signal processing, an adaptive, unsupervised\nsignal processing algorithm that learns to jointly filter topological signals\nsupported on nodes, links and triangles of simplicial complexes in a consistent\nway. The proposed Dirac signal processing algorithm is formulated in terms of\nthe discrete Dirac operator which can be interpreted as \"square root\" of a\nhigher-order Hodge Laplacian. We discuss in detail the properties of the Dirac\noperator including its spectrum and the chirality of its eigenvectors and we\nadopt this operator to formulate Dirac signal processing that can filter noisy\nsignals defined on nodes, links and triangles of simplicial complexes. We test\nour algorithms on noisy synthetic data and noisy data of drifters in the ocean\nand find that the algorithm can learn to efficiently reconstruct the true\nsignals outperforming algorithms based exclusively on the Hodge Laplacian.\n","authors":["Lucille Calmon","Michael T. Schaub","Ginestra Bianconi"],"pdf_url":"https://arxiv.org/pdf/2301.10137v2.pdf","comment":"(26 pages, 12 figures)"},{"id":"http://arxiv.org/abs/2210.13708v3","updated":"2023-08-23T11:15:21Z","published":"2022-10-11T03:11:12Z","title":"MARLlib: A Scalable and Efficient Multi-agent Reinforcement Learning\n Library","summary":" A significant challenge facing researchers in the area of multi-agent\nreinforcement learning (MARL) pertains to the identification of a library that\ncan offer fast and compatible development for multi-agent tasks and algorithm\ncombinations, while obviating the need to consider compatibility issues. In\nthis paper, we present MARLlib, a library designed to address the\naforementioned challenge by leveraging three key mechanisms: 1) a standardized\nmulti-agent environment wrapper, 2) an agent-level algorithm implementation,\nand 3) a flexible policy mapping strategy. 
By utilizing these mechanisms,\nMARLlib can effectively disentangle the intertwined nature of the multi-agent\ntask and the learning process of the algorithm, with the ability to\nautomatically alter the training strategy based on the current task's\nattributes. The MARLlib library's source code is publicly accessible on GitHub:\n\\url{https://github.com/Replicable-MARL/MARLlib}.\n","authors":["Siyi Hu","Yifan Zhong","Minquan Gao","Weixun Wang","Hao Dong","Xiaodan Liang","Zhihui Li","Xiaojun Chang","Yaodong Yang"],"pdf_url":"https://arxiv.org/pdf/2210.13708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12061v1","updated":"2023-08-23T11:03:28Z","published":"2023-08-23T11:03:28Z","title":"HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using\n Harvest Piles and Remote Sensing","summary":" Small farms contribute to a large share of the productive land in developing\ncountries. In regions such as sub-Saharan Africa, where 80% of farms are small\n(under 2 ha in size), the task of mapping smallholder cropland is an important\npart of tracking sustainability measures such as crop productivity. However,\nthe visually diverse and nuanced appearance of small farms has limited the\neffectiveness of traditional approaches to cropland mapping. Here we introduce\na new approach based on the detection of harvest piles characteristic of many\nsmallholder systems throughout the world. We present HarvestNet, a dataset for\nmapping the presence of farms in the Ethiopian regions of Tigray and Amhara\nduring 2020-2023, collected using expert knowledge and satellite images,\ntotaling 7k hand-labeled images and 2k ground collected labels. We also\nbenchmark a set of baselines including SOTA models in remote sensing with our\nbest models having around 80% classification performance on hand labelled data\nand 90%, 98% accuracy on ground truth data for Tigray, Amhara respectively. We\nalso perform a visual comparison with a widely used pre-existing coverage map\nand show that our model detects an extra 56,621 hectares of cropland in Tigray.\nWe conclude that remote sensing of harvest piles can contribute to more timely\nand accurate cropland assessments in food insecure region.\n","authors":["Jonathan Xu","Amna Elmustafa","Liya Weldegebriel","Emnet Negash","Richard Lee","Chenlin Meng","Stefano Ermon","David Lobell"],"pdf_url":"https://arxiv.org/pdf/2308.12061v1.pdf","comment":"18 pages, 22 figures"},{"id":"http://arxiv.org/abs/2308.12059v1","updated":"2023-08-23T10:59:41Z","published":"2023-08-23T10:59:41Z","title":"Manipulating Embeddings of Stable Diffusion Prompts","summary":" Generative text-to-image models such as Stable Diffusion allow users to\ngenerate images based on a textual description, the prompt. Changing the prompt\nis still the primary means for the user to change a generated image as desired.\nHowever, changing the image by reformulating the prompt remains a difficult\nprocess of trial and error, which has led to the emergence of prompt\nengineering as a new field of research. We propose and analyze methods to\nchange the embedding of a prompt directly instead of the prompt text. It allows\nfor more fine-grained and targeted control that takes into account user\nintentions. Our approach treats the generative text-to-image model as a\ncontinuous function and passes gradients between the image space and the prompt\nembedding space. 
By addressing different user interaction problems, we can\napply this idea in three scenarios: (1) Optimization of a metric defined in\nimage space that could measure, for example, image style. (2) Assistance of\nusers in creative tasks by enabling them to navigate the image space along a\nselection of directions of \"near\" prompt embeddings. (3) Changing the embedding\nof the prompt to include information that the user has seen in a particular\nseed but finds difficult to describe in the prompt. Our experiments demonstrate\nthe feasibility of the described methods.\n","authors":["Niklas Deckers","Julia Peters","Martin Potthast"],"pdf_url":"https://arxiv.org/pdf/2308.12059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12054v1","updated":"2023-08-23T10:51:33Z","published":"2023-08-23T10:51:33Z","title":"Sample Complexity of Robust Learning against Evasion Attacks","summary":" It is becoming increasingly important to understand the vulnerability of\nmachine learning models to adversarial attacks. One of the fundamental problems\nin adversarial machine learning is to quantify how much training data is needed\nin the presence of evasion attacks, where data is corrupted at test time. In\nthis thesis, we work with the exact-in-the-ball notion of robustness and study\nthe feasibility of adversarially robust learning from the perspective of\nlearning theory, considering sample complexity.\n We first explore the setting where the learner has access to random examples\nonly, and show that distributional assumptions are essential. We then focus on\nlearning problems with distributions on the input data that satisfy a Lipschitz\ncondition and show that robustly learning monotone conjunctions has sample\ncomplexity at least exponential in the adversary's budget (the maximum number\nof bits it can perturb on each input). However, if the adversary is restricted\nto perturbing $O(\\log n)$ bits, then one can robustly learn conjunctions and\ndecision lists w.r.t. log-Lipschitz distributions.\n We then study learning models where the learner is given more power. We first\nconsider local membership queries, where the learner can query the label of\npoints near the training sample. We show that, under the uniform distribution,\nthe exponential dependence on the adversary's budget to robustly learn\nconjunctions remains inevitable. We then introduce a local equivalence query\noracle, which returns whether the hypothesis and target concept agree in a\ngiven region around a point in the training sample, and a counterexample if it\nexists. We show that if the query radius is equal to the adversary's budget, we\ncan develop robust empirical risk minimization algorithms in the\ndistribution-free setting. We give general query complexity upper and lower\nbounds, as well as for concrete concept classes.\n","authors":["Pascale Gourdeau"],"pdf_url":"https://arxiv.org/pdf/2308.12054v1.pdf","comment":"DPhil (PhD) Thesis - University of Oxford"},{"id":"http://arxiv.org/abs/2308.12053v1","updated":"2023-08-23T10:48:28Z","published":"2023-08-23T10:48:28Z","title":"Layer-wise Feedback Propagation","summary":" In this paper, we present Layer-wise Feedback Propagation (LFP), a novel\ntraining approach for neural-network-like predictors that utilizes\nexplainability, specifically Layer-wise Relevance Propagation(LRP), to assign\nrewards to individual connections based on their respective contributions to\nsolving a given task. 
This differs from traditional gradient descent, which\nupdates parameters towards an estimated loss minimum. LFP distributes a reward\nsignal throughout the model without the need for gradient computations. It then\nstrengthens structures that receive positive feedback while reducing the\ninfluence of structures that receive negative feedback. We establish the\nconvergence of LFP theoretically and empirically, and demonstrate its\neffectiveness in achieving comparable performance to gradient descent on\nvarious models and datasets. Notably, LFP overcomes certain limitations\nassociated with gradient-based methods, such as reliance on meaningful\nderivatives. We further investigate how the different LRP-rules can be extended\nto LFP, what their effects are on training, as well as potential applications,\nsuch as training models with no meaningful derivatives, e.g., step-function\nactivated Spiking Neural Networks (SNNs), or for transfer learning, to\nefficiently utilize existing knowledge.\n","authors":["Leander Weber","Jim Berend","Alexander Binder","Thomas Wiegand","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2308.12053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.11418v2","updated":"2023-08-23T10:42:09Z","published":"2022-04-25T03:32:17Z","title":"Riemannian Hamiltonian methods for min-max optimization on manifolds","summary":" In this paper, we study min-max optimization problems on Riemannian\nmanifolds. We introduce a Riemannian Hamiltonian function, minimization of\nwhich serves as a proxy for solving the original min-max problems. Under the\nRiemannian Polyak--{\L}ojasiewicz condition on the Hamiltonian function, its\nminimizer corresponds to the desired min-max saddle point. We also provide\ncases where this condition is satisfied. For geodesic-bilinear optimization in\nparticular, solving the proxy problem leads to the correct search direction\ntowards global optimality, which becomes challenging with the min-max\nformulation. To minimize the Hamiltonian function, we propose Riemannian\nHamiltonian methods (RHM) and present their convergence analyses. We extend RHM\nto include consensus regularization and to the stochastic setting. We\nillustrate the efficacy of the proposed RHM in applications such as subspace\nrobust Wasserstein distance, robust training of neural networks, and generative\nadversarial networks.\n","authors":["Andi Han","Bamdev Mishra","Pratik Jawanpuria","Pawan Kumar","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2204.11418v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10276v2","updated":"2023-08-23T10:10:49Z","published":"2023-08-20T14:12:11Z","title":"Minimalist Traffic Prediction: Linear Layer Is All You Need","summary":" Traffic prediction is essential for the progression of Intelligent\nTransportation Systems (ITS) and the vision of smart cities. While\nSpatial-Temporal Graph Neural Networks (STGNNs) have shown promise in this\ndomain by leveraging Graph Neural Networks (GNNs) integrated with either RNNs\nor Transformers, they present challenges such as computational complexity,\ngradient issues, and resource-intensiveness. This paper addresses these\nchallenges, advocating for three main solutions: a node-embedding approach,\ntime series decomposition, and periodicity learning. We introduce STLinear, a\nminimalist model architecture designed for optimized efficiency and\nperformance. 
Unlike traditional STGNNs, STlinear operates fully locally,\navoiding inter-node data exchanges, and relies exclusively on linear layers,\ndrastically cutting computational demands. Our empirical studies on real-world\ndatasets confirm STLinear's prowess, matching or exceeding the accuracy of\nleading STGNNs, but with significantly reduced complexity and computation\noverhead (more than 95% reduction in MACs per epoch compared to\nstate-of-the-art STGNN baseline published in 2023). In summary, STLinear\nemerges as a potent, efficient alternative to conventional STGNNs, with\nprofound implications for the future of ITS and smart city initiatives.\n","authors":["Wenying Duan","Hong Rao","Wei Huang","Xiaoxi He"],"pdf_url":"https://arxiv.org/pdf/2308.10276v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.12044v1","updated":"2023-08-23T10:08:52Z","published":"2023-08-23T10:08:52Z","title":"A multiobjective continuation method to compute the regularization path\n of deep neural networks","summary":" Sparsity is a highly desired feature in deep neural networks (DNNs) since it\nensures numerical efficiency, improves the interpretability of models (due to\nthe smaller number of relevant features), and robustness. In machine learning\napproaches based on linear models, it is well known that there exists a\nconnecting path between the sparsest solution in terms of the $\\ell^1$ norm\n(i.e., zero weights) and the non-regularized solution, which is called the\nregularization path. Very recently, there was a first attempt to extend the\nconcept of regularization paths to DNNs by means of treating the empirical loss\nand sparsity ($\\ell^1$ norm) as two conflicting criteria and solving the\nresulting multiobjective optimization problem. However, due to the\nnon-smoothness of the $\\ell^1$ norm and the high number of parameters, this\napproach is not very efficient from a computational perspective. To overcome\nthis limitation, we present an algorithm that allows for the approximation of\nthe entire Pareto front for the above-mentioned objectives in a very efficient\nmanner. We present numerical examples using both deterministic and stochastic\ngradients. We furthermore demonstrate that knowledge of the regularization path\nallows for a well-generalizing network parametrization.\n","authors":["Augustina C. Amakor","Konstantin Sontag","Sebastian Peitz"],"pdf_url":"https://arxiv.org/pdf/2308.12044v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.12043v1","updated":"2023-08-23T10:08:10Z","published":"2023-08-23T10:08:10Z","title":"IncreLoRA: Incremental Parameter Allocation Method for\n Parameter-Efficient Fine-tuning","summary":" With the increasing size of pre-trained language models (PLMs), fine-tuning\nall the parameters in the model is not efficient, especially when there are a\nlarge number of downstream tasks, which incur significant training and storage\ncosts. Many parameter-efficient fine-tuning (PEFT) approaches have been\nproposed, among which, Low-Rank Adaptation (LoRA) is a representative approach\nthat injects trainable rank decomposition matrices into every target module.\nYet LoRA ignores the importance of parameters in different modules. To address\nthis problem, many works have been proposed to prune the parameters of LoRA.\nHowever, under limited training conditions, the upper bound of the rank of the\npruned parameter matrix is still affected by the preset values. 
We, therefore,\npropose IncreLoRA, an incremental parameter allocation method that adaptively\nadds trainable parameters during training based on the importance scores of\neach module. This approach is different from the pruning method as it is not\nlimited by the initial number of training parameters, and each parameter matrix\nhas a higher rank upper bound for the same training overhead. We conduct\nextensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA.\nThe results show that our method owns higher parameter efficiency, especially\nwhen under the low-resource settings where our method significantly outperforms\nthe baselines. Our code is publicly available.\n","authors":["Feiyu Zhang","Liangzhi Li","Junhao Chen","Zhouqiang Jiang","Bowen Wang","Yiming Qian"],"pdf_url":"https://arxiv.org/pdf/2308.12043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15782v3","updated":"2023-08-23T10:02:15Z","published":"2023-06-27T20:09:56Z","title":"UTRNet: High-Resolution Urdu Text Recognition In Printed Documents","summary":" In this paper, we propose a novel approach to address the challenges of\nprinted Urdu text recognition using high-resolution, multi-scale semantic\nfeature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model,\ndemonstrates state-of-the-art performance on benchmark datasets. To address the\nlimitations of previous works, which struggle to generalize to the intricacies\nof the Urdu script and the lack of sufficient annotated real-world data, we\nhave introduced the UTRSet-Real, a large-scale annotated real-world dataset\ncomprising over 11,000 lines and UTRSet-Synth, a synthetic dataset with 20,000\nlines closely resembling real-world and made corrections to the ground truth of\nthe existing IIITH dataset, making it a more reliable resource for future\nresearch. We also provide UrduDoc, a benchmark dataset for Urdu text line\ndetection in scanned documents. Additionally, we have developed an online tool\nfor end-to-end Urdu OCR from printed documents by integrating UTRNet with a\ntext detection model. Our work not only addresses the current limitations of\nUrdu OCR but also paves the way for future research in this area and\nfacilitates the continued advancement of Urdu OCR technology. The project page\nwith source code, datasets, annotations, trained models, and online tool is\navailable at abdur75648.github.io/UTRNet.\n","authors":["Abdur Rahman","Arjun Ghosh","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2306.15782v3.pdf","comment":"Accepted at The 17th International Conference on Document Analysis\n and Recognition (ICDAR 2023)"},{"id":"http://arxiv.org/abs/2308.12031v1","updated":"2023-08-23T09:44:12Z","published":"2023-08-23T09:44:12Z","title":"CACTUS: a Comprehensive Abstraction and Classification Tool for\n Uncovering Structures","summary":" The availability of large data sets is providing an impetus for driving\ncurrent artificial intelligent developments. There are, however, challenges for\ndeveloping solutions with small data sets due to practical and cost-effective\ndeployment and the opacity of deep learning models. The Comprehensive\nAbstraction and Classification Tool for Uncovering Structures called CACTUS is\npresented for improved secure analytics by effectively employing explainable\nartificial intelligence. It provides additional support for categorical\nattributes, preserving their original meaning, optimising memory usage, and\nspeeding up the computation through parallelisation. 
It shows to the user the\nfrequency of the attributes in each class and ranks them by their\ndiscriminative power. Its performance is assessed by application to the\nWisconsin diagnostic breast cancer and Thyroid0387 data sets.\n","authors":["Luca Gherardini","Varun Ravi Varma","Karol Capala","Roger Woods","Jose Sousa"],"pdf_url":"https://arxiv.org/pdf/2308.12031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12030v1","updated":"2023-08-23T09:43:10Z","published":"2023-08-23T09:43:10Z","title":"Prompt-Based Length Controlled Generation with Reinforcement Learning","summary":" Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted\ngreat attention given their surprising improvement and performance. Length\ncontrolled generation of LLMs emerges as an important topic, which also enables\nusers to fully leverage the capability of LLMs in more real-world scenarios\nlike generating a proper answer or essay of a desired length. In addition, the\nautoregressive generation in LLMs is extremely time-consuming, while the\nability of controlling this generated length can arbitrarily reduce the\ninference cost by limiting the length, and thus satisfy different needs.\nTherefore, we aim to propose a prompt-based length control method to achieve\nthis length controlled generation, which can also be widely applied in\nGPT-style LLMs. In particular, we adopt reinforcement learning with the reward\nsignal given by either trainable or rule-based reward model, which further\naffects the generation of LLMs via rewarding a pre-defined target length.\nExperiments show that our method significantly improves the accuracy of\nprompt-based length control for summarization task on popular datasets like\nCNNDM and NYT. We believe this length-controllable ability can provide more\npotentials towards the era of LLMs.\n","authors":["Renlong Jie","Xiaojun Meng","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12029v1","updated":"2023-08-23T09:41:28Z","published":"2023-08-23T09:41:28Z","title":"A Scale-Invariant Task Balancing Approach for Multi-Task Learning","summary":" Multi-task learning (MTL), a learning paradigm to learn multiple related\ntasks simultaneously, has achieved great success in various fields. However,\ntask-balancing remains a significant challenge in MTL, with the disparity in\nloss/gradient scales often leading to performance compromises. In this paper,\nwe propose a Scale-Invariant Multi-Task Learning (SI-MTL) method to alleviate\nthe task-balancing problem from both loss and gradient perspectives.\nSpecifically, SI-MTL contains a logarithm transformation which is performed on\nall task losses to ensure scale-invariant at the loss level, and a gradient\nbalancing method, SI-G, which normalizes all task gradients to the same\nmagnitude as the maximum gradient norm. 
Extensive experiments conducted on\nseveral benchmark datasets consistently demonstrate the effectiveness of SI-G\nand the state-of-the-art performance of SI-MTL.\n","authors":["Baijiong Lin","Weisen Jiang","Feiyang Ye","Yu Zhang","Pengguang Chen","Ying-Cong Chen","Shu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.12029v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2308.07758v3","updated":"2023-08-23T09:35:33Z","published":"2023-08-15T13:19:59Z","title":"Forward-Backward Reasoning in Large Language Models for Verification","summary":" Chain-of-Thought (CoT) prompting has shown promising performance in various\nreasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency}\nproposes to sample a diverse set of reasoning chains which may lead to\ndifferent answers while the answer that receives the most votes is selected. In\nthis paper, we propose a novel method to use backward reasoning in verifying\ncandidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM\nto predict the masked token when a candidate answer is provided by \textit{a\nsimple template}, i.e., \"\textit{\textbf{If we know the answer of the above\nquestion is \{a candidate answer\}, what is the value of unknown variable ${\bf\nx}$?}}\" Intuitively, the LLM is expected to predict the masked token\nsuccessfully if the provided candidate answer is correct. We further propose\nFOBAR to combine forward and backward reasoning for estimating the probability\nof candidate answers. We conduct extensive experiments on six data sets and\nthree LLMs. Experimental results demonstrate that FOBAR achieves\nstate-of-the-art performance on various reasoning benchmarks.\n","authors":["Weisen Jiang","Han Shi","Longhui Yu","Zhengying Liu","Yu Zhang","Zhenguo Li","James T. Kwok"],"pdf_url":"https://arxiv.org/pdf/2308.07758v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2302.13991v2","updated":"2023-08-23T09:27:19Z","published":"2023-02-27T17:30:00Z","title":"Learning to Generalize towards Unseen Domains via a Content-Aware Style\n Invariant Model for Disease Detection from Chest X-rays","summary":" Performance degradation due to source domain mismatch is a longstanding\nchallenge in deep learning-based medical image analysis, particularly for chest\nX-rays (CXRs). Several methods (e.g., adversarial training, multi-domain\nmixups) have been proposed to extract domain-invariant high-level features to\naddress this domain shift. However, these methods do not explicitly regularize\nthe content and style characteristics of the extracted domain-invariant\nfeatures. Recent studies have demonstrated that CNN models exhibit a strong\nbias toward styles (e.g., uninformative textures) rather than content (e.g.,\nshape), in stark contrast to the human-vision system. Radiologists tend to\nlearn visual cues from CXRs and thus perform well across multiple domains.\nTherefore, in medical imaging for pathology diagnosis from CXR images, models\nshould extract domain-invariant features that are style-invariant and\ncontent-biased. Motivated by this, we employ the novel style randomization\nmodules (SRMs) at both image and feature levels that work together\nhierarchically to create rich style perturbed features on the fly while keeping\nthe content intact. 
In addition, we leverage consistency regularizations\nbetween global semantic features and predicted probability distributions,\nrespectively, for with and without style perturbed versions of the same CXR\nimage to tweak the model's sensitivity toward content markers for accurate\npredictions. Extensive experiments with three large-scale thoracic disease\ndatasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed\nframework is more robust in the presence of domain shift and achieves\nstate-of-the-art performance.\n","authors":["Mohammad Zunaed","Md. Aynal Haque","Taufiq Hasan"],"pdf_url":"https://arxiv.org/pdf/2302.13991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12018v1","updated":"2023-08-23T09:20:41Z","published":"2023-08-23T09:20:41Z","title":"Bias-Aware Minimisation: Understanding and Mitigating Estimator Bias in\n Private SGD","summary":" Differentially private SGD (DP-SGD) holds the promise of enabling the safe\nand responsible application of machine learning to sensitive datasets. However,\nDP-SGD only provides a biased, noisy estimate of a mini-batch gradient. This\nrenders optimisation steps less effective and limits model utility as a result.\nWith this work, we show a connection between per-sample gradient norms and the\nestimation bias of the private gradient oracle used in DP-SGD. Here, we propose\nBias-Aware Minimisation (BAM) that allows for the provable reduction of private\ngradient estimator bias. We show how to efficiently compute quantities needed\nfor BAM to scale to large neural networks and highlight similarities to closely\nrelated methods such as Sharpness-Aware Minimisation. Finally, we provide\nempirical evidence that BAM not only reduces bias but also substantially\nimproves privacy-utility trade-offs on the CIFAR-10, CIFAR-100, and ImageNet-32\ndatasets.\n","authors":["Moritz Knolle","Robert Dorfman","Alexander Ziller","Daniel Rueckert","Georgios Kaissis"],"pdf_url":"https://arxiv.org/pdf/2308.12018v1.pdf","comment":"Accepted to the 2023 Theory and Practice of Differential Privacy\n (TPDP) Workshop"},{"id":"http://arxiv.org/abs/2308.12016v1","updated":"2023-08-23T09:18:41Z","published":"2023-08-23T09:18:41Z","title":"MKL-$L_{0/1}$-SVM","summary":" This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework\nfor the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some\nfirst-order optimality conditions are given and then exploited to develop a\nfast ADMM solver to deal with the nonconvex and nonsmooth optimization problem.\nExtensive numerical experiments on synthetic and real datasets show that the\nperformance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading\napproaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and\nGrandvalet [Journal of Machine Learning Research, vol. 9, pp. 2491-2521, 2008].\n","authors":["Bin Zhu","Yijie Shi"],"pdf_url":"https://arxiv.org/pdf/2308.12016v1.pdf","comment":"25 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin\n note: substantial text overlap with arXiv:2303.04445"},{"id":"http://arxiv.org/abs/2308.12013v1","updated":"2023-08-23T09:09:32Z","published":"2023-08-23T09:09:32Z","title":"Quantum-Noise-driven Generative Diffusion Models","summary":" Generative models realized with machine learning techniques are powerful\ntools to infer complex and unknown data distributions from a finite number of\ntraining samples in order to produce new synthetic data. 
Diffusion models are\nan emerging framework that have recently overcome the performance of the\ngenerative adversarial networks in creating synthetic text and high-quality\nimages. Here, we propose and discuss the quantum generalization of diffusion\nmodels, i.e., three quantum-noise-driven generative diffusion models that could\nbe experimentally tested on real quantum systems. The idea is to harness unique\nquantum features, in particular the non-trivial interplay among coherence,\nentanglement and noise that the currently available noisy quantum processors do\nunavoidably suffer from, in order to overcome the main computational burdens of\nclassical diffusion models during inference. Hence, we suggest to exploit\nquantum noise not as an issue to be detected and solved but instead as a very\nremarkably beneficial key ingredient to generate much more complex probability\ndistributions that would be difficult or even impossible to express\nclassically, and from which a quantum processor might sample more efficiently\nthan a classical one. Therefore, our results are expected to pave the way for\nnew quantum-inspired or quantum-based generative diffusion algorithms\naddressing more powerfully classical tasks as data generation/prediction with\nwidespread real-world applications ranging from climate forecasting to\nneuroscience, from traffic flow analysis to financial forecasting.\n","authors":["Marco Parigi","Stefano Martina","Filippo Caruso"],"pdf_url":"https://arxiv.org/pdf/2308.12013v1.pdf","comment":"13 pages, 2 figures"},{"id":"http://arxiv.org/abs/2301.04785v3","updated":"2023-08-23T09:06:18Z","published":"2023-01-12T02:25:22Z","title":"Phase-shifted Adversarial Training","summary":" Adversarial training has been considered an imperative component for safely\ndeploying neural network-based applications to the real world. To achieve\nstronger robustness, existing methods primarily focus on how to generate strong\nattacks by increasing the number of update steps, regularizing the models with\nthe smoothed loss function, and injecting the randomness into the attack.\nInstead, we analyze the behavior of adversarial training through the lens of\nresponse frequency. We empirically discover that adversarial training causes\nneural networks to have low convergence to high-frequency information,\nresulting in highly oscillated predictions near each data. To learn\nhigh-frequency contents efficiently and effectively, we first prove that a\nuniversal phenomenon of frequency principle, i.e., \\textit{lower frequencies\nare learned first}, still holds in adversarial training. Based on that, we\npropose phase-shifted adversarial training (PhaseAT) in which the model learns\nhigh-frequency components by shifting these frequencies to the low-frequency\nrange where the fast convergence occurs. For evaluations, we conduct the\nexperiments on CIFAR-10 and ImageNet with the adaptive attack carefully\ndesigned for reliable evaluation. Comprehensive results show that PhaseAT\nsignificantly improves the convergence for high-frequency information. 
This\nresults in improved adversarial robustness by enabling the model to have\nsmoothed predictions near each data.\n","authors":["Yeachan Kim","Seongyeon Kim","Ihyeok Seo","Bonggun Shin"],"pdf_url":"https://arxiv.org/pdf/2301.04785v3.pdf","comment":"Conference on Uncertainty in Artificial Intelligence, 2023 (UAI 2023)"},{"id":"http://arxiv.org/abs/2301.05763v3","updated":"2023-08-23T08:57:56Z","published":"2023-01-13T21:24:23Z","title":"A Rigorous Uncertainty-Aware Quantification Framework Is Essential for\n Reproducible and Replicable Machine Learning Workflows","summary":" The ability to replicate predictions by machine learning (ML) or artificial\nintelligence (AI) models and results in scientific workflows that incorporate\nsuch ML/AI predictions is driven by numerous factors. An uncertainty-aware\nmetric that can quantitatively assess the reproducibility of quantities of\ninterest (QoI) would contribute to the trustworthiness of results obtained from\nscientific workflows involving ML/AI models. In this article, we discuss how\nuncertainty quantification (UQ) in a Bayesian paradigm can provide a general\nand rigorous framework for quantifying reproducibility for complex scientific\nworkflows. Such a framework has the potential to fill a critical gap that\ncurrently exists in ML/AI for scientific workflows, as it will enable\nresearchers to determine the impact of ML/AI model prediction variability on\nthe predictive outcomes of ML/AI-powered workflows. We expect that the\nenvisioned framework will contribute to the design of more reproducible and\ntrustworthy workflows for diverse scientific applications, and ultimately,\naccelerate scientific discoveries.\n","authors":["Line Pouchard","Kristofer G. Reyes","Francis J. Alexander","Byung-Jun Yoon"],"pdf_url":"https://arxiv.org/pdf/2301.05763v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10522v3","updated":"2023-08-23T08:49:54Z","published":"2023-08-21T07:19:47Z","title":"Information Theory-Guided Heuristic Progressive Multi-View Coding","summary":" Multi-view representation learning aims to capture comprehensive information\nfrom multiple views of a shared context. Recent works intuitively apply\ncontrastive learning to different views in a pairwise manner, which is still\nscalable: view-specific noise is not filtered in learning view-shared\nrepresentations; the fake negative pairs, where the negative terms are actually\nwithin the same class as the positive, and the real negative pairs are\ncoequally treated; evenly measuring the similarities between terms might\ninterfere with optimization. Importantly, few works study the theoretical\nframework of generalized self-supervised multi-view learning, especially for\nmore than two views. To this end, we rethink the existing multi-view learning\nparadigm from the perspective of information theory and then propose a novel\ninformation theoretical framework for generalized multi-view learning. Guided\nby it, we build a multi-view coding method with a three-tier progressive\narchitecture, namely Information theory-guided hierarchical Progressive\nMulti-view Coding (IPMC). In the distribution-tier, IPMC aligns the\ndistribution between views to reduce view-specific noise. In the set-tier, IPMC\nconstructs self-adjusted contrasting pools, which are adaptively modified by a\nview filter. Lastly, in the instance-tier, we adopt a designed unified loss to\nlearn representations and reduce the gradient interference. 
Theoretically and\nempirically, we demonstrate the superiority of IPMC over state-of-the-art\nmethods.\n","authors":["Jiangmeng Li","Hang Gao","Wenwen Qiang","Changwen Zheng"],"pdf_url":"https://arxiv.org/pdf/2308.10522v3.pdf","comment":"This paper is accepted by the jourcal of Neural Networks (Elsevier)\n by 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344"},{"id":"http://arxiv.org/abs/2308.12002v1","updated":"2023-08-23T08:41:24Z","published":"2023-08-23T08:41:24Z","title":"Neural oscillators for magnetic hysteresis modeling","summary":" Hysteresis is a ubiquitous phenomenon in science and engineering; its\nmodeling and identification are crucial for understanding and optimizing the\nbehavior of various systems. We develop an ordinary differential equation-based\nrecurrent neural network (RNN) approach to model and quantify the hysteresis,\nwhich manifests itself in sequentiality and history-dependence. Our neural\noscillator, HystRNN, draws inspiration from coupled-oscillatory RNN and\nphenomenological hysteresis models to update the hidden states. The performance\nof HystRNN is evaluated to predict generalized scenarios, involving first-order\nreversal curves and minor loops. The findings show the ability of HystRNN to\ngeneralize its behavior to previously untrained regions, an essential feature\nthat hysteresis models must have. This research highlights the advantage of\nneural oscillators over the traditional RNN-based methods in capturing complex\nhysteresis patterns in magnetic materials, where traditional rate-dependent\nmethods are inadequate to capture intrinsic nonlinearity.\n","authors":["Abhishek Chandra","Taniya Kapoor","Bram Daniels","Mitrofan Curti","Koen Tiels","Daniel M. Tartakovsky","Elena A. Lomonova"],"pdf_url":"https://arxiv.org/pdf/2308.12002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12000v1","updated":"2023-08-23T08:38:53Z","published":"2023-08-23T08:38:53Z","title":"On Uniformly Optimal Algorithms for Best Arm Identification in Two-Armed\n Bandits with Fixed Budget","summary":" We study the problem of best-arm identification with fixed budget in\nstochastic two-arm bandits with Bernoulli rewards. We prove that surprisingly,\nthere is no algorithm that (i) performs as well as the algorithm sampling each\narm equally (this algorithm is referred to as the {\\it uniform sampling}\nalgorithm) on all instances, and that (ii) strictly outperforms this algorithm\non at least one instance. In short, there is no algorithm better than the\nuniform sampling algorithm. Towards this result, we introduce the natural class\nof {\\it consistent} and {\\it stable} algorithms, and show that any algorithm\nthat performs as well as the uniform sampling algorithm on all instances\nbelongs to this class. The proof is completed by deriving a lower bound on the\nerror rate satisfied by any consistent and stable algorithm, and by showing\nthat the uniform sampling algorithm matches this lower bound. 
Our results\nprovide a solution to the two open problems presented in \\cite{qin2022open}.\n","authors":["Po-An Wang","Kaito Ariu","Alexandre Proutiere"],"pdf_url":"https://arxiv.org/pdf/2308.12000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11991v1","updated":"2023-08-23T08:25:33Z","published":"2023-08-23T08:25:33Z","title":"Relational Concept Based Models","summary":" The design of interpretable deep learning models working in relational\ndomains poses an open challenge: interpretable deep learning methods, such as\nConcept-Based Models (CBMs), are not designed to solve relational problems,\nwhile relational models are not as interpretable as CBMs. To address this\nproblem, we propose Relational Concept-Based Models, a family of relational\ndeep learning methods providing interpretable task predictions. Our\nexperiments, ranging from image classification to link prediction in knowledge\ngraphs, show that relational CBMs (i) match generalization performance of\nexisting relational black-boxes (as opposed to non-relational CBMs), (ii)\nsupport the generation of quantified concept-based explanations, (iii)\neffectively respond to test-time interventions, and (iv) withstand demanding\nsettings including out-of-distribution scenarios, limited training data\nregimes, and scarce concept supervisions.\n","authors":["Pietro Barbiero","Francesco Giannini","Gabriele Ciravegna","Michelangelo Diligenti","Giuseppe Marra"],"pdf_url":"https://arxiv.org/pdf/2308.11991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11406v2","updated":"2023-08-23T08:00:04Z","published":"2023-08-22T12:53:09Z","title":"Designing an attack-defense game: how to increase robustness of\n financial transaction models via a competition","summary":" Given the escalating risks of malicious attacks in the finance sector and the\nconsequential severe damage, a thorough understanding of adversarial strategies\nand robust defense mechanisms for machine learning models is critical. The\nthreat becomes even more severe with the increased adoption in banks more\naccurate, but potentially fragile neural networks. We aim to investigate the\ncurrent state and dynamics of adversarial attacks and defenses for neural\nnetwork models that use sequential financial data as the input.\n To achieve this goal, we have designed a competition that allows realistic\nand detailed investigation of problems in modern financial transaction data.\nThe participants compete directly against each other, so possible attacks and\ndefenses are examined in close-to-real-life conditions. Our main contributions\nare the analysis of the competition dynamics that answers the questions on how\nimportant it is to conceal a model from malicious users, how long does it take\nto break it, and what techniques one should use to make it more robust, and\nintroduction additional way to attack models or increase their robustness.\n Our analysis continues with a meta-study on the used approaches with their\npower, numerical experiments, and accompanied ablations studies. 
We show that\nthe developed attacks and defenses outperform existing alternatives from the\nliterature while being practical in terms of execution, proving the validity of\nthe competition as a tool for uncovering vulnerabilities of machine learning\nmodels and mitigating them in various domains.\n","authors":["Alexey Zaytsev","Alex Natekin","Evgeni Vorsin","Valerii Smirnov","Georgii Smirnov","Oleg Sidorshin","Alexander Senin","Alexander Dudin","Dmitry Berestnev"],"pdf_url":"https://arxiv.org/pdf/2308.11406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11978v1","updated":"2023-08-23T07:57:45Z","published":"2023-08-23T07:57:45Z","title":"Will More Expressive Graph Neural Networks do Better on Generative\n Tasks?","summary":" Graph generation poses a significant challenge as it involves predicting a\ncomplete graph with multiple nodes and edges based on simply a given label.\nThis task also carries fundamental importance to numerous real-world\napplications, including de-novo drug and molecular design. In recent years,\nseveral successful methods have emerged in the field of graph generation.\nHowever, these approaches suffer from two significant shortcomings: (1) the\nunderlying Graph Neural Network (GNN) architectures used in these methods are\noften underexplored; and (2) these methods are often evaluated on only a\nlimited number of metrics. To fill this gap, we investigate the expressiveness\nof GNNs under the context of the molecular graph generation task, by replacing\nthe underlying GNNs of graph generative models with more expressive GNNs.\nSpecifically, we analyse the performance of six GNNs in two different\ngenerative frameworks (GCPN and GraphAF), on six different molecular generative\nobjectives on the ZINC-250k dataset. Through our extensive experiments, we\ndemonstrate that advanced GNNs can indeed improve the performance of GCPN and\nGraphAF on molecular generation tasks, but GNN expressiveness is not a\nnecessary condition for a good GNN-based generative model. Moreover, we show\nthat GCPN and GraphAF with advanced GNNs can achieve state-of-the-art results\nacross 17 other non-GNN-based graph generative approaches, such as variational\nautoencoders and Bayesian optimisation models, on the proposed molecular\ngenerative objectives (DRD2, Median1, Median2), which are important metrics for\nde-novo molecular design.\n","authors":["Xiandong Zou","Xiangyu Zhao","Pietro Liò","Yiren Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11978v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11975v1","updated":"2023-08-23T07:50:43Z","published":"2023-08-23T07:50:43Z","title":"Approximating Score-based Explanation Techniques Using Conformal\n Regression","summary":" Score-based explainable machine-learning techniques are often used to\nunderstand the logic behind black-box models. However, such explanation\ntechniques are often computationally expensive, which limits their application\nin time-critical contexts. Therefore, we propose and investigate the use of\ncomputationally less costly regression models for approximating the output of\nscore-based explanation techniques, such as SHAP. Moreover, validity guarantees\nfor the approximated values are provided by the employed inductive conformal\nprediction framework. We propose several non-conformity measures designed to\ntake the difficulty of approximating the explanations into account while\nkeeping the computational cost low. 
We present results from a large-scale\nempirical investigation, in which the approximate explanations generated by our\nproposed models are evaluated with respect to efficiency (interval size). The\nresults indicate that the proposed method can significantly improve execution\ntime compared to the fast version of SHAP, TreeSHAP. The results also suggest\nthat the proposed method can produce tight intervals, while providing validity\nguarantees. Moreover, the proposed approach allows for comparing explanations\nof different approximation methods and selecting a method based on how\ninformative (tight) are the predicted intervals.\n","authors":["Amr Alkhatib","Henrik Boström","Sofiane Ennadir","Ulf Johansson"],"pdf_url":"https://arxiv.org/pdf/2308.11975v1.pdf","comment":"20 pages, 14 figures, The 12th Symposium on Conformal and\n Probabilistic Prediction with Applications (COPA 2023)"},{"id":"http://arxiv.org/abs/2306.01792v3","updated":"2023-08-23T07:43:03Z","published":"2023-06-01T08:10:03Z","title":"Task Relation-aware Continual User Representation Learning","summary":" User modeling, which learns to represent users into a low-dimensional\nrepresentation space based on their past behaviors, got a surge of interest\nfrom the industry for providing personalized services to users. Previous\nefforts in user modeling mainly focus on learning a task-specific user\nrepresentation that is designed for a single task. However, since learning\ntask-specific user representations for every task is infeasible, recent studies\nintroduce the concept of universal user representation, which is a more\ngeneralized representation of a user that is relevant to a variety of tasks.\nDespite their effectiveness, existing approaches for learning universal user\nrepresentations are impractical in real-world applications due to the data\nrequirement, catastrophic forgetting and the limited learning capability for\ncontinually added tasks. In this paper, we propose a novel continual user\nrepresentation learning method, called TERACON, whose learning capability is\nnot limited as the number of learned tasks increases while capturing the\nrelationship between the tasks. The main idea is to introduce an embedding for\neach task, i.e., task embedding, which is utilized to generate task-specific\nsoft masks that not only allow the entire model parameters to be updated until\nthe end of training sequence, but also facilitate the relationship between the\ntasks to be captured. Moreover, we introduce a novel knowledge retention module\nwith pseudo-labeling strategy that successfully alleviates the long-standing\nproblem of continual learning, i.e., catastrophic forgetting. Extensive\nexperiments on public and proprietary real-world datasets demonstrate the\nsuperiority and practicality of TERACON. Our code is available at\nhttps://github.com/Sein-Kim/TERACON.\n","authors":["Sein Kim","Namkyeong Lee","Donghyun Kim","Minchul Yang","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2306.01792v3.pdf","comment":"KDD 2023"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. 
In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11969v1","updated":"2023-08-23T07:30:16Z","published":"2023-08-23T07:30:16Z","title":"Anisotropic Hybrid Networks for liver tumor segmentation with\n uncertainty quantification","summary":" The burden of liver tumors is important, ranking as the fourth leading cause\nof cancer mortality. In case of hepatocellular carcinoma (HCC), the delineation\nof liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is\nperformed to guide the treatment strategy. As this task is time-consuming,\nneeds high expertise and could be subject to inter-observer variability there\nis a strong need for automatic tools. However, challenges arise from the lack\nof available training data, as well as the high variability in terms of image\nresolution and MRI sequence. In this work we propose to compare two different\npipelines based on anisotropic models to obtain the segmentation of the liver\nand tumors. The first pipeline corresponds to a baseline multi-class model that\nperforms the simultaneous segmentation of the liver and tumor classes. In the\nsecond approach, we train two distinct binary models, one segmenting the liver\nonly and the other the tumors. Our results show that both pipelines exhibit\ndifferent strengths and weaknesses. Moreover we propose an uncertainty\nquantification strategy allowing the identification of potential false positive\ntumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge\nregarding liver and tumor segmentation.\n","authors":["Benjamin Lambert","Pauline Roca","Florence Forbes","Senan Doyle","Michel Dojat"],"pdf_url":"https://arxiv.org/pdf/2308.11969v1.pdf","comment":"Accepted for presentation at MICCAI Workshop on 2nd\n Resource-Efficient Medical Image Analysis (REMIA)"},{"id":"http://arxiv.org/abs/2302.06912v3","updated":"2023-08-23T07:27:20Z","published":"2023-02-14T08:56:50Z","title":"Regret-Based Optimization for Robust Reinforcement Learning","summary":" Deep Reinforcement Learning (DRL) policies have been shown to be vulnerable\nto small adversarial noise in observations. 
Such adversarial noise can have\ndisastrous consequences in safety-critical environments. For instance, a\nself-driving car receiving adversarially perturbed sensory observations about\nnearby signs (e.g., a stop sign physically altered to be perceived as a speed\nlimit sign) or objects (e.g., cars altered to be recognized as trees) can be\nfatal. Existing approaches for making RL algorithms robust to an\nobservation-perturbing adversary have focused on reactive approaches that\niteratively improve against adversarial examples generated at each iteration.\nWhile such approaches have been shown to provide improvements over regular RL\nmethods, they are reactive and can fare significantly worse if certain\ncategories of adversarial examples are not generated during training. To that\nend, we pursue a more proactive approach that relies on directly optimizing a\nwell-studied robustness measure, regret instead of expected value. We provide a\nprincipled approach that minimizes maximum regret over a \"neighborhood\" of\nobservations to the received \"observation\". Our regret criterion can be used to\nmodify existing value- and policy-based Deep RL methods. We demonstrate that\nour approaches provide a significant improvement in performance across a wide\nvariety of benchmarks against leading approaches for robust Deep RL.\n","authors":["Roman Belaire","Pradeep Varakantham","Thanh Nguyen","David Lo"],"pdf_url":"https://arxiv.org/pdf/2302.06912v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09091v2","updated":"2023-08-23T07:23:17Z","published":"2023-01-22T10:17:02Z","title":"BallGAN: 3D-aware Image Synthesis with a Spherical Background","summary":" 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be\nrendered in arbitrary perspectives to produce images. Although previous methods\nproduce realistic images, they suffer from unstable training or degenerate\nsolutions where the 3D geometry is unnatural. We hypothesize that the 3D\ngeometry is underdetermined due to the insufficient constraint, i.e., being\nclassified as real image to the discriminator is not enough. To solve this\nproblem, we propose to approximate the background as a spherical surface and\nrepresent a scene as a union of the foreground placed in the sphere and the\nthin spherical background. It reduces the degree of freedom in the background\nfield. Accordingly, we modify the volume rendering equation and incorporate\ndedicated constraints to design a novel 3D-aware GAN framework named BallGAN.\nBallGAN has multiple advantages as follows. 1) It produces more reasonable 3D\ngeometry; the images of a scene across different viewpoints have better\nphotometric consistency and fidelity than the state-of-the-art methods. 2) The\ntraining becomes much more stable. 3) The foreground can be separately rendered\non top of different arbitrary backgrounds.\n","authors":["Minjung Shin","Yunji Seo","Jeongmin Bae","Young Sun Choi","Hyunsu Kim","Hyeran Byun","Youngjung Uh"],"pdf_url":"https://arxiv.org/pdf/2301.09091v2.pdf","comment":"ICCV 2023, Project Page: https://minjung-s.github.io/ballgan"},{"id":"http://arxiv.org/abs/2307.06857v2","updated":"2023-08-23T07:06:53Z","published":"2023-07-11T17:51:48Z","title":"Self-consistency for open-ended generations","summary":" Large Language Models (LLMs) can exhibit considerable variation in the\nquality of their sampled outputs. Reranking and selecting the best generation\nfrom the sampled set is a popular way of obtaining strong gains in generation\nquality. 
In this paper, we present a novel approach for reranking LLM\ngenerations. Unlike other techniques that might involve additional inferences\nor training a specialized reranker, our approach relies on easy to compute\npairwise statistics between the generations that have minimal compute overhead.\nWe show that our approach can be formalized as an extension of self-consistency\nand analyze its performance in that framework, theoretically as well as via\nsimulations. We show strong improvements for selecting the best $k$ generations\nfor code generation tasks as well as robust improvements for best generation\nfor the tasks of autoformalization, and summarization. While our approach only\nassumes black-box access to LLMs, we show that additional access to token\nprobabilities can improve performance even further.\n","authors":["Siddhartha Jain","Xiaofei Ma","Anoop Deoras","Bing Xiang"],"pdf_url":"https://arxiv.org/pdf/2307.06857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11958v1","updated":"2023-08-23T06:57:05Z","published":"2023-08-23T06:57:05Z","title":"Maintaining Plasticity via Regenerative Regularization","summary":" In continual learning, plasticity refers to the ability of an agent to\nquickly adapt to new information. Neural networks are known to lose plasticity\nwhen processing non-stationary data streams. In this paper, we propose L2 Init,\na very simple approach for maintaining plasticity by incorporating in the loss\nfunction L2 regularization toward initial parameters. This is very similar to\nstandard L2 regularization (L2), the only difference being that L2 regularizes\ntoward the origin. L2 Init is simple to implement and requires selecting only a\nsingle hyper-parameter. The motivation for this method is the same as that of\nmethods that reset neurons or parameter values. Intuitively, when recent losses\nare insensitive to particular parameters, these parameters drift toward their\ninitial values. This prepares parameters to adapt quickly to new tasks. On\nsimple problems representative of different types of nonstationarity in\ncontinual learning, we demonstrate that L2 Init consistently mitigates\nplasticity loss. We additionally find that our regularization term reduces\nparameter magnitudes and maintains a high effective feature rank.\n","authors":["Saurabh Kumar","Henrik Marklund","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2308.11958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11953v1","updated":"2023-08-23T06:51:22Z","published":"2023-08-23T06:51:22Z","title":"When MiniBatch SGD Meets SplitFed Learning:Convergence Analysis and\n Performance Evaluation","summary":" Federated learning (FL) enables collaborative model training across\ndistributed clients (e.g., edge devices) without sharing raw data. Yet, FL can\nbe computationally expensive as the clients need to train the entire model\nmultiple times. SplitFed learning (SFL) is a recent distributed approach that\nalleviates computation workload at the client device by splitting the model at\na cut layer into two parts, where clients only need to train part of the model.\nHowever, SFL still suffers from the \\textit{client drift} problem when clients'\ndata are highly non-IID. To address this issue, we propose MiniBatch-SFL. This\nalgorithm incorporates MiniBatch SGD into SFL, where the clients train the\nclient-side model in an FL fashion while the server trains the server-side\nmodel similar to MiniBatch SGD. 
We analyze the convergence of MiniBatch-SFL and\nshow that the bound of the expected loss can be obtained by analyzing the\nexpected server-side and client-side model updates, respectively. The\nserver-side updates do not depend on the non-IID degree of the clients'\ndatasets and can potentially mitigate client drift. However, the client-side\nmodel relies on the non-IID degree and can be optimized by properly choosing\nthe cut layer. Perhaps counter-intuitively, our empirical results show that a\nlater position of the cut layer leads to a smaller average gradient divergence\nand better algorithm performance. Moreover, numerical results show that\nMiniBatch-SFL achieves higher accuracy than conventional SFL and FL. The\naccuracy improvement can be up to 24.1\\% and 17.1\\% with highly non-IID data,\nrespectively.\n","authors":["Chao Huang","Geng Tian","Ming Tang"],"pdf_url":"https://arxiv.org/pdf/2308.11953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.08434v2","updated":"2023-08-23T06:48:52Z","published":"2022-03-16T07:12:42Z","title":"Deep Residual Error and Bag-of-Tricks Learning for Gravitational Wave\n Surrogate Modeling","summary":" Deep learning methods have been employed in gravitational-wave astronomy to\naccelerate the construction of surrogate waveforms for the inspiral of\nspin-aligned black hole binaries, among other applications. We face the\nchallenge of modeling the residual error of an artificial neural network that\nmodels the coefficients of the surrogate waveform expansion (especially those\nof the phase of the waveform), which we demonstrate has sufficient structure to\nbe learnable by a second network. Adding this second network, we were able to\nreduce the maximum mismatch for waveforms in a validation set by 13.4 times. We\nalso explored several other ideas for improving the accuracy of the surrogate\nmodel, such as the exploitation of similarities between waveforms, the\naugmentation of the training set, the dissection of the input space, using\ndedicated networks per output coefficient and output augmentation. In several\ncases, small improvements can be observed, but the most significant improvement\nstill comes from the addition of a second network that models the residual\nerror. Since the residual error for more general surrogate waveform models\n(when e.g., eccentricity is included) may also have a specific structure, one\ncan expect our method to be applicable to cases where the gain in accuracy\ncould lead to significant gains in computational time.\n","authors":["Styliani-Christina Fragkouli","Paraskevi Nousi","Nikolaos Passalis","Panagiotis Iosif","Nikolaos Stergioulas","Anastasios Tefas"],"pdf_url":"https://arxiv.org/pdf/2203.08434v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10632v2","updated":"2023-08-23T06:41:42Z","published":"2023-08-21T11:07:27Z","title":"Foundation Model-oriented Robustness: Robust Image Model Evaluation with\n Pretrained Models","summary":" Machine learning has demonstrated remarkable performance over finite\ndatasets, yet whether the scores over the fixed benchmarks can sufficiently\nindicate the model's performance in the real world is still in discussion. In\nreality, an ideal robust model will probably behave similarly to the oracle\n(e.g., the human users), thus a good evaluation protocol is probably to\nevaluate the models' behaviors in comparison to the oracle. 
In this paper, we\nintroduce a new robustness measurement that directly measures the image\nclassification model's performance compared with a surrogate oracle (i.e., a\nfoundation model). Besides, we design a simple method that can accomplish the\nevaluation beyond the scope of the benchmarks. Our method extends the image\ndatasets with new samples that are sufficiently perturbed to be distinct from\nthe ones in the original sets, but are still bounded within the same\nimage-label structure the original test image represents, constrained by a\nfoundation model pretrained with a large amount of samples. As a result, our\nnew method will offer us a new way to evaluate the models' robustness\nperformance, free of limitations of fixed benchmarks or constrained\nperturbations, although scoped by the power of the oracle. In addition to the\nevaluation results, we also leverage our generated data to understand the\nbehaviors of the model and our new evaluation strategies.\n","authors":["Peiyan Zhang","Haoyang Liu","Chaozhuo Li","Xing Xie","Sunghun Kim","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10632v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11946v1","updated":"2023-08-23T06:40:05Z","published":"2023-08-23T06:40:05Z","title":"Multi-scale Transformer Pyramid Networks for Multivariate Time Series\n Forecasting","summary":" Multivariate Time Series (MTS) forecasting involves modeling temporal\ndependencies within historical records. Transformers have demonstrated\nremarkable performance in MTS forecasting due to their capability to capture\nlong-term dependencies. However, prior work has been confined to modeling\ntemporal dependencies at either a fixed scale or multiple scales that\nexponentially increase (most with base 2). This limitation hinders their\neffectiveness in capturing diverse seasonalities, such as hourly and daily\npatterns. In this paper, we introduce a dimension invariant embedding technique\nthat captures short-term temporal dependencies and projects MTS data into a\nhigher-dimensional space, while preserving the dimensions of time steps and\nvariables in MTS data. Furthermore, we present a novel Multi-scale Transformer\nPyramid Network (MTPNet), specifically designed to effectively capture temporal\ndependencies at multiple unconstrained scales. The predictions are inferred\nfrom multi-scale latent representations obtained from transformers at various\nscales. Extensive experiments on nine benchmark datasets demonstrate that the\nproposed MTPNet outperforms recent state-of-the-art methods.\n","authors":["Yifan Zhang","Rui Wu","Sergiu M. Dascalu","Frederick C. Harris Jr"],"pdf_url":"https://arxiv.org/pdf/2308.11946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07771v2","updated":"2023-08-23T06:37:32Z","published":"2022-12-15T12:47:59Z","title":"Temporal Saliency Detection Towards Explainable Transformer-based\n Timeseries Forecasting","summary":" Despite the notable advancements in numerous Transformer-based models, the\ntask of long multi-horizon time series forecasting remains a persistent\nchallenge, especially towards explainability. 
Focusing on commonly used\nsaliency maps in explaining DNN in general, our quest is to build\nattention-based architecture that can automatically encode saliency-related\ntemporal patterns by establishing connections with appropriate attention heads.\nHence, this paper introduces Temporal Saliency Detection (TSD), an effective\napproach that builds upon the attention mechanism and applies it to\nmulti-horizon time series prediction. While our proposed architecture adheres\nto the general encoder-decoder structure, it undergoes a significant renovation\nin the encoder component, wherein we incorporate a series of information\ncontracting and expanding blocks inspired by the U-Net style architecture. The\nTSD approach facilitates the multiresolution analysis of saliency patterns by\ncondensing multi-heads, thereby progressively enhancing the forecasting of\ncomplex time series data. Empirical evaluations illustrate the superiority of\nour proposed approach compared to other models across multiple standard\nbenchmark datasets in diverse far-horizon forecasting settings. The initial TSD\nachieves substantial relative improvements of 31% and 46% over several models\nin the context of multivariate and univariate prediction. We believe the\ncomprehensive investigations presented in this study will offer valuable\ninsights and benefits to future research endeavors.\n","authors":["Nghia Duong-Trung","Duc-Manh Nguyen","Danh Le-Phuoc"],"pdf_url":"https://arxiv.org/pdf/2212.07771v2.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2308.11943v1","updated":"2023-08-23T06:32:14Z","published":"2023-08-23T06:32:14Z","title":"RamseyRL: A Framework for Intelligent Ramsey Number Counterexample\n Searching","summary":" The Ramsey number is the minimum number of nodes, $n = R(s, t)$, such that\nall undirected simple graphs of order $n$, contain a clique of order $s$, or an\nindependent set of order $t$. This paper explores the application of a best\nfirst search algorithm and reinforcement learning (RL) techniques to find\ncounterexamples to specific Ramsey numbers. We incrementally improve over prior\nsearch methods such as random search by introducing a graph vectorization and\ndeep neural network (DNN)-based heuristic, which gauge the likelihood of a\ngraph being a counterexample. The paper also proposes algorithmic optimizations\nto confine a polynomial search runtime. This paper does not aim to present new\ncounterexamples but rather introduces and evaluates a framework supporting\nRamsey counterexample exploration using other heuristics. Code and methods are\nmade available through a PyPI package and GitHub repository.\n","authors":["Steve Vott","Adam M. Lehavi"],"pdf_url":"https://arxiv.org/pdf/2308.11943v1.pdf","comment":"8 pages, 4 figures, submitted to AAAI2024"},{"id":"http://arxiv.org/abs/2308.11940v1","updated":"2023-08-23T06:21:46Z","published":"2023-08-23T06:21:46Z","title":"Audio Generation with Multiple Conditional Diffusion Model","summary":" Text-based audio generation models have limitations as they cannot encompass\nall the information in audio, leading to restricted controllability when\nrelying solely on text. To address this issue, we propose a novel model that\nenhances the controllability of existing pre-trained text-to-audio models by\nincorporating additional conditions including content (timestamp) and style\n(pitch contour and energy contour) as supplements to the text. This approach\nachieves fine-grained control over the temporal order, pitch, and energy of\ngenerated audio. 
To preserve the diversity of generation, we employ a trainable\ncontrol condition encoder that is enhanced by a large language model and a\ntrainable Fusion-Net to encode and fuse the additional conditions while keeping\nthe weights of the pre-trained text-to-audio model frozen. Due to the lack of\nsuitable datasets and evaluation metrics, we consolidate existing datasets into\na new dataset comprising the audio and corresponding conditions and use a\nseries of evaluation metrics to evaluate the controllability performance.\nExperimental results demonstrate that our model successfully achieves\nfine-grained control to accomplish controllable audio generation. Audio samples\nand our dataset are publicly available at\nhttps://conditionaudiogen.github.io/conditionaudiogen/\n","authors":["Zhifang Guo","Jianguo Mao","Rui Tao","Long Yan","Kazushige Ouchi","Hong Liu","Xiangdong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11940v1.pdf","comment":"Submitted to AAAI 2024"},{"id":"http://arxiv.org/abs/2308.11217v2","updated":"2023-08-23T06:17:21Z","published":"2023-08-22T06:05:11Z","title":"Federated Learning in Big Model Era: Domain-Specific Multimodal Large\n Models","summary":" Multimodal data, which can comprehensively perceive and recognize the\nphysical world, has become an essential path towards general artificial\nintelligence. However, multimodal large models trained on public datasets often\nunderperform in specific industrial domains. This paper proposes a multimodal\nfederated learning framework that enables multiple enterprises to utilize\nprivate domain data to collaboratively train large models for vertical domains,\nachieving intelligent services across scenarios. The authors discuss in-depth\nthe strategic transformation of federated learning in terms of intelligence\nfoundation and objectives in the era of big models, as well as the new\nchallenges faced in heterogeneous data, model aggregation, performance and cost\ntrade-off, data privacy, and incentive mechanisms. The paper elaborates on a case\nstudy of leading enterprises contributing multimodal data and expert knowledge\nto city safety operation management, including distributed deployment and\nefficient coordination of the federated learning platform, technical\ninnovations on data quality improvement based on large model capabilities and\nefficient joint fine-tuning approaches. Preliminary experiments show that\nenterprises can enhance and accumulate intelligent capabilities through\nmultimodal model federated learning, thereby jointly creating a smart city\nmodel that provides high-quality intelligent services covering energy\ninfrastructure safety, residential community security, and urban operation\nmanagement. 
The established federated learning cooperation ecosystem is\nexpected to further aggregate industry, academia, and research resources,\nrealize large models in multiple vertical domains, and promote the large-scale\nindustrial application of artificial intelligence and cutting-edge research on\nmultimodal federated learning.\n","authors":["Zengxiang Li","Zhaoxiang Hou","Hui Liu","Ying Wang","Tongzhi Li","Longfei Xie","Chao Shi","Chengyi Yang","Weishan Zhang","Zelei Liu"],"pdf_url":"https://arxiv.org/pdf/2308.11217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11939v1","updated":"2023-08-23T06:14:02Z","published":"2023-08-23T06:14:02Z","title":"Retail Demand Forecasting: A Comparative Study for Multivariate Time\n Series","summary":" Accurate demand forecasting in the retail industry is a critical determinant\nof financial performance and supply chain efficiency. As global markets become\nincreasingly interconnected, businesses are turning towards advanced prediction\nmodels to gain a competitive edge. However, existing literature mostly focuses\non historical sales data and ignores the vital influence of macroeconomic\nconditions on consumer spending behavior. In this study, we bridge this gap by\nenriching time series data of customer demand with macroeconomic variables,\nsuch as the Consumer Price Index (CPI), Index of Consumer Sentiment (ICS), and\nunemployment rates. Leveraging this comprehensive dataset, we develop and\ncompare various regression and machine learning models to predict retail demand\naccurately.\n","authors":["Md Sabbirul Haque","Md Shahedul Amin","Jonayet Miah"],"pdf_url":"https://arxiv.org/pdf/2308.11939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11933v1","updated":"2023-08-23T05:53:13Z","published":"2023-08-23T05:53:13Z","title":"System Identification for Continuous-time Linear Dynamical Systems","summary":" The problem of system identification for the Kalman filter, relying on the\nexpectation-maximization (EM) procedure to learn the underlying parameters of a\ndynamical system, has largely been studied assuming that observations are\nsampled at equally-spaced time points. However, in many applications this is a\nrestrictive and unrealistic assumption. This paper addresses system\nidentification for the continuous-discrete filter, with the aim of generalizing\nlearning for the Kalman filter by relying on a solution to a continuous-time\nIt\\^o stochastic differential equation (SDE) for the latent state and\ncovariance dynamics. We introduce a novel two-filter, analytical form for the\nposterior with a Bayesian derivation, which yields analytical updates which do\nnot require the forward-pass to be pre-computed. Using this analytical and\nefficient computation of the posterior, we provide an EM procedure which\nestimates the parameters of the SDE, naturally incorporating irregularly\nsampled measurements. Generalizing the learning of latent linear dynamical\nsystems (LDS) to continuous-time may extend the use of the hybrid Kalman filter\nto data which is not regularly sampled or has intermittent missing values, and\ncan extend the power of non-linear system identification methods such as\nswitching LDS (SLDS), which rely on EM for the linear discrete-time Kalman\nfilter as a sub-unit for learning locally linearized behavior of a non-linear\nsystem. 
We apply the method by learning the parameters of a latent,\nmultivariate Fokker-Planck SDE representing a toggle-switch genetic circuit\nusing biologically realistic parameters, and compare the efficacy of learning\nrelative to the discrete-time Kalman filter as the step-size irregularity and\nspectral-radius of the dynamics-matrix increases.\n","authors":["Peter Halmos","Jonathan Pillow","David A. Knowles"],"pdf_url":"https://arxiv.org/pdf/2308.11933v1.pdf","comment":"32 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.11929v1","updated":"2023-08-23T05:33:03Z","published":"2023-08-23T05:33:03Z","title":"Dynamic landslide susceptibility mapping over recent three decades to\n uncover variations in landslide causes in subtropical urban mountainous areas","summary":" Landslide susceptibility assessment (LSA) is of paramount importance in\nmitigating landslide risks. Recently, there has been a surge in the utilization\nof data-driven methods for predicting landslide susceptibility due to the\ngrowing availability of aerial and satellite data. Nonetheless, the rapid\noscillations within the landslide-inducing environment (LIE), primarily due to\nsignificant changes in external triggers such as rainfall, pose difficulties\nfor contemporary data-driven LSA methodologies to accommodate LIEs over diverse\ntimespans. This study presents dynamic landslide susceptibility mapping that\nsimply employs multiple predictive models for annual LSA. In practice, this\nwill inevitably encounter small sample problems due to the limited number of\nlandslide samples in certain years. Another concern arises owing to the\nmajority of the existing LSA approaches train black-box models to fit distinct\ndatasets, yet often failing in generalization and providing comprehensive\nexplanations concerning the interactions between input features and\npredictions. Accordingly, we proposed to meta-learn representations with fast\nadaptation ability using a few samples and gradient updates; and apply SHAP for\neach model interpretation and landslide feature permutation. Additionally, we\napplied MT-InSAR for LSA result enhancement and validation. The chosen study\narea is Lantau Island, Hong Kong, where we conducted a comprehensive dynamic\nLSA spanning from 1992 to 2019. The model interpretation results demonstrate\nthat the primary factors responsible for triggering landslides in Lantau Island\nare terrain slope and extreme rainfall. The results also indicate that the\nvariation in landslide causes can be primarily attributed to extreme rainfall\nevents, which result from global climate change, and the implementation of the\nLandslip Prevention and Mitigation Programme (LPMitP) by the Hong Kong\ngovernment.\n","authors":["Peifeng Ma","Li Chen","Chang Yu","Qing Zhu","Yulin Ding"],"pdf_url":"https://arxiv.org/pdf/2308.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05237v3","updated":"2023-08-23T05:25:09Z","published":"2023-05-09T07:56:26Z","title":"Traffic Forecasting on New Roads Unseen in the Training Data Using\n Spatial Contrastive Pre-Training","summary":" New roads are being constructed all the time. However, the capabilities of\nprevious deep forecasting models to generalize to new roads not seen in the\ntraining data (unseen roads) are rarely explored. In this paper, we introduce a\nnovel setup called a spatio-temporal (ST) split to evaluate the models'\ncapabilities to generalize to unseen roads. 
In this setup, the models are\ntrained on data from a sample of roads, but tested on roads not seen in the\ntraining data. Moreover, we also present a novel framework called Spatial\nContrastive Pre-Training (SCPT) where we introduce a spatial encoder module to\nextract latent features from unseen roads during inference time. This spatial\nencoder is pre-trained using contrastive learning. During inference, the\nspatial encoder only requires two days of traffic data on the new roads and\ndoes not require any re-training. We also show that the output from the spatial\nencoder can be used effectively to infer latent node embeddings on unseen roads\nduring inference time. The SCPT framework also incorporates a new layer, named\nthe spatially gated addition (SGA) layer, to effectively combine the latent\nfeatures from the output of the spatial encoder to existing backbones.\nAdditionally, since there is limited data on the unseen roads, we argue that it\nis better to decouple traffic signals to trivial-to-capture periodic signals\nand difficult-to-capture Markovian signals, and for the spatial encoder to only\nlearn the Markovian signals. Finally, we empirically evaluated SCPT using the\nST split setup on four real-world datasets. The results showed that adding SCPT\nto a backbone consistently improves forecasting performance on unseen roads.\nMore importantly, the improvements are greater when forecasting further into\nthe future. The codes are available on GitHub:\nhttps://github.com/cruiseresearchgroup/forecasting-on-new-roads .\n","authors":["Arian Prabowo","Wei Shao","Hao Xue","Piotr Koniusz","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2305.05237v3.pdf","comment":"25 pages including reference, an additional 3 pages of appendix, 8\n figures. ECML PKDD 2023 Journal track special issue: Data Mining and\n Knowledge Discovery (DAMI)"},{"id":"http://arxiv.org/abs/2308.11925v1","updated":"2023-08-23T05:18:19Z","published":"2023-08-23T05:18:19Z","title":"Solving Elliptic Optimal Control Problems using Physics Informed Neural\n Networks","summary":" In this work, we present and analyze a numerical solver for optimal control\nproblems (without / with box constraint) for linear and semilinear second-order\nelliptic problems. The approach is based on a coupled system derived from the\nfirst-order optimality system of the optimal control problem, and applies\nphysics informed neural networks (PINNs) to solve the coupled system. We\npresent an error analysis of the numerical scheme, and provide $L^2(\\Omega)$\nerror bounds on the state, control and adjoint state in terms of deep neural\nnetwork parameters (e.g., depth, width, and parameter bounds) and the number of\nsampling points in the domain and on the boundary. The main tools in the\nanalysis include offset Rademacher complexity and boundedness and Lipschitz\ncontinuity of neural network functions. We present several numerical examples\nto illustrate the approach and compare it with three existing approaches.\n","authors":["Bangti Jin","Ramesh Sau","Luowei Yin","Zhi Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11925v1.pdf","comment":"28 pages, 5 figures"},{"id":"http://arxiv.org/abs/2205.03977v3","updated":"2023-08-23T05:18:04Z","published":"2022-05-08T23:58:40Z","title":"A Structured Span Selector","summary":" Many natural language processing tasks, e.g., coreference resolution and\nsemantic role labeling, require selecting text spans and making decisions about\nthem. 
A typical approach to such tasks is to score all possible spans and\ngreedily select spans for task-specific downstream processing. This approach,\nhowever, does not incorporate any inductive bias about what sort of spans ought\nto be selected, e.g., that selected spans tend to be syntactic constituents. In\nthis paper, we propose a novel grammar-based structured span selection model\nwhich learns to make use of the partial span-level annotation provided for such\nproblems. Compared to previous approaches, our approach gets rid of the\nheuristic greedy span selection scheme, allowing us to model the downstream\ntask on an optimal set of spans. We evaluate our model on two popular span\nprediction tasks: coreference resolution and semantic role labeling. We show\nempirical improvements on both.\n","authors":["Tianyu Liu","Yuchen Eleanor Jiang","Ryan Cotterell","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2205.03977v3.pdf","comment":"NAACL 2022 camera-ready"},{"id":"http://arxiv.org/abs/2308.11924v1","updated":"2023-08-23T05:17:51Z","published":"2023-08-23T05:17:51Z","title":"Diverse Policies Converge in Reward-free Markov Decision Processe","summary":" Reinforcement learning has achieved great success in many decision-making\ntasks, and traditional reinforcement learning algorithms are mainly designed\nfor obtaining a single optimal solution. However, recent works show the\nimportance of developing diverse policies, which makes it an emerging research\ntopic. Despite the variety of diversity reinforcement learning algorithms that\nhave emerged, none of them theoretically answer the question of how the\nalgorithm converges and how efficient the algorithm is. In this paper, we\nprovide a unified diversity reinforcement learning framework and investigate\nthe convergence of training diverse policies. Under such a framework, we also\npropose a provably efficient diversity reinforcement learning algorithm.\nFinally, we verify the effectiveness of our method through numerical\nexperiments.\n","authors":["Fanqi Lin","Shiyu Huang","Weiwei Tu"],"pdf_url":"https://arxiv.org/pdf/2308.11924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11923v1","updated":"2023-08-23T05:13:25Z","published":"2023-08-23T05:13:25Z","title":"Audio Difference Captioning Utilizing Similarity-Discrepancy\n Disentanglement","summary":" We proposed Audio Difference Captioning (ADC) as a new extension task of\naudio captioning for describing the semantic differences between input pairs of\nsimilar but slightly different audio clips. The ADC solves the problem that\nconventional audio captioning sometimes generates similar captions for similar\naudio clips, failing to describe the difference in content. We also propose a\ncross-attention-concentrated transformer encoder to extract differences by\ncomparing a pair of audio clips and a similarity-discrepancy disentanglement to\nemphasize the difference in the latent space. 
To evaluate the proposed methods,\nwe built an AudioDiffCaps dataset consisting of pairs of similar but slightly\ndifferent audio clips with human-annotated descriptions of their differences.\nThe experiment with the AudioDiffCaps dataset showed that the proposed methods\nsolve the ADC task effectively and improve the attention weights to extract the\ndifference by visualizing them in the transformer encoder.\n","authors":["Daiki Takeuchi","Yasunori Ohishi","Daisuke Niizumi","Noboru Harada","Kunio Kashino"],"pdf_url":"https://arxiv.org/pdf/2308.11923v1.pdf","comment":"Accepted to DCASE2023 Workshop"},{"id":"http://arxiv.org/abs/2308.11912v1","updated":"2023-08-23T04:57:21Z","published":"2023-08-23T04:57:21Z","title":"Addressing Selection Bias in Computerized Adaptive Testing: A User-Wise\n Aggregate Influence Function Approach","summary":" Computerized Adaptive Testing (CAT) is a widely used, efficient test mode\nthat adapts to the examinee's proficiency level in the test domain. CAT\nrequires pre-trained item profiles, for CAT iteratively assesses the student\nreal-time based on the registered items' profiles, and selects the next item to\nadminister using candidate items' profiles. However, obtaining such item\nprofiles is a costly process that involves gathering a large, dense\nitem-response data, then training a diagnostic model on the collected data. In\nthis paper, we explore the possibility of leveraging response data collected in\nthe CAT service. We first show that this poses a unique challenge due to the\ninherent selection bias introduced by CAT, i.e., more proficient students will\nreceive harder questions. Indeed, when naively training the diagnostic model\nusing CAT response data, we observe that item profiles deviate significantly\nfrom the ground-truth. To tackle the selection bias issue, we propose the\nuser-wise aggregate influence function method. Our intuition is to filter out\nusers whose response data is heavily biased in an aggregate manner, as judged\nby how much perturbation the added data will introduce during parameter\nestimation. This way, we may enhance the performance of CAT while introducing\nminimal bias to the item profiles. We provide extensive experiments to\ndemonstrate the superiority of our proposed method based on the three public\ndatasets and one dataset that contains real-world CAT response data.\n","authors":["Soonwoo Kwon","Sojung Kim","Seunghyun Lee","Jin-Young Kim","Suyeong An","Kyuseok Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11912v1.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2302.05601v3","updated":"2023-08-23T04:55:20Z","published":"2023-02-11T04:52:20Z","title":"Pruning Deep Neural Networks from a Sparsity Perspective","summary":" In recent years, deep network pruning has attracted significant attention in\norder to enable the rapid deployment of AI into small devices with computation\nand memory constraints. Pruning is often achieved by dropping redundant\nweights, neurons, or layers of a deep network while attempting to retain a\ncomparable test performance. Many deep pruning algorithms have been proposed\nwith impressive empirical success. However, existing approaches lack a\nquantifiable measure to estimate the compressibility of a sub-network during\neach pruning iteration and thus may under-prune or over-prune the model. In\nthis work, we propose PQ Index (PQI) to measure the potential compressibility\nof deep neural networks and use this to develop a Sparsity-informed Adaptive\nPruning (SAP) algorithm. 
Our extensive experiments corroborate the hypothesis\nthat for a generic pruning procedure, PQI decreases first when a large model is\nbeing effectively regularized and then increases when its compressibility\nreaches a limit that appears to correspond to the beginning of underfitting.\nSubsequently, PQI decreases again when the model collapse and significant\ndeterioration in the performance of the model start to occur. Additionally, our\nexperiments demonstrate that the proposed adaptive pruning algorithm with\nproper choice of hyper-parameters is superior to the iterative pruning\nalgorithms such as the lottery ticket-based pruning methods, in terms of both\ncompression efficiency and robustness.\n","authors":["Enmao Diao","Ganghua Wang","Jiawei Zhan","Yuhong Yang","Jie Ding","Vahid Tarokh"],"pdf_url":"https://arxiv.org/pdf/2302.05601v3.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2201.06714v3","updated":"2023-08-23T04:54:07Z","published":"2022-01-18T03:13:19Z","title":"AdaTerm: Adaptive T-Distribution Estimated Robust Moments for\n Noise-Robust Stochastic Gradient Optimization","summary":" With the increasing practicality of deep learning applications, practitioners\nare inevitably faced with datasets corrupted by noise from various sources such\nas measurement errors, mislabeling, and estimated surrogate inputs/outputs that\ncan adversely impact the optimization results. It is a common practice to\nimprove the optimization algorithm's robustness to noise, since this algorithm\nis ultimately in charge of updating the network parameters. Previous studies\nrevealed that the first-order moment used in Adam-like stochastic gradient\ndescent optimizers can be modified based on the Student's t-distribution. While\nthis modification led to noise-resistant updates, the other associated\nstatistics remained unchanged, resulting in inconsistencies in the assumed\nmodels. In this paper, we propose AdaTerm, a novel approach that incorporates\nthe Student's t-distribution to derive not only the first-order moment but also\nall the associated statistics. This provides a unified treatment of the\noptimization process, offering a comprehensive framework under the statistical\nmodel of the t-distribution for the first time. The proposed approach offers\nseveral advantages over previously proposed approaches, including reduced\nhyperparameters and improved robustness and adaptability. This noise-adaptive\nbehavior contributes to AdaTerm's exceptional learning performance, as\ndemonstrated through various optimization problems with different and/or\nunknown noise ratios. Furthermore, we introduce a new technique for deriving a\ntheoretical regret bound without relying on AMSGrad, providing a valuable\ncontribution to the field\n","authors":["Wendyam Eric Lionel Ilboudo","Taisuke Kobayashi","Takamitsu Matsubara"],"pdf_url":"https://arxiv.org/pdf/2201.06714v3.pdf","comment":"27 pages; Final version accepted by Elsevier Neurocomputing Journal\n (2023-08; https://doi.org/10.1016/j.neucom.2023.126692)"},{"id":"http://arxiv.org/abs/2308.11905v1","updated":"2023-08-23T04:14:45Z","published":"2023-08-23T04:14:45Z","title":"Utilizing Admissible Bounds for Heuristic Learning","summary":" While learning a heuristic function for forward search algorithms with modern\nmachine learning techniques has been gaining interest in recent years, there\nhas been little theoretical understanding of \\emph{what} they should learn,\n\\emph{how} to train them, and \\emph{why} we do so. 
This lack of understanding\nleads to various literature performing an ad-hoc selection of datasets\n(suboptimal vs optimal costs or admissible vs inadmissible heuristics) and\noptimization metrics (e.g., squared vs absolute errors). Moreover, due to the\nlack of admissibility of the resulting trained heuristics, little focus has\nbeen put on the role of admissibility \\emph{during} learning. This paper\narticulates the role of admissible heuristics in supervised heuristic learning,\nusing them as parameters of Truncated Gaussian distributions, which tightens\nthe hypothesis space compared to ordinary Gaussian distributions. We argue that\nthis mathematical model faithfully follows the principle of maximum entropy and\nempirically show that, as a result, it yields more accurate heuristics and\nconverges faster during training.\n","authors":["Carlos Núñez-Molina","Masataro Asai"],"pdf_url":"https://arxiv.org/pdf/2308.11905v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.11903v1","updated":"2023-08-23T04:08:53Z","published":"2023-08-23T04:08:53Z","title":"Rethinking Data Perturbation and Model Stabilization for Semi-supervised\n Medical Image Segmentation","summary":" Studies on semi-supervised medical image segmentation (SSMIS) have seen fast\nprogress recently. Due to the limited labelled data, SSMIS methods mainly focus\non effectively leveraging unlabeled data to enhance the segmentation\nperformance. However, despite their promising performance, current\nstate-of-the-art methods often prioritize integrating complex techniques and\nloss terms rather than addressing the core challenges of semi-supervised\nscenarios directly. We argue that the key to SSMIS lies in generating\nsubstantial and appropriate prediction disagreement on unlabeled data. To this\nend, we emphasize the cruciality of data perturbation and model stabilization\nin semi-supervised segmentation, and propose a simple yet effective approach to\nboost SSMIS performance significantly, dubbed DPMS. Specifically, we first\nrevisit SSMIS from three distinct perspectives: the data, the model, and the\nloss, and conduct a comprehensive study of corresponding strategies to examine\ntheir effectiveness. Based on these examinations, we then propose DPMS, which\nadopts a plain teacher-student framework with a standard supervised loss and\nunsupervised consistency loss. To produce appropriate prediction disagreements,\nDPMS perturbs the unlabeled data via strong augmentations to enlarge prediction\ndisagreements considerably. On the other hand, using an EMA teacher when strong\naugmentation is applied does not necessarily improve performance. DPMS further\nutilizes forwarding-twice and momentum updating strategies for normalization\nstatistics to stabilize the training on unlabeled data effectively. Despite its\nsimplicity, DPMS can obtain new state-of-the-art performance on the public 2D\nACDC and 3D LA datasets across various semi-supervised settings, e.g. 
obtaining\na remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels.\n","authors":["Zhen Zhao","Ye Liu","Meng Zhao","Di Yin","Yixuan Yuan","Luping Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.11903v1.pdf","comment":"Code and logs are available at https://github.com/ZhenZHAO/DPMS"},{"id":"http://arxiv.org/abs/2307.00252v2","updated":"2023-08-23T03:59:48Z","published":"2023-07-01T07:17:33Z","title":"An ML approach to resolution of singularities","summary":" The solution set of a system of polynomial equations typically contains\nill-behaved, singular points. Resolution is a fundamental process in geometry\nin which we replace singular points with smooth points, while keeping the rest\nof the solution set unchanged. Resolutions are not unique: the usual way to\ndescribe them involves repeatedly performing a fundamental operation known as\n\"blowing-up\", and the complexity of the resolution highly depends on certain\nchoices. The process can be translated into various versions of a 2-player\ngame, the so-called Hironaka game, and a winning strategy for the first player\nprovides a solution to the resolution problem. In this paper we introduce a new\napproach to the Hironaka game that uses reinforcement learning agents to find\noptimal resolutions of singularities. In certain domains, the trained model\noutperforms state-of-the-art selection heuristics in total number of polynomial\nadditions performed, which provides a proof-of-concept that recent developments\nin machine learning have the potential to improve performance of algorithms in\nsymbolic computation.\n","authors":["Gergely Bérczi","Honglu Fan","Mingcong Zeng"],"pdf_url":"https://arxiv.org/pdf/2307.00252v2.pdf","comment":"To appear in Proceedings of the 40th International Conference on\n Machine Learning TAG Workshop (ICML-TAG 2023)"},{"id":"http://arxiv.org/abs/2307.16680v4","updated":"2023-08-23T03:28:30Z","published":"2023-07-31T13:57:05Z","title":"On the Trustworthiness Landscape of State-of-the-art Generative Models:\n A Comprehensive Survey","summary":" Diffusion models and large language models have emerged as leading-edge\ngenerative models and have sparked a revolutionary impact on various aspects of\nhuman life. However, the practical implementation of these models has also\nexposed inherent risks, highlighting their dual nature and raising concerns\nregarding their trustworthiness. Despite the abundance of literature on this\nsubject, a comprehensive survey specifically delving into the intersection of\nlarge-scale generative models and their trustworthiness remains largely absent.\nTo bridge this gap, This paper investigates both the long-standing and emerging\nthreats associated with these models across four fundamental dimensions:\nprivacy, security, fairness, and responsibility. In this way, we construct an\nextensive map outlining the trustworthiness of these models, while also\nproviding practical recommendations and identifying future directions. 
These\nefforts are crucial for promoting the trustworthy deployment of these models,\nultimately benefiting society as a whole.\n","authors":["Mingyuan Fan","Cen Chen","Chengyu Wang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2307.16680v4.pdf","comment":"Draft Version"},{"id":"http://arxiv.org/abs/2308.11890v1","updated":"2023-08-23T03:23:07Z","published":"2023-08-23T03:23:07Z","title":"Shape-conditioned 3D Molecule Generation via Equivariant Diffusion\n Models","summary":" Ligand-based drug design aims to identify novel drug candidates of similar\nshapes with known active molecules. In this paper, we formulated an in silico\nshape-conditioned molecule generation problem to generate 3D molecule\nstructures conditioned on the shape of a given molecule. To address this\nproblem, we developed a translation- and rotation-equivariant shape-guided\ngenerative model ShapeMol. ShapeMol consists of an equivariant shape encoder\nthat maps molecular surface shapes into latent embeddings, and an equivariant\ndiffusion model that generates 3D molecules based on these embeddings.\nExperimental results show that ShapeMol can generate novel, diverse, drug-like\nmolecules that retain 3D molecular shapes similar to the given shape condition.\nThese results demonstrate the potential of ShapeMol in designing drug\ncandidates of desired 3D shapes binding to protein target pockets.\n","authors":["Ziqi Chen","Bo Peng","Srinivasan Parthasarathy","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2308.11890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11881v1","updated":"2023-08-23T02:58:02Z","published":"2023-08-23T02:58:02Z","title":"Adversarial Training Using Feedback Loops","summary":" Deep neural networks (DNN) have found wide applicability in numerous fields\ndue to their ability to accurately learn very complex input-output relations.\nDespite their accuracy and extensive use, DNNs are highly susceptible to\nadversarial attacks due to limited generalizability. For future progress in the\nfield, it is essential to build DNNs that are robust to any kind of\nperturbations to the data points. In the past, many techniques have been\nproposed to robustify DNNs using first-order derivative information of the\nnetwork.\n This paper proposes a new robustification approach based on control theory. A\nneural network architecture that incorporates feedback control, named Feedback\nNeural Networks, is proposed. The controller is itself a neural network, which\nis trained using regular and adversarial data such as to stabilize the system\noutputs. The novel adversarial training approach based on the feedback control\narchitecture is called Feedback Looped Adversarial Training (FLAT). Numerical\nresults on standard test problems empirically show that our FLAT method is more\neffective than the state-of-the-art to guard against adversarial attacks.\n","authors":["Ali Haisam Muhammad Rafid","Adrian Sandu"],"pdf_url":"https://arxiv.org/pdf/2308.11881v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11880v1","updated":"2023-08-23T02:57:58Z","published":"2023-08-23T02:57:58Z","title":"SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal\n Targets","summary":" Scene understanding using multi-modal data is necessary in many applications,\ne.g., autonomous navigation. To achieve this in a variety of situations,\nexisting models must be able to adapt to shifting data distributions without\narduous data annotation. 
Current approaches assume that the source data is\navailable during adaptation and that the source consists of paired multi-modal\ndata. Both these assumptions may be problematic for many applications. Source\ndata may not be available due to privacy, security, or economic concerns.\nAssuming the existence of paired multi-modal data for training also entails\nsignificant data collection costs and fails to take advantage of widely\navailable freely distributed pre-trained uni-modal models. In this work, we\nrelax both of these assumptions by addressing the problem of adapting a set of\nmodels trained independently on uni-modal data to a target domain consisting of\nunlabeled multi-modal data, without having access to the original source\ndataset. Our proposed approach solves this problem through a switching\nframework which automatically chooses between two complementary methods of\ncross-modal pseudo-label fusion -- agreement filtering and entropy weighting --\nbased on the estimated domain gap. We demonstrate our work on the semantic\nsegmentation problem. Experiments across seven challenging adaptation scenarios\nverify the efficacy of our approach, achieving results comparable to, and in\nsome cases outperforming, methods which assume access to source data. Our\nmethod achieves an improvement in mIoU of up to 12% over competing baselines.\nOur code is publicly available at https://github.com/csimo005/SUMMIT.\n","authors":["Cody Simons","Dripta S. Raychaudhuri","Sk Miraj Ahmed","Suya You","Konstantinos Karydis","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2308.11880v1.pdf","comment":"12 pages, 5 figures, 9 tables, ICCV 2023"},{"id":"http://arxiv.org/abs/2308.11878v1","updated":"2023-08-23T02:49:35Z","published":"2023-08-23T02:49:35Z","title":"Cabrita: closing the gap for foreign languages","summary":" The strategy of training the model from scratch in a specific language or\ndomain serves two essential purposes: i) enhancing performance in the\nparticular linguistic or domain context, and ii) ensuring effective\ntokenization. The main limitation inherent to this approach lies in the\nassociated cost, which can reach six to seven-digit dollar values, depending on\nthe model size and the number of parameters involved.\n The main solution to overcome the cost challenge is to rely on available\npre-trained models, which, despite recent advancements such as the LLaMA and\nLLaMA-2 models, still demonstrate inefficiency for certain specific domain\nproblems or prove ineffective in scenarios involving conversational memory\nresources, given the large number of tokens required to represent text.\n To overcome this issue, we present a methodology named Cabrita, which, as our\nresearch demonstrates, successfully addresses the performance and efficient\ntokenization problem, all at an affordable cost. We believe that this\nmethodology can be applied to any transformer-like architecture model. To\nvalidate the study, we conducted continuous pre-training exclusively using\nPortuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in\na model named openCabrita 3B. The openCabrita 3B also features a new tokenizer\nthat results in a significant reduction in the number of tokens required to\nrepresent the text. 
In our assessment, for few-shot learning tasks, we achieved\nsimilar results with this 3B model compared to a traditional continuous\npre-training approach as well as to 7B models English pre-trained models.\n","authors":["Celio Larcher","Marcos Piau","Paulo Finardi","Pedro Gengo","Piero Esposito","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2308.11878v1.pdf","comment":"9 pages, 1 figure"},{"id":"http://arxiv.org/abs/2308.11873v1","updated":"2023-08-23T02:36:19Z","published":"2023-08-23T02:36:19Z","title":"Integrating Large Language Models into the Debugging C Compiler for\n generating contextual error explanations","summary":" This paper introduces a method for Large Language Models (LLM) to produce\nenhanced compiler error explanations, in simple language, within our Debugging\nC Compiler (DCC). It is well documented that compiler error messages have been\nknown to present a barrier for novices learning how to program. Although our\ninitial use of DCC in introductory programming (CS1) has been instrumental in\nteaching C to novice programmers by providing safeguards to commonly occurring\nerrors and translating the usually cryptic compiler error messages at both\ncompile- and run-time, we proposed that incorporating LLM-generated\nexplanations would further enhance the learning experience for novice\nprogrammers. Through an expert evaluation, we observed that LLM-generated\nexplanations for compiler errors were conceptually accurate in 90% of\ncompile-time errors, and 75% of run-time errors. Additionally, the new DCC-help\ntool has been increasingly adopted by students, with an average of 1047 unique\nruns per week, demonstrating a promising initial assessment of using LLMs to\ncomplement compiler output to enhance programming education for beginners. We\nrelease our tool as open-source to the community.\n","authors":["Andrew Taylor","Alexandra Vassar","Jake Renzella","Hammond Pearce"],"pdf_url":"https://arxiv.org/pdf/2308.11873v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2207.03678v2","updated":"2023-08-23T02:22:37Z","published":"2022-07-08T03:54:52Z","title":"Stability of Aggregation Graph Neural Networks","summary":" In this paper we study the stability properties of aggregation graph neural\nnetworks (Agg-GNNs) considering perturbations of the underlying graph. An\nAgg-GNN is a hybrid architecture where information is defined on the nodes of a\ngraph, but it is processed block-wise by Euclidean CNNs on the nodes after\nseveral diffusions on the graph shift operator. We derive stability bounds for\nthe mapping operator associated to a generic Agg-GNN, and we specify conditions\nunder which such operators can be stable to deformations. We prove that the\nstability bounds are defined by the properties of the filters in the first\nlayer of the CNN that acts on each node. Additionally, we show that there is a\nclose relationship between the number of aggregations, the filter's\nselectivity, and the size of the stability constants. We also conclude that in\nAgg-GNNs the selectivity of the mapping operators is tied to the properties of\nthe filters only in the first layer of the CNN stage. 
This shows a substantial\ndifference with respect to the stability properties of selection GNNs, where\nthe selectivity of the filters in all layers is constrained by their stability.\nWe provide numerical evidence corroborating the results derived, testing the\nbehavior of Agg-GNNs in real life application scenarios considering\nperturbations of different magnitude.\n","authors":["Alejandro Parada-Mayorga","Zhiyang Wang","Fernando Gama","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2207.03678v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19301v2","updated":"2023-08-23T02:18:51Z","published":"2023-05-30T14:24:40Z","title":"On the Choice of Perception Loss Function for Learned Video Compression","summary":" We study causal, low-latency, sequential video compression when the output is\nsubjected to both a mean squared-error (MSE) distortion loss as well as a\nperception loss to target realism. Motivated by prior approaches, we consider\ntwo different perception loss functions (PLFs). The first, PLF-JD, considers\nthe joint distribution (JD) of all the video frames up to the current one,\nwhile the second metric, PLF-FMD, considers the framewise marginal\ndistributions (FMD) between the source and reconstruction. Using information\ntheoretic analysis and deep-learning based experiments, we demonstrate that the\nchoice of PLF can have a significant effect on the reconstruction, especially\nat low-bit rates. In particular, while the reconstruction based on PLF-JD can\nbetter preserve the temporal correlation across frames, it also imposes a\nsignificant penalty in distortion compared to PLF-FMD and further makes it more\ndifficult to recover from errors made in the earlier output frames. Although\nthe choice of PLF decisively affects reconstruction quality, we also\ndemonstrate that it may not be essential to commit to a particular PLF during\nencoding and the choice of PLF can be delegated to the decoder. In particular,\nencoded representations generated by training a system to minimize the MSE\n(without requiring either PLF) can be {\\em near universal} and can generate\nclose to optimal reconstructions for either choice of PLF at the decoder. We\nvalidate our results using (one-shot) information-theoretic analysis, detailed\nstudy of the rate-distortion-perception tradeoff of the Gauss-Markov source\nmodel as well as deep-learning based experiments on moving MNIST and KTH\ndatasets.\n","authors":["Sadaf Salehkalaibar","Buu Phan","Jun Chen","Wei Yu","Ashish Khisti"],"pdf_url":"https://arxiv.org/pdf/2305.19301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06980v3","updated":"2023-08-23T02:11:53Z","published":"2023-03-13T10:30:02Z","title":"Self-supervised learning based general laboratory progress pretrained\n model for cardiovascular event detection","summary":" The inherent nature of patient data poses several challenges. Prevalent cases\namass substantial longitudinal data owing to their patient volume and\nconsistent follow-ups, however, longitudinal laboratory data are renowned for\ntheir irregularity, temporality, absenteeism, and sparsity; In contrast,\nrecruitment for rare or specific cases is often constrained due to their\nlimited patient size and episodic observations. 
This study employed\nself-supervised learning (SSL) to pretrain a generalized laboratory progress\n(GLP) model that captures the overall progression of six common laboratory\nmarkers in prevalent cardiovascular cases, with the intention of transferring\nthis knowledge to aid in the detection of specific cardiovascular event. GLP\nimplemented a two-stage training approach, leveraging the information embedded\nwithin interpolated data and amplify the performance of SSL. After GLP\npretraining, it is transferred for TVR detection. The proposed two-stage\ntraining improved the performance of pure SSL, and the transferability of GLP\nexhibited distinctiveness. After GLP processing, the classification exhibited a\nnotable enhancement, with averaged accuracy rising from 0.63 to 0.90. All\nevaluated metrics demonstrated substantial superiority (p < 0.01) compared to\nprior GLP processing. Our study effectively engages in translational\nengineering by transferring patient progression of cardiovascular laboratory\nparameters from one patient group to another, transcending the limitations of\ndata availability. The transferability of disease progression optimized the\nstrategies of examinations and treatments, and improves patient prognosis while\nusing commonly available laboratory parameters. The potential for expanding\nthis approach to encompass other diseases holds great promise.\n","authors":["Li-Chin Chen","Kuo-Hsuan Hung","Yi-Ju Tseng","Hsin-Yao Wang","Tse-Min Lu","Wei-Chieh Huang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2303.06980v3.pdf","comment":"published in IEEE Journal of Translational Engineering in Health &\n Medicine"},{"id":"http://arxiv.org/abs/2212.08171v2","updated":"2023-08-23T01:52:33Z","published":"2022-12-15T22:11:34Z","title":"Graphon Pooling for Reducing Dimensionality of Signals and Convolutional\n Operators on Graphs","summary":" In this paper we propose a pooling approach for convolutional information\nprocessing on graphs relying on the theory of graphons and limits of dense\ngraph sequences. We present three methods that exploit the induced graphon\nrepresentation of graphs and graph signals on partitions of [0, 1]2 in the\ngraphon space. As a result we derive low dimensional representations of the\nconvolutional operators, while a dimensionality reduction of the signals is\nachieved by simple local interpolation of functions in L2([0, 1]). We prove\nthat those low dimensional representations constitute a convergent sequence of\ngraphs and graph signals, respectively. The methods proposed and the\ntheoretical guarantees that we provide show that the reduced graphs and signals\ninherit spectral-structural properties of the original quantities. We evaluate\nour approach with a set of numerical experiments performed on graph neural\nnetworks (GNNs) that rely on graphon pooling. We observe that graphon pooling\nperforms significantly better than other approaches proposed in the literature\nwhen dimensionality reduction ratios between layers are large. 
We also observe\nthat when graphon pooling is used we have, in general, less overfitting and\nlower computational cost.\n","authors":["Alejandro Parada-Mayorga","Zhiyang Wang","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2212.08171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11863v1","updated":"2023-08-23T01:44:28Z","published":"2023-08-23T01:44:28Z","title":"KinSPEAK: Improving speech recognition for Kinyarwanda via\n semi-supervised learning methods","summary":" Despite recent availability of large transcribed Kinyarwanda speech data,\nachieving robust speech recognition for Kinyarwanda is still challenging. In\nthis work, we show that using self-supervised pre-training, following a simple\ncurriculum schedule during fine-tuning and using semi-supervised learning to\nleverage large unlabelled speech data significantly improve speech recognition\nperformance for Kinyarwanda. Our approach focuses on using public domain data\nonly. A new studio-quality speech dataset is collected from a public website,\nthen used to train a clean baseline model. The clean baseline model is then\nused to rank examples from a more diverse and noisy public dataset, defining a\nsimple curriculum training schedule. Finally, we apply semi-supervised learning\nto label and learn from large unlabelled data in four successive generations.\nOur final model achieves 3.2% word error rate (WER) on the new dataset and\n15.9% WER on Mozilla Common Voice benchmark, which is state-of-the-art to the\nbest of our knowledge. Our experiments also indicate that using syllabic rather\nthan character-based tokenization results in better speech recognition\nperformance for Kinyarwanda.\n","authors":["Antoine Nzeyimana"],"pdf_url":"https://arxiv.org/pdf/2308.11863v1.pdf","comment":"9 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2207.03364v3","updated":"2023-08-23T01:41:28Z","published":"2022-07-07T15:12:02Z","title":"Group Equality in Adaptive Submodular Maximization","summary":" In this paper, we study the classic submodular maximization problem subject\nto a group equality constraint under both non-adaptive and adaptive settings.\nIt has been shown that the utility function of many machine learning\napplications, including data summarization, influence maximization in social\nnetworks, and personalized recommendation, satisfies the property of\nsubmodularity. Hence, maximizing a submodular function subject to various\nconstraints can be found at the heart of many of those applications. On a high\nlevel, submodular maximization aims to select a group of most representative\nitems (e.g., data points). However, the design of most existing algorithms does\nnot incorporate the fairness constraint, leading to under- or\nover-representation of some particular groups. This motivates us to study the\nsubmodular maximization problem with group equality, where we aim to select a\ngroup of items to maximize a (possibly non-monotone) submodular utility\nfunction subject to a group equality constraint. To this end, we develop the\nfirst constant-factor approximation algorithm for this problem. The design of\nour algorithm is robust enough to be extended to solving the submodular\nmaximization problem under a more complicated adaptive setting. 
Moreover, we\nfurther extend our study to incorporating a global cardinality constraint and\nother fairness notations.\n","authors":["Shaojie Tang","Jing Yuan"],"pdf_url":"https://arxiv.org/pdf/2207.03364v3.pdf","comment":"This paper has been accepted by INFORMS Journal on Computing"},{"id":"http://arxiv.org/abs/2308.11854v1","updated":"2023-08-23T01:08:01Z","published":"2023-08-23T01:08:01Z","title":"Finding the Perfect Fit: Applying Regression Models to ClimateBench v1.0","summary":" Climate projections using data driven machine learning models acting as\nemulators, is one of the prevailing areas of research to enable policy makers\nmake informed decisions. Use of machine learning emulators as surrogates for\ncomputationally heavy GCM simulators reduces time and carbon footprints. In\nthis direction, ClimateBench [1] is a recently curated benchmarking dataset for\nevaluating the performance of machine learning emulators designed for climate\ndata. Recent studies have reported that despite being considered fundamental,\nregression models offer several advantages pertaining to climate emulations. In\nparticular, by leveraging the kernel trick, regression models can capture\ncomplex relationships and improve their predictive capabilities. This study\nfocuses on evaluating non-linear regression models using the aforementioned\ndataset. Specifically, we compare the emulation capabilities of three\nnon-linear regression models. Among them, Gaussian Process Regressor\ndemonstrates the best-in-class performance against standard evaluation metrics\nused for climate field emulation studies. However, Gaussian Process Regression\nsuffers from being computational resource hungry in terms of space and time\ncomplexity. Alternatively, Support Vector and Kernel Ridge models also deliver\ncompetitive results and but there are certain trade-offs to be addressed.\nAdditionally, we are actively investigating the performance of composite\nkernels and techniques such as variational inference to further enhance the\nperformance of the regression models and effectively model complex non-linear\npatterns, including phenomena like precipitation.\n","authors":["Anmol Chaure","Ashok Kumar Behera","Sudip Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2308.11854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08973v6","updated":"2023-08-23T01:05:39Z","published":"2023-02-17T16:19:26Z","title":"Measuring Equality in Machine Learning Security Defenses: A Case Study\n in Speech Recognition","summary":" Over the past decade, the machine learning security community has developed a\nmyriad of defenses for evasion attacks. An understudied question in that\ncommunity is: for whom do these defenses defend? This work considers common\napproaches to defending learned systems and how security defenses result in\nperformance inequities across different sub-populations. We outline appropriate\nparity metrics for analysis and begin to answer this question through empirical\nresults of the fairness implications of machine learning security methods. We\nfind that many methods that have been proposed can cause direct harm, like\nfalse rejection and unequal benefits from robustness training. The framework we\npropose for measuring defense equality can be applied to robustly trained\nmodels, preprocessing-based defenses, and rejection methods. We identify a set\nof datasets with a user-centered application and a reasonable computational\ncost suitable for case studies in measuring the equality of defenses. 
In our\ncase study of speech command recognition, we show how such adversarial training\nand augmentation have non-equal but complex protections for social subgroups\nacross gender, accent, and age in relation to user coverage. We present a\ncomparison of equality between two rejection-based defenses: randomized\nsmoothing and neural rejection, finding randomized smoothing more equitable due\nto the sampling mechanism for minority groups. This represents the first work\nexamining the disparity in the adversarial robustness in the speech domain and\nthe fairness evaluation of rejection-based defenses.\n","authors":["Luke E. Richards","Edward Raff","Cynthia Matuszek"],"pdf_url":"https://arxiv.org/pdf/2302.08973v6.pdf","comment":"Accepted to AISec'23"},{"id":"http://arxiv.org/abs/2308.11849v1","updated":"2023-08-23T00:55:39Z","published":"2023-08-23T00:55:39Z","title":"A deep reinforcement learning approach for real-time demand-responsive\n railway rescheduling to mitigate station overcrowding using mobile data","summary":" Real-time railway rescheduling is a timely and flexible technique to\nautomatically alter the operation schedule in response to time-varying\nconditions. Current research lacks data-driven approaches that capture\nreal-time passenger mobility during railway disruptions, relying mostly on\nOD-based data and model-based methods for estimating demands of trains.\nMeanwhile, the schedule-updating principles for a long-term disruption overlook\nthe uneven distribution of demand over time. To fill this gap, this paper\nproposes a demand-responsive approach by inferring real-world passenger\nmobility from mobile data (MD) to facilitate real-time rescheduling. Unlike\nnetwork-level approaches, this paper focuses on a heavy-demand station upstream\nof the disrupted area. The objective is to reschedule all trains on multiple\nroutes passing through this target station, which have been affected by a\nsevere emergency event such as a natural disaster. Particular attention should\nbe given to avoiding the accumulation of overcrowded passengers at this\nstation, to prevent additional accidents arising from overcrowding. This\nresearch addresses the challenges associated with this scenario, including the\ndynamics of arriving and leaving of passengers, station overcrowding, rolling\nstock shortage, open-ended disruption duration, integrated rescheduling on\nmultiple routes, and delays due to detours. A deep reinforcement learning (DRL)\nframework is proposed to determine the optimal rescheduled timetable, route\nstops, and rolling stock allocation, while considering real-time demand\nsatisfaction, station overcrowding, train capacity utilization, and headway\nsafety.\n","authors":["Enze Liu","Zhiyuan Lin","Judith Y. T. Wang","Hong Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11849v1.pdf","comment":"36 pages,16 figures"},{"id":"http://arxiv.org/abs/2308.11845v1","updated":"2023-08-23T00:49:29Z","published":"2023-08-23T00:49:29Z","title":"SEA: Shareable and Explainable Attribution for Query-based Black-box\n Attacks","summary":" Machine Learning (ML) systems are vulnerable to adversarial examples,\nparticularly those from query-based black-box attacks. Despite various efforts\nto detect and prevent such attacks, there is a need for a more comprehensive\napproach to logging, analyzing, and sharing evidence of attacks. 
While classic\nsecurity benefits from well-established forensics and intelligence sharing,\nMachine Learning is yet to find a way to profile its attackers and share\ninformation about them. In response, this paper introduces SEA, a novel ML\nsecurity system to characterize black-box attacks on ML systems for forensic\npurposes and to facilitate human-explainable intelligence sharing. SEA\nleverages the Hidden Markov Models framework to attribute the observed query\nsequence to known attacks. It thus understands the attack's progression rather\nthan just focusing on the final adversarial examples. Our evaluations reveal\nthat SEA is effective at attack attribution, even on their second occurrence,\nand is robust to adaptive strategies designed to evade forensics analysis.\nInterestingly, SEA's explanations of the attack behavior allow us even to\nfingerprint specific minor implementation bugs in attack libraries. For\nexample, we discover that the SignOPT and Square attacks implementation in ART\nv1.14 sends over 50% specific zero difference queries. We thoroughly evaluate\nSEA on a variety of settings and demonstrate that it can recognize the same\nattack's second occurrence with 90+% Top-1 and 95+% Top-3 accuracy.\n","authors":["Yue Gao","Ilia Shumailov","Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2308.11845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11842v1","updated":"2023-08-23T00:18:17Z","published":"2023-08-23T00:18:17Z","title":"${\\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative\n Multi-Agent Reinforcement Learning","summary":" Identification and analysis of symmetrical patterns in the natural world have\nled to significant discoveries across various scientific fields, such as the\nformulation of gravitational laws in physics and advancements in the study of\nchemical structures. In this paper, we focus on exploiting Euclidean symmetries\ninherent in certain cooperative multi-agent reinforcement learning (MARL)\nproblems and prevalent in many applications. We begin by formally\ncharacterizing a subclass of Markov games with a general notion of symmetries\nthat admits the existence of symmetric optimal values and policies. Motivated\nby these properties, we design neural network architectures with symmetric\nconstraints embedded as an inductive bias for multi-agent actor-critic methods.\nThis inductive bias results in superior performance in various cooperative MARL\nbenchmarks and impressive generalization capabilities such as zero-shot\nlearning and transfer learning in unseen scenarios with repeated symmetric\npatterns. The code is available at: https://github.com/dchen48/E3AC.\n","authors":["Dingyang Chen","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11841v1","updated":"2023-08-23T00:17:51Z","published":"2023-08-23T00:17:51Z","title":"A Survey for Federated Learning Evaluations: Goals and Measures","summary":" Evaluation is a systematic approach to assessing how well a system achieves\nits intended purpose. Federated learning (FL) is a novel paradigm for\nprivacy-preserving machine learning that allows multiple parties to\ncollaboratively train models without sharing sensitive data. However,\nevaluating FL is challenging due to its interdisciplinary nature and diverse\ngoals, such as utility, efficiency, and security. In this survey, we first\nreview the major evaluation goals adopted in the existing studies and then\nexplore the evaluation metrics used for each goal. 
We also introduce FedEval,\nan open-source platform that provides a standardized and comprehensive\nevaluation framework for FL algorithms in terms of their utility, efficiency,\nand security. Finally, we discuss several challenges and future research\ndirections for FL evaluation.\n","authors":["Di Chai","Leye Wang","Liu Yang","Junxue Zhang","Kai Chen","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2308.11841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11838v1","updated":"2023-08-23T00:10:29Z","published":"2023-08-23T00:10:29Z","title":"A Benchmark Study on Calibration","summary":" Deep neural networks are increasingly utilized in various machine learning\ntasks. However, as these models grow in complexity, they often face calibration\nissues, despite enhanced prediction accuracy. Many studies have endeavored to\nimprove calibration performance through data preprocessing, the use of specific\nloss functions, and training frameworks. Yet, investigations into calibration\nproperties have been somewhat overlooked. Our study leverages the Neural\nArchitecture Search (NAS) search space, offering an exhaustive model\narchitecture space for thorough calibration properties exploration. We\nspecifically create a model calibration dataset. This dataset evaluates 90\nbin-based and 12 additional calibration measurements across 117,702 unique\nneural networks within the widely employed NATS-Bench search space. Our\nanalysis aims to answer several longstanding questions in the field, using our\nproposed dataset: (i) Can model calibration be generalized across different\ntasks? (ii) Can robustness be used as a calibration measurement? (iii) How\nreliable are calibration metrics? (iv) Does a post-hoc calibration method\naffect all models uniformly? (v) How does calibration interact with accuracy?\n(vi) What is the impact of bin size on calibration measurement? (vii) Which\narchitectural designs are beneficial for calibration? Additionally, our study\nbridges an existing gap by exploring calibration within NAS. By providing this\ndataset, we enable further research into NAS calibration. As far as we are\naware, our research represents the first large-scale investigation into\ncalibration properties and the premier study of calibration issues within NAS.\n","authors":["Linwei Tao","Younan Zhu","Haolan Guo","Minjing Dong","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.11838v1.pdf","comment":"39 pages, 35 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.12141v1","updated":"2023-08-23T13:56:38Z","published":"2023-08-23T13:56:38Z","title":"Aparecium: Revealing Secrets from Physical Photographs","summary":" Watermarking is a crucial tool for safeguarding copyrights and can serve as a\nmore aesthetically pleasing alternative to QR codes. In recent years,\nwatermarking methods based on deep learning have proved superior robustness\nagainst complex physical distortions than traditional watermarking methods.\nHowever, they have certain limitations that render them less effective in\npractice. For instance, current solutions necessitate physical photographs to\nbe rectangular for accurate localization, cannot handle physical bending or\nfolding, and require the hidden area to be completely captured at a close\ndistance and small angle. To overcome these challenges, we propose a novel deep\nwatermarking framework dubbed \\textit{Aparecium}. 
Specifically, we preprocess\nsecrets (i.e., watermarks) into a pattern and then embed it into the cover\nimage, which is symmetrical to the final decoding-then-extracting process. To\ncapture the watermarked region from complex physical scenarios, a locator is\nalso introduced. Besides, we adopt a three-stage training strategy for training\nconvergence. Extensive experiments demonstrate that \\textit{Aparecium} is not\nonly robust against different digital distortions, but also can resist various\nphysical distortions, such as screen-shooting and printing-shooting, even in\nsevere cases including different shapes, curvature, folding, incompleteness,\nlong distances, and big angles while maintaining high visual quality.\nFurthermore, some ablation studies are also conducted to verify our design.\n","authors":["Zhe Lei","Jie Zhang","Jingtao Li","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2308.12141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02051v2","updated":"2023-08-23T12:45:27Z","published":"2023-04-04T18:03:04Z","title":"Multimodal Garment Designer: Human-Centric Latent Diffusion Models for\n Fashion Image Editing","summary":" Fashion illustration is used by designers to communicate their vision and to\nbring the design idea from conceptualization to realization, showing how\nclothes interact with the human body. In this context, computer vision can thus\nbe used to improve the fashion design process. Differently from previous works\nthat mainly focused on the virtual try-on of garments, we propose the task of\nmultimodal-conditioned fashion image editing, guiding the generation of\nhuman-centric fashion images by following multimodal prompts, such as text,\nhuman body poses, and garment sketches. We tackle this problem by proposing a\nnew architecture based on latent diffusion models, an approach that has not\nbeen used before in the fashion domain. Given the lack of existing datasets\nsuitable for the task, we also extend two existing fashion datasets, namely\nDress Code and VITON-HD, with multimodal annotations collected in a\nsemi-automatic manner. Experimental results on these new datasets demonstrate\nthe effectiveness of our proposal, both in terms of realism and coherence with\nthe given multimodal inputs. Source code and collected multimodal annotations\nare publicly available at:\nhttps://github.com/aimagelab/multimodal-garment-designer.\n","authors":["Alberto Baldrati","Davide Morelli","Giuseppe Cartella","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2304.02051v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12045v1","updated":"2023-08-23T10:25:37Z","published":"2023-08-23T10:25:37Z","title":"CgT-GAN: CLIP-guided Text GAN for Image Captioning","summary":" The large-scale visual-language pre-trained model, Contrastive Language-Image\nPre-training (CLIP), has significantly improved image captioning for scenarios\nwithout human-annotated image-caption pairs. Recent advanced CLIP-based image\ncaptioning without human annotations follows a text-only training paradigm,\ni.e., reconstructing text from shared embedding space. Nevertheless, these\napproaches are limited by the training/inference gap or huge storage\nrequirements for text embeddings. Given that it is trivial to obtain images in\nthe real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates\nimages into the training process to enable the model to \"see\" real visual\nmodality. 
Particularly, we use adversarial training to teach CgT-GAN to mimic\nthe phrases of an external text corpus and CLIP-based reward to provide\nsemantic guidance. The caption generator is jointly rewarded based on the\ncaption naturalness to human language calculated from the GAN's discriminator\nand the semantic guidance reward computed by the CLIP-based reward module. In\naddition to the cosine similarity as the semantic guidance reward (i.e.,\nCLIP-cos), we further introduce a novel semantic guidance reward called\nCLIP-agg, which aligns the generated caption with a weighted text embedding by\nattentively aggregating the entire corpus. Experimental results on three\nsubtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms\nstate-of-the-art methods significantly across all metrics. Code is available at\nhttps://github.com/Lihr747/CgtGAN.\n","authors":["Jiarui Yu","Haoran Li","Yanbin Hao","Bin Zhu","Tong Xu","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2308.12045v1.pdf","comment":"Accepted at ACM MM 2023"},{"id":"http://arxiv.org/abs/2308.11971v1","updated":"2023-08-23T07:36:30Z","published":"2023-08-23T07:36:30Z","title":"EVE: Efficient Vision-Language Pre-training with Masked Prediction and\n Modality-Aware MoE","summary":" Building scalable vision-language models to learn from diverse, multimodal\ndata remains an open challenge. In this paper, we introduce an Efficient\nVision-languagE foundation model, namely EVE, which is one unified multimodal\nTransformer pre-trained solely by one unified pre-training task. Specifically,\nEVE encodes both vision and language within a shared Transformer network\nintegrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which\ncapture modality-specific information by selectively switching to different\nexperts. To unify pre-training tasks of vision and language, EVE performs\nmasked signal modeling on image-text pairs to reconstruct masked signals, i.e.,\nimage pixels and text tokens, given visible signals. This simple yet effective\npre-training objective accelerates training by 3.5x compared to the model\npre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing\nto the combination of the unified architecture and pre-training task, EVE is\neasy to scale up, enabling better downstream performance with fewer resources\nand faster training speed. Despite its simplicity, EVE achieves\nstate-of-the-art performance on various vision-language downstream tasks,\nincluding visual question answering, visual reasoning, and image-text\nretrieval.\n","authors":["Junyi Chen","Longteng Guo","Jia Sun","Shuai Shao","Zehuan Yuan","Liang Lin","Dongyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.11971v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07056v4","updated":"2023-08-23T06:39:08Z","published":"2023-08-14T10:31:29Z","title":"VoxBlink: X-Large Speaker Verification Dataset on Camera","summary":" In this paper, we contribute a novel and extensive dataset for speaker\nverification, which contains noisy 38k identities/1.45M utterances (VoxBlink)\nand relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for\ntraining. Firstly, we accumulate a 60K+ users' list with their avatars and\ndownload their short videos on YouTube. We then established an automatic and\nscalable pipeline to extract relevant speech and video segments from these\nvideos. To our knowledge, the VoxBlink dataset is one of the largest speaker\nrecognition datasets available. 
Secondly, we conduct a series of experiments\nbased on different backbones trained on a mix of the VoxCeleb2 and the\nVoxBlink-Clean. Our findings highlight a notable performance improvement,\nranging from 13% to 30%, across different backbone architectures upon\nintegrating our dataset for training. The dataset will be made publicly\navailable shortly.\n","authors":["Yuke Lin","Xiaoyi Qin","Ming Cheng","Ning Jiang","Guoqing Zhao","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v4.pdf","comment":"submit to ICASSP2024"},{"id":"http://arxiv.org/abs/2211.06924v3","updated":"2023-08-23T04:02:28Z","published":"2022-11-13T15:11:03Z","title":"A Tale of Two Graphs: Freezing and Denoising Graph Structures for\n Multimodal Recommendation","summary":" Multimodal recommender systems utilizing multimodal features (e.g., images\nand textual descriptions) typically show better recommendation accuracy than\ngeneral recommendation models based solely on user-item interactions.\nGenerally, prior work fuses multimodal features into item ID embeddings to\nenrich item representations, thus failing to capture the latent semantic\nitem-item structures. In this context, LATTICE proposes to learn the latent\nstructure between items explicitly and achieves state-of-the-art performance\nfor multimodal recommendations. However, we argue the latent graph structure\nlearning of LATTICE is both inefficient and unnecessary. Experimentally, we\ndemonstrate that freezing its item-item structure before training can also\nachieve competitive performance. Based on this finding, we propose a simple yet\neffective model, dubbed as FREEDOM, that FREEzes the item-item graph and\nDenOises the user-item interaction graph simultaneously for Multimodal\nrecommendation. Theoretically, we examine the design of FREEDOM through a graph\nspectral perspective and demonstrate that it possesses a tighter upper bound on\nthe graph spectrum. In denoising the user-item interaction graph, we devise a\ndegree-sensitive edge pruning method, which rejects possibly noisy edges with a\nhigh probability when sampling the graph. We evaluate the proposed model on\nthree real-world datasets and show that FREEDOM can significantly outperform\ncurrent strongest baselines. Compared with LATTICE, FREEDOM achieves an average\nimprovement of 19.07% in recommendation accuracy while reducing its memory cost\nup to 6$\\times$ on large graphs. 
The source code is available at:\nhttps://github.com/enoche/FREEDOM.\n","authors":["Xin Zhou","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2211.06924v3.pdf","comment":"Accepted to ACM Multimedia (MM) 2023"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM 
zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 49 + +
+
+
+ + ☆ D4: Improving LLM Pretraining via Document De-Duplication and + Diversification + + +
+ Over recent years, an increasing amount of compute and data has been poured into training large language models (LLMs), usually by doing one-pass learning on as many tokens as possible, randomly selected from large-scale web corpora. While training on ever-larger portions of the internet leads to consistent performance improvements, the size of these improvements diminishes with scale, and there has been little work exploring the effect of data selection on pre-training and downstream performance beyond simple de-duplication methods such as MinHash. Here, we show that careful data selection (on top of de-duplicated data) via pre-trained model embeddings can speed up training (20% efficiency gains) and improve average downstream accuracy on 16 NLP tasks (by up to 2%) at the 6.7B model scale. Furthermore, we show that repeating data intelligently consistently outperforms baseline training, while repeating random data performs worse than baseline training. Our results indicate that clever data selection can significantly improve LLM pre-training; they call into question the common practice of training for a single epoch on as much data as possible and demonstrate a path to keep improving models beyond the limits of randomly sampled web data.
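The selection recipe above (embedding-based de-duplication plus diversification on top of standard de-duplication) can be illustrated with a rough Python sketch. This is a hedged illustration only; the encoder name, thresholds, clustering choice, and the "keep points far from their cluster centroid" heuristic are assumptions for exposition, not the authors' released pipeline.

# Hedged sketch of embedding-based document selection (not the paper's exact pipeline).
import numpy as np
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer  # assumed encoder choice

def select_documents(docs, keep_ratio=0.5, n_clusters=100, dup_threshold=0.95):
    encoder = SentenceTransformer("all-MiniLM-L6-v2")   # placeholder model name
    emb = encoder.encode(docs, normalize_embeddings=True)

    # 1) Semantic de-duplication: greedily drop documents whose nearest kept
    #    neighbour exceeds a cosine-similarity threshold.
    kept = []
    for i, e in enumerate(emb):
        if all(np.dot(e, emb[j]) < dup_threshold for j in kept):
            kept.append(i)

    # 2) Diversification: cluster the survivors and keep the points farthest
    #    from their cluster centroid (the least prototypical within each cluster).
    emb_kept = emb[kept]
    km = KMeans(n_clusters=min(n_clusters, len(kept)), n_init=10).fit(emb_kept)
    dist = np.linalg.norm(emb_kept - km.cluster_centers_[km.labels_], axis=1)
    order = np.argsort(-dist)                     # farthest-from-centroid first
    n_keep = int(keep_ratio * len(kept))
    return [docs[kept[i]] for i in order[:n_keep]]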
+
+
+
+
+ + ☆ Simple is Better and Large is Not Enough: Towards Ensembling of + Foundational Language Models SC + + +
+ Foundational Language Models (FLMs) have advanced natural language processing (NLP) research. Researchers are developing ever-larger FLMs (e.g., XLNet, T5) to enable contextualized language representation, classification, and generation. While developing larger FLMs has brought significant advantages, it is also a liability with respect to hallucination and predictive uncertainty. Fundamentally, larger FLMs are built on the same foundations as smaller FLMs (e.g., BERT); hence, one must recognize the potential of smaller FLMs, which can be realized through an ensemble. In the current research, we perform a reality check on FLMs and their ensembles on benchmark and real-world datasets. We hypothesize that ensembling FLMs can influence the individualistic attention of each FLM and unravel the strength of coordination and cooperation among different FLMs. We utilize BERT and define three ensemble techniques: Shallow, Semi, and Deep, wherein the Deep ensemble introduces a knowledge-guided reinforcement learning approach. We find that the proposed Deep-Ensemble BERT outperforms its larger variant, BERT-large, by a large factor on datasets that show the usefulness of NLP in sensitive fields, such as mental health.
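The Shallow/Semi/Deep variants are only named above, so as a generic illustration of the simplest case, the sketch below averages class probabilities from several fine-tuned BERT-base checkpoints. The checkpoint paths and the averaging scheme are assumptions, not the paper's Shallow ensemble.

# Hedged sketch of a "shallow" ensemble: average class probabilities from
# several fine-tuned BERT-base checkpoints (checkpoint paths are placeholders).
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

CHECKPOINTS = ["ckpt_seed0", "ckpt_seed1", "ckpt_seed2"]  # hypothetical paths

def ensemble_predict(texts):
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
    probs = []
    for path in CHECKPOINTS:
        model = AutoModelForSequenceClassification.from_pretrained(path)
        model.eval()
        with torch.no_grad():
            logits = model(**batch).logits
        probs.append(torch.softmax(logits, dim=-1))
    # Mean probability across ensemble members, then the most likely class.
    return torch.stack(probs).mean(dim=0).argmax(dim=-1)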
+
+ comment: Accepted at the 10th Mid-Atlantic Student Colloquium on Speech, + Language and Learning (MASC-SLL 2023) +
+
+
+
+
+ + ☆ Prompt2Model: Generating Deployable Models from Natural Language + Instructions + + +
+ Large language models (LLMs) enable system builders today to create competent NLP systems through prompting, where they only need to describe the task in natural language and provide a few examples. However, in other ways, LLMs are a step backward from traditional special-purpose NLP models; they require extensive computational resources for deployment and can be gated behind APIs. In this paper, we propose Prompt2Model, a general-purpose method that takes a natural language task description, like the prompts provided to LLMs, and uses it to train a special-purpose model that is conducive to deployment. This is done through a multi-step process of retrieving existing datasets and pretrained models, generating datasets using LLMs, and supervised fine-tuning on these retrieved and generated datasets. Over three tasks, we demonstrate that, given the same few-shot prompt as input, Prompt2Model trains models that outperform the results of a strong LLM, gpt-3.5-turbo, by an average of 20% while being up to 700 times smaller. We also show that this data can be used to obtain reliable estimates of model performance, enabling model developers to assess model reliability before deployment. Prompt2Model is available open-source at https://github.com/neulab/prompt2model.
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ How to Protect Copyright Data in Optimization of Large Language Models? + + +
+ Large language models (LLMs) and generative AI have played a transformative +role in computer research and applications. Controversy has arisen as to +whether these models output copyrighted data, which can occur if the data the +models are trained on is copyrighted. LLMs are built on the transformer neural +network architecture, which in turn relies on a mathematical computation called +Attention that uses the softmax function. + In this paper, we show that large language model training and optimization +can be seen as a softmax regression problem. We then establish a method of +efficiently performing softmax regression, in a way that prevents the +regression function from generating copyright data. This establishes a +theoretical method of training large language models in a way that avoids +generating copyright data. + +
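The softmax-regression view sketched above can be made concrete with a hedged formulation (the notation below is an illustrative assumption, not quoted from the paper): fit a vector x so that the softmax of a linear map matches a target distribution b,

\[
\min_{x \in \mathbb{R}^{d}} \; \Bigl\| \langle \exp(Ax), \mathbf{1}_n \rangle^{-1} \exp(Ax) \; - \; b \Bigr\|_2^2 ,
\]

where A \in \mathbb{R}^{n \times d} stands in for the attention inputs and b for the observed output distribution. On this reading, avoiding copyrighted output amounts to solving the regression under an additional constraint or regularizer that keeps the fitted distribution away from copyrighted targets.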
+
+
+
+
+ + ☆ Diffusion Language Models Can Perform Many Tasks with Scaling and + Instruction-Finetuning + + +
+ The recent surge of generative AI has been fueled by the generative power of diffusion probabilistic models and the scalable capabilities of large language models. Despite their potential, it remains unclear whether diffusion language models can solve general language tasks comparably to their autoregressive counterparts. This paper demonstrates that scaling diffusion models with respect to data, model size, and tasks can effectively make them strong language learners. We build competent diffusion language models at scale by first acquiring knowledge from massive data via masked language modeling pretraining, exploiting the intrinsic connection between the two objectives. We then reprogram pretrained masked language models into diffusion language models via diffusive adaptation, wherein task-specific finetuning and instruction finetuning are explored to unlock their versatility in solving general language tasks. Experiments show that scaling diffusion language models consistently improves performance across downstream language tasks. We further discover that instruction finetuning can elicit zero-shot and few-shot in-context learning abilities that help tackle many unseen tasks by following natural language instructions, and that it shows promise in advanced and challenging abilities such as reasoning.
+
+
+
+
+ + ☆ The Challenges of Machine Learning for Trust and Safety: A Case Study on + Misinformation Detection + + +
+ We examine the disconnect between scholarship and practice in applying +machine learning to trust and safety problems, using misinformation detection +as a case study. We systematize literature on automated detection of +misinformation across a corpus of 270 well-cited papers in the field. We then +examine subsets of papers for data and code availability, design missteps, +reproducibility, and generalizability. We find significant shortcomings in the +literature that call into question claimed performance and practicality. +Detection tasks are often meaningfully distinct from the challenges that online +services actually face. Datasets and model evaluation are often +non-representative of real-world contexts, and evaluation frequently is not +independent of model training. Data and code availability is poor. Models do +not generalize well to out-of-domain data. Based on these results, we offer +recommendations for evaluating machine learning applications to trust and +safety problems. Our aim is for future work to avoid the pitfalls that we +identify. + +
+
+
+
+
+ + ☆ Curriculum Learning with Adam: The Devil Is in the Wrong Details + + +
+ Curriculum learning (CL) posits that machine learning models -- similar to +humans -- may learn more efficiently from data that match their current +learning progress. However, CL methods are still poorly understood and, in +particular for natural language processing (NLP), have achieved only limited +success. In this paper, we explore why. Starting from an attempt to replicate +and extend a number of recent curriculum methods, we find that their results +are surprisingly brittle when applied to NLP. A deep dive into the +(in)effectiveness of the curricula in some scenarios shows us why: when +curricula are employed in combination with the popular Adam optimisation +algorithm, they oftentimes learn to adapt to suboptimally chosen optimisation +parameters for this algorithm. We present a number of different case studies +with different common hand-crafted and automated CL approaches to illustrate +this phenomenon, and we find that none of them outperforms optimisation with +only Adam with well-chosen hyperparameters. As such, our results contribute to +understanding why CL methods work, but at the same time urge caution when +claiming positive results. + +
+
+
+
+
+ + ☆ Evaluation of Faithfulness Using the Longest Supported Subsequence + + +
+ As increasingly sophisticated language models emerge, their trustworthiness becomes a pivotal issue, especially in tasks such as summarization and question-answering. Ensuring their responses are contextually grounded and faithful is challenging due to the linguistic diversity and the myriad of possible answers. In this paper, we introduce a novel approach to evaluating the faithfulness of machine-generated text by computing the longest non-contiguous subsequence of the claim that is supported by the context, which we refer to as the Longest Supported Subsequence (LSS). Using a new human-annotated dataset, we finetune a model to generate LSS. We introduce a new method of evaluation and demonstrate that these metrics correlate better with human ratings when LSS is employed than when it is not. Our proposed metric demonstrates an 18% improvement over the prevailing state-of-the-art metric for faithfulness on our dataset. Our metric consistently outperforms other metrics on a summarization dataset across six different models. Finally, we compare several popular Large Language Models (LLMs) for faithfulness using this metric. We release the human-annotated dataset built for predicting LSS and our fine-tuned model for evaluating faithfulness.
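A hedged sketch of what computing such a supported subsequence could look like at the whitespace-token level: a standard longest-common-subsequence dynamic program between the claim and the context, with the LSS-to-claim length ratio as a crude faithfulness score. The tokenization and the scoring ratio are assumptions for illustration; the paper fine-tunes a model to generate the LSS rather than computing it this way.

# Hedged sketch: token-level longest common subsequence between claim and
# context, used as a crude stand-in for the Longest Supported Subsequence.
def longest_supported_subsequence(claim: str, context: str) -> list[str]:
    a, b = claim.split(), context.split()
    # dp[i][j] = length of the LCS of a[:i] and b[:j]
    dp = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            dp[i][j] = dp[i-1][j-1] + 1 if a[i-1] == b[j-1] else max(dp[i-1][j], dp[i][j-1])
    # Backtrack to recover the subsequence itself.
    i, j, out = len(a), len(b), []
    while i and j:
        if a[i-1] == b[j-1]:
            out.append(a[i-1]); i -= 1; j -= 1
        elif dp[i-1][j] >= dp[i][j-1]:
            i -= 1
        else:
            j -= 1
    return out[::-1]

def faithfulness_score(claim: str, context: str) -> float:
    lss = longest_supported_subsequence(claim, context)
    return len(lss) / max(len(claim.split()), 1)   # fraction of the claim that is supported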
+
+
+
+
+ + ☆ Semantic Change Detection for the Romanian Language + + +
+ Automatic semantic change detection methods try to identify the changes that appear over time in the meaning of words by analyzing their usage in diachronic corpora. In this paper, we analyze different strategies to create static and contextual word embedding models, i.e., Word2Vec and ELMo, on real-world English and Romanian datasets. To test our pipeline and determine the performance of our models, we first evaluate both word embedding models on an English dataset (SEMEVAL-CCOHA). Afterward, we focus our experiments on a Romanian dataset, and we underline different aspects of semantic change in this low-resource language, such as meaning acquisition and loss. The experimental results show that, depending on the corpus, the most important factors to consider are the choice of embedding model and the distance measure used to compute a semantic-change score.
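For the static-embedding case, a hedged sketch of the scoring step: train one Word2Vec model per time period, align the two vector spaces with an orthogonal Procrustes rotation, and use the cosine distance between a word's aligned vectors as its change score. The alignment choice and hyperparameters below are assumptions, not the paper's exact setup.

# Hedged sketch: semantic-change scores from two period-specific Word2Vec models.
import numpy as np
from gensim.models import Word2Vec
from scipy.linalg import orthogonal_procrustes

def change_scores(sentences_old, sentences_new, vocab):
    m_old = Word2Vec(sentences_old, vector_size=100, min_count=5, workers=4)
    m_new = Word2Vec(sentences_new, vector_size=100, min_count=5, workers=4)
    shared = [w for w in vocab if w in m_old.wv and w in m_new.wv]
    A = np.stack([m_old.wv[w] for w in shared])
    B = np.stack([m_new.wv[w] for w in shared])
    R, _ = orthogonal_procrustes(A, B)          # rotate the old space onto the new one
    A = A @ R
    # Cosine distance per word: higher means more semantic change.
    cos = np.sum(A * B, axis=1) / (np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1))
    return dict(zip(shared, 1.0 - cos))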
+
+
+
+
+ + ☆ Instruction Position Matters in Sequence Generation with Large Language + Models + + +
+ Large language models (LLMs) are capable of performing conditional sequence generation tasks, such as translation or summarization, through instruction fine-tuning. The fine-tuning data is generally constructed by sequentially concatenating a specific task instruction, an input sentence, and the corresponding response. Considering the locality modeled by the self-attention mechanism of LLMs, these models face the risk of instruction forgetting when generating responses for long input sentences. To mitigate this issue, we propose enhancing the instruction-following capability of LLMs by shifting the position of task instructions to after the input sentences. Theoretical analysis suggests that our straightforward method can alter the model's learning focus, thereby emphasizing the training of instruction-following capabilities. Concurrently, experimental results demonstrate that our approach consistently outperforms traditional settings across various model scales (1B / 7B / 13B) and different sequence generation tasks (translation and summarization), without any additional data or annotation costs. Notably, our method significantly improves the zero-shot performance on conditional sequence generation, e.g., by up to 9.7 BLEU points on WMT zero-shot translation tasks.
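The data-formatting change described above amounts to swapping where the instruction is concatenated. A hedged sketch follows; the template strings and delimiters are assumptions, not the paper's exact prompt format.

# Hedged sketch: pre- vs post-instruction formatting of one training example.
def format_pre_instruction(instruction: str, source: str, target: str) -> str:
    # Conventional order: instruction, then input, then response.
    return f"{instruction}\n{source}\n### Response:\n{target}"

def format_post_instruction(instruction: str, source: str, target: str) -> str:
    # Proposed order: input first, instruction just before the response,
    # so the instruction stays close to the tokens being generated.
    return f"{source}\n{instruction}\n### Response:\n{target}"

example = format_post_instruction(
    "Translate the following sentence into German.",
    "The weather is nice today.",
    "Das Wetter ist heute schön.",
)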
+
+ comment: Codes and results are at + https://github.com/Adaxry/Post-Instruction/tree/main +
+
+
+
+
+ + ☆ Out of the Cage: How Stochastic Parrots Win in Cyber Security + Environments + + +
+ Large Language Models (LLMs) have gained widespread popularity across diverse +domains involving text generation, summarization, and various natural language +processing tasks. Despite their inherent limitations, LLM-based designs have +shown promising capabilities in planning and navigating open-world scenarios. +This paper introduces a novel application of pre-trained LLMs as agents within +cybersecurity network environments, focusing on their utility for sequential +decision-making processes. + We present an approach wherein pre-trained LLMs are leveraged as attacking +agents in two reinforcement learning environments. Our proposed agents +demonstrate similar or better performance against state-of-the-art agents +trained for thousands of episodes in most scenarios and configurations. In +addition, the best LLM agents perform similarly to human testers of the +environment without any additional training process. This design highlights the +potential of LLMs to efficiently address complex decision-making tasks within +cybersecurity. + Furthermore, we introduce a new network security environment named +NetSecGame. The environment is designed to eventually support complex +multi-agent scenarios within the network security domain. The proposed +environment mimics real network attacks and is designed to be highly modular +and adaptable for various scenarios. + +
+
+ comment: Under review. 10 pages plus appendices, 7 figures, 4 tables +
+
+
+
+
+ + ☆ InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4 + + +
+ Multimodal large language models acquire their instruction-following capabilities through a two-stage training process: pre-training on image-text pairs and fine-tuning on supervised vision-language instruction data. Recent studies have shown that large language models can achieve satisfactory results even with a limited amount of high-quality instruction-following data. In this paper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset comprising only 200 examples, amounting to approximately 6% of the instruction-following data used in the alignment dataset for MiniGPT-4. We first propose several metrics to assess the quality of multimodal instruction data. Based on these metrics, we present a simple and effective data selector to automatically identify and filter out low-quality vision-language data. By employing this method, InstructionGPT-4 outperforms the original MiniGPT-4 on various evaluations (e.g., visual question answering, GPT-4 preference). Overall, our findings demonstrate that a smaller amount of high-quality instruction-tuning data is sufficient to enable multimodal large language models to generate better output.
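A hedged sketch of such a quality-based selector: score each instruction example with a few heuristics, combine the scores, and keep the top-k. The specific metrics, weights, and field names below are placeholders; the paper's actual quality metrics are not reproduced here.

# Hedged sketch of a quality-based instruction-data selector (metrics are placeholders).
def score_example(example: dict) -> float:
    response = example["response"]
    tokens = response.split()
    length_score = min(len(tokens) / 50.0, 1.0)                       # penalize trivially short answers
    diversity_score = len(set(tokens)) / max(len(tokens), 1)          # crude lexical diversity
    format_score = 1.0 if response.strip().endswith((".", "!", "?")) else 0.5
    return 0.4 * length_score + 0.4 * diversity_score + 0.2 * format_score

def select_top_k(examples: list[dict], k: int = 200) -> list[dict]:
    # Keep the k highest-scoring instruction examples.
    return sorted(examples, key=score_example, reverse=True)[:k]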
+
+
+
+
+ + ☆ FlexKBQA: A Flexible LLM-Powered Framework for Few-Shot Knowledge Base + Question Answering + + +
+ Knowledge base question answering (KBQA) is a critical yet challenging task due to the vast number of entities within knowledge bases and the diversity of natural language questions posed by users. Unfortunately, the performance of most KBQA models tends to decline significantly in real-world scenarios where high-quality annotated data is insufficient. To mitigate the burden associated with manual annotation, we introduce FlexKBQA, which utilizes Large Language Models (LLMs) as program translators to address the challenges inherent in the few-shot KBQA task. Specifically, FlexKBQA leverages automated algorithms to sample diverse programs, such as SPARQL queries, from the knowledge base, which are subsequently converted into natural language questions via LLMs. This synthetic dataset facilitates training a specialized lightweight model for the KB. Additionally, to reduce the barriers of distribution shift between synthetic data and real user questions, FlexKBQA introduces an execution-guided self-training method to iteratively leverage unlabeled user questions. Furthermore, we explore harnessing the inherent reasoning capability of LLMs to enhance the entire framework. Consequently, FlexKBQA delivers substantial flexibility, encompassing data annotation and deployment, while remaining domain agnostic. Through extensive experiments on GrailQA, WebQSP, and KQA Pro, we observe that under few-shot and even the more challenging zero-shot scenarios, FlexKBQA achieves impressive results with a few annotations, surpassing all previous baselines and even approaching the performance of supervised models, achieving a remarkable 93% performance relative to fully-supervised models. We posit that FlexKBQA represents a significant advancement towards exploring better integration of large and lightweight models. The code is open-sourced.
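Roughly, the synthetic-data step described above could look like the sketch below: fill a SPARQL template with entities and relations sampled from the KB, have an LLM verbalize each program into a question, and pair the question with the program's execution result. The template shape and the `llm_verbalize` and `execute_sparql` helpers are hypothetical placeholders, not FlexKBQA's released code.

# Hedged sketch of KB-grounded synthetic question generation (helpers are hypothetical).
import random

TEMPLATE = "SELECT ?x WHERE {{ ?x <{relation}> <{entity}> . }}"   # illustrative SPARQL shape

def sample_programs(entities, relations, n=100):
    return [TEMPLATE.format(relation=random.choice(relations),
                            entity=random.choice(entities)) for _ in range(n)]

def build_synthetic_dataset(entities, relations, llm_verbalize, execute_sparql, n=100):
    dataset = []
    for program in sample_programs(entities, relations, n):
        question = llm_verbalize(program)       # hypothetical LLM call: program -> NL question
        answer = execute_sparql(program)        # hypothetical KB execution
        if answer:                              # keep only executable, non-empty programs
            dataset.append({"question": question, "program": program, "answer": answer})
    return dataset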
+
+
+
+
+ + ☆ Aligning Language Models with Offline Reinforcement Learning from Human + Feedback + + +
+ Learning from human preferences is crucial for language models (LMs) to +effectively cater to human needs and societal values. Previous research has +made notable progress by leveraging human feedback to follow instructions. +However, these approaches rely primarily on online reinforcement learning (RL) +techniques like Proximal Policy Optimization (PPO), which have been proven +unstable and challenging to tune for language models. Moreover, PPO requires +complex distributed system implementation, hindering the efficiency of +large-scale distributed training. In this study, we propose an offline +reinforcement learning from human feedback (RLHF) framework to align LMs using +pre-generated samples without interacting with RL environments. Specifically, +we explore maximum likelihood estimation (MLE) with filtering, reward-weighted +regression (RWR), and Decision Transformer (DT) to align language models to +human preferences. By employing a loss function similar to supervised +fine-tuning, our methods ensure more stable model training than PPO with a +simple machine learning system (MLSys) and much fewer (around 12.3%) computing +resources. Experimental results demonstrate the DT alignment outperforms other +Offline RLHF methods and is better than PPO. + 
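+ As one way to picture the offline objective, the sketch below implements a reward-weighted regression (RWR) style loss: the usual token-level cross-entropy is weighted per sample by exp(reward / beta). The exact weighting and normalization in the paper may differ; this is only an illustration under that assumption.
+
+# Sketch (Python/PyTorch): reward-weighted supervised loss for alignment.
+import torch
+import torch.nn.functional as F
+
+def rwr_loss(logits, labels, rewards, beta=1.0, ignore_index=-100):
+    # logits: (batch, seq, vocab); labels: (batch, seq); rewards: (batch,)
+    vocab = logits.size(-1)
+    token_loss = F.cross_entropy(
+        logits.view(-1, vocab), labels.view(-1),
+        ignore_index=ignore_index, reduction="none",
+    ).view(labels.shape)
+    mask = (labels != ignore_index).float()
+    seq_loss = (token_loss * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
+    weights = torch.exp(rewards / beta)
+    weights = weights / weights.sum()  # normalize over the batch
+    return (weights * seq_loss).sum()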
+
+
+
+
+ + ☆ CgT-GAN: CLIP-guided Text GAN for Image Captioning ACM MM 2023 + + +
+ The large-scale visual-language pre-trained model, Contrastive Language-Image +Pre-training (CLIP), has significantly improved image captioning for scenarios +without human-annotated image-caption pairs. Recent advanced CLIP-based image +captioning without human annotations follows a text-only training paradigm, +i.e., reconstructing text from shared embedding space. Nevertheless, these +approaches are limited by the training/inference gap or huge storage +requirements for text embeddings. Given that it is trivial to obtain images in +the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates +images into the training process to enable the model to "see" real visual +modality. Particularly, we use adversarial training to teach CgT-GAN to mimic +the phrases of an external text corpus and CLIP-based reward to provide +semantic guidance. The caption generator is jointly rewarded based on the +caption naturalness to human language calculated from the GAN's discriminator +and the semantic guidance reward computed by the CLIP-based reward module. In +addition to the cosine similarity as the semantic guidance reward (i.e., +CLIP-cos), we further introduce a novel semantic guidance reward called +CLIP-agg, which aligns the generated caption with a weighted text embedding by +attentively aggregating the entire corpus. Experimental results on three +subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms +state-of-the-art methods significantly across all metrics. Code is available at +https://github.com/Lihr747/CgtGAN. + +
+
+ comment: Accepted at ACM MM 2023 +
+
+
+
+
+ + ☆ IncreLoRA: Incremental Parameter Allocation Method for + Parameter-Efficient Fine-tuning + + +
+ With the increasing size of pre-trained language models (PLMs), fine-tuning +all the parameters in the model is not efficient, especially when there are a +large number of downstream tasks, which incur significant training and storage +costs. Many parameter-efficient fine-tuning (PEFT) approaches have been +proposed, among which, Low-Rank Adaptation (LoRA) is a representative approach +that injects trainable rank decomposition matrices into every target module. +Yet LoRA ignores the importance of parameters in different modules. To address +this problem, many works have been proposed to prune the parameters of LoRA. +However, under limited training conditions, the upper bound of the rank of the +pruned parameter matrix is still affected by the preset values. We, therefore, +propose IncreLoRA, an incremental parameter allocation method that adaptively +adds trainable parameters during training based on the importance scores of +each module. This approach is different from the pruning method as it is not +limited by the initial number of training parameters, and each parameter matrix +has a higher rank upper bound for the same training overhead. We conduct +extensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA. +The results show that our method achieves higher parameter efficiency, +especially under low-resource settings, where it significantly outperforms the +baselines. Our code is publicly available. + 
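+ The allocation idea can be pictured with a small sketch: every few steps, modules are ranked by an importance score and the top-ranked ones receive additional rank. The importance measure and growth schedule below are simplified assumptions, not IncreLoRA's exact procedure.
+
+# Sketch (Python): importance-driven incremental rank allocation.
+def allocation_step(ranks, importance, budget_per_step=2):
+    """ranks: dict module -> current rank; importance: dict module -> score."""
+    for name in sorted(ranks, key=lambda n: importance[n], reverse=True)[:budget_per_step]:
+        ranks[name] += 1  # grow this module's rank by one
+    return ranks
+
+# Example: the two most important modules each gain one rank.
+print(allocation_step({"q_proj": 4, "k_proj": 4, "v_proj": 4},
+                      {"q_proj": 0.9, "k_proj": 0.2, "v_proj": 0.7}))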
+
+
+
+
+ + ☆ Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep + Learning Track + + +
+ Large-scale text retrieval technology has been widely used in various +practical business scenarios. This paper presents our systems for the TREC 2022 +Deep Learning Track. We explain the hybrid text retrieval and multi-stage text +ranking method adopted in our solution. The retrieval stage combined the two +structures of traditional sparse retrieval and neural dense retrieval. In the +ranking stage, in addition to the full interaction-based ranking model built on +a large pre-trained language model, we also propose a lightweight sub-ranking +module to further enhance the final text ranking performance. Evaluation +results demonstrate the effectiveness of our proposed approach. Our models +achieve the 1st and 4th ranks on the passage ranking and document ranking test +sets, respectively. + 
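+ The retrieval-stage combination can be illustrated with a simple score-fusion sketch: normalize the sparse (e.g., BM25) and dense scores and rank documents by a weighted sum. The min-max normalization and the weight alpha are assumptions for illustration; the actual submission may fuse the two retrievers differently.
+
+# Sketch (Python): weighted fusion of sparse and dense retrieval scores.
+def minmax(scores):
+    lo, hi = min(scores.values()), max(scores.values())
+    span = (hi - lo) or 1.0
+    return {d: (s - lo) / span for d, s in scores.items()}
+
+def hybrid_fuse(sparse_scores, dense_scores, alpha=0.5):
+    sparse_n, dense_n = minmax(sparse_scores), minmax(dense_scores)
+    docs = set(sparse_scores) | set(dense_scores)
+    return sorted(
+        docs,
+        key=lambda d: alpha * sparse_n.get(d, 0.0) + (1 - alpha) * dense_n.get(d, 0.0),
+        reverse=True,
+    )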
+
+ comment: TREC 2022 Deep Learning Track +
+
+
+
+
+ + ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently there has been a significant surge in multimodal learning in terms +of both image-to-text and text-to-image generation. However, the success is +typically limited to English, leaving other languages largely behind. Building +a competitive counterpart in other languages is highly challenging due to the +low-resource nature of non-English multimodal data (i.e., lack of large-scale, +high-quality image-text data). In this work, we propose MPM, an effective +training paradigm for training large multimodal models in low-resource +languages. MPM demonstrates that Multilingual language models can Pivot +zero-shot Multimodal learning across languages. Specifically, based on a strong +multilingual large language model, multimodal models pretrained on English-only +image-text data can well generalize to other languages in a zero-shot manner +for both image-to-text and text-to-image generation, even surpassing models +trained on image-text data in native languages. Taking Chinese as a practice of +MPM, we build large multimodal models VisCPM in image-to-text and text-to-image +generation, which achieve state-of-the-art (open-source) performance in +Chinese. To facilitate future research, we open-source codes and model weights +at https://github.com/OpenBMB/VisCPM.git. + +
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ☆ PREFER: Prompt Ensemble Learning via Feedback-Reflect-Refine + + +
+ As an effective tool for eliciting the power of Large Language Models (LLMs), +prompting has recently demonstrated unprecedented abilities across a variety of +complex tasks. To further improve the performance, prompt ensemble has +attracted substantial interest for tackling the hallucination and instability +of LLMs. However, existing methods usually adopt a two-stage paradigm, which +requires a pre-prepared set of prompts with substantial manual effort, and is +unable to perform directed optimization for different weak learners. In this +paper, we propose a simple, universal, and automatic method named PREFER (Prompt +Ensemble learning via Feedback-Reflect-Refine) to address the stated +limitations. Specifically, given the fact that weak learners are supposed to +focus on hard examples during boosting, PREFER builds a feedback mechanism for +reflecting on the inadequacies of existing weak learners. Based on this, the +LLM is required to automatically synthesize new prompts for iterative +refinement. Moreover, to enhance the stability of prompt-effect evaluation, we +propose a novel prompt bagging method involving forward and backward thinking, +which is superior to majority voting and is beneficial for both feedback and +weight calculation in boosting. Extensive experiments demonstrate that our +PREFER achieves state-of-the-art performance in multiple types of tasks by a +significant margin. We have made our code publicly available. + 
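+ A rough sketch of the feedback-reflect-refine loop: evaluate the current prompt, ask the LLM to reflect on its failures, and ask it again to synthesize a refined prompt that joins the ensemble. The query_llm and failures_of helpers and the meta-prompts are placeholders rather than PREFER's actual templates.
+
+# Sketch (Python): boosting-style prompt refinement loop.
+def query_llm(prompt: str) -> str:
+    raise NotImplementedError  # placeholder LLM call
+
+def failures_of(prompt, examples):
+    raise NotImplementedError  # placeholder: examples the prompt gets wrong
+
+def feedback_refine(seed_prompt, examples, rounds=3):
+    ensemble, current = [seed_prompt], seed_prompt
+    for _ in range(rounds):
+        failed = failures_of(current, examples)
+        if not failed:
+            break
+        reflection = query_llm(
+            f"Prompt: {current}\nIt fails on: {failed}\nBriefly explain its weaknesses."
+        )
+        current = query_llm(
+            f"Weaknesses: {reflection}\nWrite an improved prompt that fixes them."
+        )
+        ensemble.append(current)
+    return ensemble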
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ From Quantity to Quality: Boosting LLM Performance with Self-Guided Data + Selection for Instruction Tuning + + +
+ In the realm of Large Language Models, the balance between instruction data +quality and quantity has become a focal point. Recognizing this, we introduce a +self-guided methodology for LLMs to autonomously discern and select cherry +samples from vast open-source datasets, effectively minimizing manual curation +and potential cost for instruction tuning an LLM. Our key innovation, the +Instruction-Following Difficulty (IFD) metric, emerges as a pivotal tool to +identify discrepancies between a model's expected responses and its autonomous +generation prowess. Through the adept application of IFD, cherry samples are +pinpointed, leading to a marked uptick in model training efficiency. Empirical +validations on renowned datasets like Alpaca and WizardLM underpin our +findings; with a mere 10% of conventional data input, our strategy showcases +improved results. This synthesis of self-guided cherry-picking and the IFD +metric signifies a transformative leap in the optimization of LLMs, promising +both efficiency and resource-conscious advancements. + +
+
+
+
+
+ + ☆ Prompt-Based Length Controlled Generation with Reinforcement Learning + + +
+ Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted +great attention given their surprising improvement and performance. Length-controlled +generation of LLMs emerges as an important topic, which also enables +users to fully leverage the capability of LLMs in more real-world scenarios +like generating a proper answer or essay of a desired length. In addition, the +autoregressive generation in LLMs is extremely time-consuming, while the +ability to control the generated length can arbitrarily reduce the +inference cost by limiting the length, and thus satisfy different needs. +Therefore, we aim to propose a prompt-based length control method to achieve +this length-controlled generation, which can also be widely applied in +GPT-style LLMs. In particular, we adopt reinforcement learning with a reward +signal given by either a trainable or a rule-based reward model, which further +affects the generation of LLMs via rewarding a pre-defined target length. +Experiments show that our method significantly improves the accuracy of +prompt-based length control for the summarization task on popular datasets like +CNNDM and NYT. We believe this length-controllable ability can provide more +potential in the era of LLMs. + 
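+ As a concrete example of what a rule-based reward for length control could look like, the sketch below gives the highest reward when the generated length matches the target requested in the prompt and decays linearly with the deviation. The exact reward shape used in the paper is not reproduced; this is just one simple choice.
+
+# Sketch (Python): rule-based length reward for RL fine-tuning.
+def length_reward(generated_tokens: int, target_tokens: int, scale: float = 50.0) -> float:
+    deviation = abs(generated_tokens - target_tokens)
+    return max(0.0, 1.0 - deviation / scale)
+
+# Example: a 110-token summary against a 100-token target scores 0.8.
+print(length_reward(110, 100))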
+
+
+
+
+ + ☆ Knowledge-injected Prompt Learning for Chinese Biomedical Entity + Normalization + + +
+ The Biomedical Entity Normalization (BEN) task aims to align raw, +unstructured medical entities to standard entities, thus promoting data +coherence and facilitating better downstream medical applications. Recently, +prompt learning methods have shown promising results in this task. However, +existing research falls short in tackling the more complex Chinese BEN task, +especially in the few-shot scenario with limited medical data, and the vast +potential of the external medical knowledge base has yet to be fully harnessed. +To address these challenges, we propose a novel Knowledge-injected Prompt +Learning (PL-Knowledge) method. Specifically, our approach consists of five +stages: candidate entity matching, knowledge extraction, knowledge encoding, +knowledge injection, and prediction output. By effectively encoding the +knowledge items contained in medical entities and incorporating them into our +tailor-made knowledge-injected templates, the additional knowledge enhances the +model's ability to capture latent relationships between medical entities, thus +achieving a better match with the standard entities. We extensively evaluate +our model on a benchmark dataset in both few-shot and full-scale scenarios. Our +method outperforms existing baselines, with an average accuracy boost of +12.96\% in few-shot and 0.94\% in full-data cases, showcasing its excellence in +the BEN task. + +
+
+
+
+
+ + ☆ Reranking Passages with Coarse-to-Fine Neural Retriever using + List-Context Information + + +
+ Passage reranking is a crucial task in many applications, particularly when +dealing with large-scale documents. Traditional neural architectures are +limited in retrieving the best passage for a question because they usually +match the question to each passage separately, seldom considering contextual +information in other passages that can provide comparison and reference +information. This paper presents a list-context attention mechanism to augment +the passage representation by incorporating the list-context information from +other candidates. The proposed coarse-to-fine (C2F) neural retriever addresses +the out-of-memory limitation of the passage attention mechanism by dividing the +list-context modeling process into two sub-processes, allowing for efficient +encoding of context information from a large number of candidate answers. This +method can be generally used to encode context information from any number of +candidate answers in one pass. Different from most multi-stage information +retrieval architectures, this model integrates the coarse and fine rankers into +the joint optimization process, allowing for feedback between the two layers to +update the model simultaneously. Experiments demonstrate the effectiveness of +the proposed approach. + +
+
+
+
+
+ + ☆ From Instructions to Intrinsic Human Values -- A Survey of Alignment + Goals for Big Models + + +
+ Big models, exemplified by Large Language Models (LLMs), are models typically +pre-trained on massive data and comprised of enormous parameters, which not +only obtain significantly improved performance across diverse tasks but also +present emergent capabilities absent in smaller models. However, the growing +intertwining of big models with everyday human lives poses potential risks and +might cause serious social harm. Therefore, many efforts have been made to +align LLMs with humans to make them better follow user instructions and satisfy +human preferences. Nevertheless, `what to align with' has not been fully +discussed, and inappropriate alignment goals might even backfire. In this +paper, we conduct a comprehensive survey of different alignment goals in +existing work and trace their evolution paths to help identify the most +essential goal. Particularly, we investigate related works from two +perspectives: the definition of alignment goals and alignment evaluation. Our +analysis encompasses three distinct levels of alignment goals and reveals a +goal transformation from fundamental abilities to value orientation, indicating +the potential of intrinsic human values as the alignment goal for enhanced +LLMs. Based on such results, we further discuss the challenges of achieving +such intrinsic value alignment and provide a collection of available resources +for future research on the alignment of big models. + +
+
+ comment: 20 pages, 5 figures +
+
+
+
+
+ + ☆ Graecia capta ferum victorem cepit. Detecting Latin Allusions to Ancient + Greek Literature + + +
+ Intertextual allusions hold a pivotal role in Classical Philology, with Latin +authors frequently referencing Ancient Greek texts. Until now, the automatic +identification of these intertextual references has been constrained to +monolingual approaches, seeking parallels solely within Latin or Greek texts. +In this study, we introduce SPhilBERTa, a trilingual Sentence-RoBERTa model +tailored for Classical Philology, which excels at cross-lingual semantic +comprehension and identification of identical sentences across Ancient Greek, +Latin, and English. We generate new training data by automatically translating +English texts into Ancient Greek. Further, we present a case study, +demonstrating SPhilBERTa's capability to facilitate automated detection of +intertextual parallels. Our models and resources are available at +https://github.com/Heidelberg-NLP/ancient-language-models. + +
+
+ comment: Paper accepted for publication at the First Workshop on Ancient + Language Processing (ALP) 2023; 9 pages, 5 tables +
+
+
+
+
+ + ☆ Topical-Chat: Towards Knowledge-Grounded Open-Domain Conversations INTERSPEECH 2019 + + +
+ Building socialbots that can have deep, engaging open-domain conversations +with humans is one of the grand challenges of artificial intelligence (AI). To +this end, bots need to be able to leverage world knowledge spanning several +domains effectively when conversing with humans who have their own world +knowledge. Existing knowledge-grounded conversation datasets are primarily +stylized with explicit roles for conversation partners. These datasets also do +not explore depth or breadth of topical coverage with transitions in +conversations. We introduce Topical-Chat, a knowledge-grounded human-human +conversation dataset where the underlying knowledge spans 8 broad topics and +conversation partners don't have explicitly defined roles, to help further +research in open-domain conversational AI. We also train several +state-of-the-art encoder-decoder conversational models on Topical-Chat and +perform automated and human evaluation for benchmarking. + +
+
+ comment: arXiving an old paper accepted at INTERSPEECH 2019 +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ Audio Generation with Multiple Conditional Diffusion Model AAAI 2024 + + +
+ Text-based audio generation models have limitations as they cannot encompass +all the information in audio, leading to restricted controllability when +relying solely on text. To address this issue, we propose a novel model that +enhances the controllability of existing pre-trained text-to-audio models by +incorporating additional conditions including content (timestamp) and style +(pitch contour and energy contour) as supplements to the text. This approach +achieves fine-grained control over the temporal order, pitch, and energy of +generated audio. To preserve the diversity of generation, we employ a trainable +control condition encoder that is enhanced by a large language model and a +trainable Fusion-Net to encode and fuse the additional conditions while keeping +the weights of the pre-trained text-to-audio model frozen. Due to the lack of +suitable datasets and evaluation metrics, we consolidate existing datasets into +a new dataset comprising the audio and corresponding conditions and use a +series of evaluation metrics to evaluate the controllability performance. +Experimental results demonstrate that our model successfully achieves +fine-grained control to accomplish controllable audio generation. Audio samples +and our dataset are publicly available at +https://conditionaudiogen.github.io/conditionaudiogen/ + +
+
+ comment: Submitted to AAAI 2024 +
+
+
+
+
+ + ☆ Audio Difference Captioning Utilizing Similarity-Discrepancy + Disentanglement + + +
+ We proposed Audio Difference Captioning (ADC) as a new extension task of +audio captioning for describing the semantic differences between input pairs of +similar but slightly different audio clips. The ADC solves the problem that +conventional audio captioning sometimes generates similar captions for similar +audio clips, failing to describe the difference in content. We also propose a +cross-attention-concentrated transformer encoder to extract differences by +comparing a pair of audio clips and a similarity-discrepancy disentanglement to +emphasize the difference in the latent space. To evaluate the proposed methods, +we built an AudioDiffCaps dataset consisting of pairs of similar but slightly +different audio clips with human-annotated descriptions of their differences. +The experiment with the AudioDiffCaps dataset showed that the proposed methods +solve the ADC task effectively and improve the attention weights to extract the +difference by visualizing them in the transformer encoder. + +
+
+ comment: Accepted to DCASE2023 Workshop +
+
+
+
+
+ + ☆ Bridging the Gap: Deciphering Tabular Data Using Large Language Model + + +
+ In the realm of natural language processing, the understanding of tabular +data has perpetually stood as a focal point of scholarly inquiry. The emergence +of expansive language models, exemplified by the likes of ChatGPT, has ushered +in a wave of endeavors wherein researchers aim to harness these models for +tasks related to table-based question answering. Central to our investigative +pursuits is the elucidation of methodologies that amplify the aptitude of such +large language models in discerning both the structural intricacies and +inherent content of tables, ultimately facilitating their capacity to provide +informed responses to pertinent queries. To this end, we have architected a +distinctive module dedicated to the serialization of tables for seamless +integration with expansive language models. Additionally, we've instituted a +corrective mechanism within the model to rectify potential inaccuracies. +Experimental results indicate that, although our proposed method trails the +SOTA by approximately 11.7% in overall metrics, it surpasses the SOTA by about +1.2% in tests on specific datasets. This research marks the first application +of large language models to table-based question answering tasks, enhancing the +model's comprehension of both table structures and content. + +
+
+
+
+
+ + ☆ Cabrita: closing the gap for foreign languages + + +
+ The strategy of training the model from scratch in a specific language or +domain serves two essential purposes: i) enhancing performance in the +particular linguistic or domain context, and ii) ensuring effective +tokenization. The main limitation inherent to this approach lies in the +associated cost, which can reach six to seven-digit dollar values, depending on +the model size and the number of parameters involved. + The main solution to overcome the cost challenge is to rely on available +pre-trained models, which, despite recent advancements such as the LLaMA and +LLaMA-2 models, still demonstrate inefficiency for certain specific domain +problems or prove ineffective in scenarios involving conversational memory +resources, given the large number of tokens required to represent text. + To overcome this issue, we present a methodology named Cabrita, which, as our +research demonstrates, successfully addresses the performance and efficient +tokenization problem, all at an affordable cost. We believe that this +methodology can be applied to any transformer-like architecture model. To +validate the study, we conducted continuous pre-training exclusively using +Portuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in +a model named openCabrita 3B. The openCabrita 3B also features a new tokenizer +that results in a significant reduction in the number of tokens required to +represent the text. In our assessment, for few-shot learning tasks, we achieved +similar results with this 3B model compared to a traditional continuous +pre-training approach as well as to 7B English pre-trained models. + 
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Tryage: Real-time, intelligent Routing of User Prompts to Large Language + Models + + +
+ The introduction of the transformer architecture and the self-attention +mechanism has led to an explosive production of language models trained on +specific downstream tasks and data domains. With over 200,000 models in the +Hugging Face ecosystem, users grapple with selecting and optimizing models to +suit multifaceted workflows and data domains while addressing computational, +security, and recency concerns. There is an urgent need for machine learning +frameworks that can eliminate the burden of model selection and customization +and unleash the incredible power of the vast emerging model library for end +users. Here, we propose a context-aware routing system, Tryage, that leverages +a language model router for optimal selection of expert models from a model +library based on analysis of individual input prompts. Inspired by the thalamic +router in the brain, Tryage employs a perceptive router to predict down-stream +model performance on prompts and, then, makes a routing decision using an +objective function that integrates performance predictions with user goals and +constraints that are incorporated through flags (e.g., model size, model +recency). Tryage allows users to explore a Pareto front and automatically +trade-off between task accuracy and secondary goals including minimization of +model size, recency, security, verbosity, and readability. Across heterogeneous +data sets that include code, text, clinical data, and patents, the Tryage +framework surpasses Gorilla and GPT3.5 turbo in dynamic model selection, +identifying the optimal model with an accuracy of 50.9%, compared to 23.6% by +GPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how +routing models can be applied to program and control the behavior of +multi-model LLM systems to maximize efficient use of the expanding and evolving +language model ecosystem. + 
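+ The routing decision can be pictured as maximizing a simple objective that trades predicted performance against user flags. The penalty terms and weights below are illustrative assumptions rather than Tryage's exact objective function.
+
+# Sketch (Python): pick the expert model maximizing performance minus penalties.
+def route(prompt, models, predict_perf, weights):
+    def objective(m):
+        score = predict_perf(prompt, m["name"])  # router's performance prediction
+        score -= weights.get("size", 0.0) * m["size_gb"]
+        score -= weights.get("staleness", 0.0) * m["months_since_release"]
+        return score
+    return max(models, key=objective)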
+
+
+
+
+ + ♻ ☆ How Good Are Large Language Models at Out-of-Distribution Detection? + + +
+ Out-of-distribution (OOD) detection plays a vital role in enhancing the +reliability of machine learning (ML) models. The emergence of large language +models (LLMs) has catalyzed a paradigm shift within the ML community, +showcasing their exceptional capabilities across diverse natural language +processing tasks. While existing research has probed OOD detection with +relatively small-scale Transformers like BERT, RoBERTa and GPT-2, the stark +differences in scales, pre-training objectives, and inference paradigms call +into question the applicability of these findings to LLMs. This paper embarks +on a pioneering empirical investigation of OOD detection in the domain of LLMs, +focusing on the LLaMA series ranging from 7B to 65B in size. We thoroughly evaluate +commonly-used OOD detectors, scrutinizing their performance in both zero-grad +and fine-tuning scenarios. Notably, we alter previous discriminative +in-distribution fine-tuning into generative fine-tuning, aligning the +pre-training objective of LLMs with downstream tasks. Our findings unveil that +a simple cosine distance OOD detector demonstrates superior efficacy, +outperforming other OOD detectors. We provide an intriguing explanation for +this phenomenon by highlighting the isotropic nature of the embedding spaces of +LLMs, which distinctly contrasts with the anisotropic property observed in +smaller BERT family models. The new insight enhances our understanding of how +LLMs detect OOD data, thereby enhancing their adaptability and reliability in +dynamic environments. + 
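+ The detector the paper finds most effective is easy to sketch: embed the test input, compare it with in-distribution embeddings, and use one minus the maximum cosine similarity as the OOD score. How the embedding is pooled from the LLM is left abstract here.
+
+# Sketch (Python/NumPy): cosine-distance OOD score (higher = more likely OOD).
+import numpy as np
+
+def cosine_ood_score(test_emb, train_embs):
+    test = test_emb / np.linalg.norm(test_emb)
+    train = train_embs / np.linalg.norm(train_embs, axis=1, keepdims=True)
+    return 1.0 - float(np.max(train @ test))
+
+# Inputs scoring above an empirically chosen threshold are flagged as OOD.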
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Low-Resource Authorship Style Transfer: Can Non-Famous Authors Be + Imitated? + + +
+ Authorship style transfer involves altering text to match the style of a +target author whilst preserving the original meaning. Existing unsupervised +approaches like STRAP have largely focused on style transfer to target authors +with many examples of their writing style in books, speeches, or other +published works. This high-resource training data requirement (often greater +than 100,000 words) makes these approaches primarily useful for style transfer +to published authors, politicians, or other well-known figures and authorship +styles, while style transfer to non-famous authors has not been well-studied. +We introduce the \textit{low-resource authorship style transfer} task, a more +challenging class of authorship style transfer where only a limited amount of +text in the target author's style may exist. In our experiments, we +specifically choose source and target authors from Reddit and style transfer +their Reddit posts, limiting ourselves to just 16 posts (on average ~500 words) +of the target author's style. Style transfer accuracy is typically measured by +how often a classifier or human judge will classify an output as written by the +target author. Recent authorship representations models excel at authorship +identification even with just a few writing samples, making automatic +evaluation of this task possible for the first time through evaluation metrics +we propose. Our results establish an in-context learning technique we develop +as the strongest baseline, though we find current approaches do not yet achieve +mastery of this challenging task. We release our data and implementations to +encourage further investigation. + +
+
+
+
+
+ + ♻ ☆ Large Language Model as a User Simulator + + +
+ The unparalleled performance of closed-sourced ChatGPT has sparked efforts +towards its democratization, with notable strides made by leveraging real user +and ChatGPT conversations, as evidenced by Vicuna. However, while current +endeavors like Baize and UltraChat aim to auto-generate conversational data due +to challenges in gathering human participation, they primarily rely on ChatGPT +to simulate human behaviors based on directives rather than genuine human +learning. This results in a limited scope, diminished diversity, and an absence +of genuine multi-round conversational dynamics. To address the above issues, we +innovatively target human questions extracted from genuine human-machine +conversations as a learning goal and train a user simulator, UserGPT, to +produce a high-quality human-centric synthetic conversation dataset, RealChat. +Subsequently, this dataset trains our assistant model, ReaLM. Experimentally, +ReaLM outpaces baseline models in both Vicuna-Bench and MT-Bench by pairwise +comparison when considering equivalent training set sizes, and manual +evaluation also shows that our model is highly competitive. Impressively, when +fine-tuned with the latest LLaMA 2 model, ReaLM secured a leading score of 6.33 +in the MT-Bench, outshining the contemporary same-scale models, including the +LLaMA-2-7B-chat model. Further in-depth analysis demonstrates the scalability +and transferability of our approach. A preliminary exploration into the +interplay between training set data quality and resultant model performance is +also undertaken, laying a robust groundwork for future investigations. The code +is available at https://github.com/FreedomIntelligence/ReaLM. + +
+
+
+
+
+ + ♻ ☆ Domain Specific Question Answering Over Knowledge Graphs Using Logical + Programming and Large Language Models + + +
+ Answering questions over domain-specific graphs requires a tailored approach +due to the limited number of relations and the specific nature of the domain. +Our approach integrates classic logical programming languages into large +language models (LLMs), enabling the utilization of logical reasoning +capabilities to tackle the KGQA task. By representing the questions as Prolog +queries, which are readable and close to natural language in +representation, we facilitate the generation of programmatically derived +answers. To validate the effectiveness of our approach, we evaluate it using a +well-known benchmark dataset, MetaQA. Our experimental results demonstrate that +our method achieves accurate identification of correct answer entities for all +test questions, even when trained on a small fraction of annotated data. +Overall, our work presents a promising approach to addressing question +answering over domain-specific graphs, offering an explainable and robust +solution by incorporating logical programming languages. + 
+
+
+
+
+ + ♻ ☆ Exploring the Landscape of Natural Language Processing Research + + +
+ As an efficient approach to understand, generate, and process natural +language texts, research in natural language processing (NLP) has exhibited a +rapid spread and wide adoption in recent years. Given the increasing research +work in this area, several NLP-related approaches have been surveyed in the +research community. However, a comprehensive study that categorizes established +topics, identifies trends, and outlines areas for future research remains +absent. Contributing to closing this gap, we have systematically classified and +analyzed research papers in the ACL Anthology. As a result, we present a +structured overview of the research landscape, provide a taxonomy of fields of +study in NLP, analyze recent developments in NLP, summarize our findings, and +highlight directions for future work. + +
+
+ comment: Extended version of the paper published at the 14th International + Conference on Recent Advances in Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Comparison of Machine Learning Methods for Assigning Software Issues to + Team Members + + +
+ Software issues contain units of work to fix, improve, or create new threads +during the development and facilitate communication among the team members. +Assigning an issue to the most relevant team member and determining a category +of an issue is a tedious and challenging task. Wrong classifications cause +delays and rework in the project and trouble among the team members. This paper +proposes a set of carefully curated linguistic features for shallow machine +learning methods and compares the performance of shallow and ensemble methods +with deep language models. Unlike the state-of-the-art, we assign issues to +four roles (designer, developer, tester, and leader) rather than to specific +individuals or teams to contribute to the generality of our solution. We also +consider the level of experience of the developers to reflect the industrial +practices in our solution formulation. We collect and annotate five industrial +data sets from one of the top three global television producers to evaluate our +proposal and compare it with deep language models. Our data sets contain 5324 +issues in total. We show that an ensemble classifier of shallow techniques +achieves 0.92 for issue assignment in accuracy which is statistically +comparable to the state-of-the-art deep language models. The contributions +include the public sharing of five annotated industrial issue data sets, the +development of a clear and comprehensive feature set, the introduction of a +novel label set, and the validation of the efficacy of an ensemble classifier +of shallow machine learning techniques. + +
+
+
+
+
+ + ♻ ☆ BAN-PL: a Novel Polish Dataset of Banned Harmful and Offensive Content + from Wykop.pl web service + + +
+ Advances in automated detection of offensive language online, including hate +speech and cyberbullying, require improved access to publicly available +datasets comprising social media content. In this paper, we introduce BAN-PL, +the first open dataset in the Polish language that encompasses texts flagged as +harmful and subsequently removed by professional moderators. The dataset +encompasses a total of 691,662 pieces of content from a popular social +networking service, Wykop, often referred to as the "Polish Reddit", including +both posts and comments, and is evenly distributed into two distinct classes: +"harmful" and "neutral". We provide a comprehensive description of the data +collection and preprocessing procedures, as well as highlight the linguistic +specificity of the data. The BAN-PL dataset, along with advanced preprocessing +scripts for, i.a., unmasking profanities, will be publicly available. + +
+
+
+
+
+ + ♻ ☆ SONAR: Sentence-Level Multimodal and Language-Agnostic Representations + + +
+ We introduce SONAR, a new multilingual and multimodal fixed-size sentence +embedding space. Our single text encoder, covering 200 languages, substantially +outperforms existing sentence embeddings such as LASER3 and LabSE on the xsim +and xsim++ multilingual similarity search tasks. Speech segments can be +embedded in the same SONAR embedding space using language-specific speech +encoders trained in a teacher-student setting on speech transcription data. Our +encoders outperform existing speech encoders on similarity search tasks. We +also provide a text decoder for 200 languages, which allows us to perform +text-to-text and speech-to-text machine translation, including for zero-shot +language and modality combinations. Our text-to-text results are competitive +compared to the state-of-the-art NLLB~1B model, despite the fixed-size +bottleneck representation. Our zero-shot speech-to-text translation results +compare favorably with strong supervised baselines such as Whisper. + +
+
+
+
+
+ + ♻ ☆ Forward-Backward Reasoning in Large Language Models for Verification + + +
+ Chain-of-Thought (CoT) prompting has shown promising performance in various +reasoning tasks. Recently, Self-Consistency (Wang et al., 2023) +proposes to sample a diverse set of reasoning chains which may lead to +different answers while the answer that receives the most votes is selected. In +this paper, we propose a novel method to use backward reasoning in verifying +candidate answers. We mask a token in the question by x and ask the LLM +to predict the masked token when a candidate answer is provided by a +simple template, i.e., "If we know the answer of the above +question is {a candidate answer}, what is the value of unknown variable +x?" Intuitively, the LLM is expected to predict the masked token +successfully if the provided candidate answer is correct. We further propose +FOBAR to combine forward and backward reasoning for estimating the probability +of candidate answers. We conduct extensive experiments on six data sets and +three LLMs. Experimental results demonstrate that FOBAR achieves +state-of-the-art performance on various reasoning benchmarks. + 
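+ The backward check and the forward-backward combination can be sketched as follows; the query_llm helper, the string-match check, and the mixing weight are illustrative assumptions, not FOBAR's exact implementation.
+
+# Sketch (Python): backward verification of a candidate answer plus scoring.
+def query_llm(prompt: str) -> str:
+    raise NotImplementedError  # placeholder LLM call
+
+def backward_check(question_with_x, masked_value, candidate):
+    prompt = (
+        f"{question_with_x}\n"
+        f"If we know the answer of the above question is {candidate}, "
+        "what is the value of unknown variable x?"
+    )
+    return masked_value in query_llm(prompt)
+
+def combined_score(candidate, forward_votes, backward_hits, backward_trials, w=0.5):
+    forward = forward_votes[candidate] / max(1, sum(forward_votes.values()))
+    backward = backward_hits / max(1, backward_trials)
+    return w * forward + (1 - w) * backward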
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Self-consistency for open-ended generations + + +
+ Large Language Models (LLMs) can exhibit considerable variation in the +quality of their sampled outputs. Reranking and selecting the best generation +from the sampled set is a popular way of obtaining strong gains in generation +quality. In this paper, we present a novel approach for reranking LLM +generations. Unlike other techniques that might involve additional inferences +or training a specialized reranker, our approach relies on easy to compute +pairwise statistics between the generations that have minimal compute overhead. +We show that our approach can be formalized as an extension of self-consistency +and analyze its performance in that framework, theoretically as well as via +simulations. We show strong improvements for selecting the best $k$ generations +for code generation tasks as well as robust improvements for best generation +for the tasks of autoformalization, and summarization. While our approach only +assumes black-box access to LLMs, we show that additional access to token +probabilities can improve performance even further. + +
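+ A minimal sketch of the reranking idea: score each sampled generation by its average similarity to the other samples and return the highest-scoring one. Unigram overlap is used below purely as a cheap stand-in for the paper's pairwise statistics.
+
+# Sketch (Python): pick the generation most consistent with the others.
+def unigram_overlap(a: str, b: str) -> float:
+    ta, tb = set(a.lower().split()), set(b.lower().split())
+    return len(ta & tb) / max(1, len(ta | tb))
+
+def rerank_by_consistency(generations):
+    def mean_sim(g):
+        others = [o for o in generations if o is not g]
+        return sum(unigram_overlap(g, o) for o in others) / max(1, len(others))
+    return max(generations, key=mean_sim)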
+
+
+
+
+ + ♻ ☆ Making first order linear logic a generating grammar + + +
+ It is known that different categorial grammars have surface representation in +a fragment of first order multiplicative linear logic (MLL1). We show that the +fragment of interest is equivalent to the recently introduced extended tensor +type calculus (ETTC). ETTC is a calculus of specific typed terms, which +represent tuples of strings, more precisely bipartite graphs decorated with +strings. Types are derived from linear logic formulas, and rules correspond to +concrete operations on these string-labeled graphs, so that they can be +conveniently visualized. This provides the above mentioned fragment of MLL1 +that is relevant for language modeling not only with some alternative syntax +and intuitive geometric representation, but also with an intrinsic deductive +system, which has been absent. + In this work we consider a non-trivial notationally enriched variation of the +previously introduced ETTC, which allows more concise and transparent +computations. We present both a cut-free sequent calculus and a natural +deduction formalism. + 
+
+ comment: Revised and extended version with detailed proofs. arXiv admin note: + substantial text overlap with arXiv:2112.15253 +
+
+
+
+
+ + ♻ ☆ A Structured Span Selector NAACL 2022 + + +
+ Many natural language processing tasks, e.g., coreference resolution and +semantic role labeling, require selecting text spans and making decisions about +them. A typical approach to such tasks is to score all possible spans and +greedily select spans for task-specific downstream processing. This approach, +however, does not incorporate any inductive bias about what sort of spans ought +to be selected, e.g., that selected spans tend to be syntactic constituents. In +this paper, we propose a novel grammar-based structured span selection model +which learns to make use of the partial span-level annotation provided for such +problems. Compared to previous approaches, our approach gets rid of the +heuristic greedy span selection scheme, allowing us to model the downstream +task on an optimal set of spans. We evaluate our model on two popular span +prediction tasks: coreference resolution and semantic role labeling. We show +empirical improvements on both. + +
+
+ comment: NAACL 2022 camera-ready +
+
+
+
+
+ + ♻ ☆ Chain-of-Thought Prompt Distillation for Multimodal Named Entity + Recognition and Multimodal Relation Extraction + + +
+ Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction +(MRE) necessitate the fundamental reasoning capacity for intricate linguistic +and multimodal comprehension. In this study, we explore distilling the +reasoning ability of large language models (LLMs) into a more compact student +model by generating a \textit{chain of thought} (CoT) -- a sequence of +intermediate reasoning steps. Specifically, we commence by exemplifying the +elicitation of such reasoning ability from LLMs through CoT prompts covering +multi-grain (noun, sentence, multimodality) and data-augmentation (style, +entity, image) dimensions. Subsequently, we present a novel conditional prompt +distillation method to assimilate the commonsense reasoning ability from LLMs, +thereby enhancing the utility of the student model in addressing text-only +inputs without the requisite addition of image and CoT knowledge. Extensive +experiments reveal that our approach attains state-of-the-art accuracy and +manifests a plethora of advantages concerning interpretability, data +efficiency, and cross-domain generalization on MNER and MRE datasets. + +
+
+ comment: modification +
+
+
+
+
+ + ♻ ☆ MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain + Conversation + + +
+ We propose MemoChat, a pipeline for refining instructions that enables large +language models (LLMs) to effectively employ self-composed memos for +maintaining consistent long-range open-domain conversations. We demonstrate a +long-range open-domain conversation through iterative +"memorization-retrieval-response" cycles. This requires us to carefully design +tailored tuning instructions for each distinct stage. The instructions are +reconstructed from a collection of public datasets to teach the LLMs to +memorize and retrieve past dialogues with structured memos, leading to enhanced +consistency when participating in future conversations. We invite experts to +manually annotate a test set designed to evaluate the consistency of long-range +conversations questions. Experiments on three testing scenarios involving both +open-source and API-accessible chatbots at scale verify the efficacy of +MemoChat, which outperforms strong baselines. Our codes, data and models are +available here: https://github.com/LuJunru/MemoChat. + +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge +generative models and have sparked a revolutionary impact on various aspects of +human life. However, the practical implementation of these models has also +exposed inherent risks, highlighting their dual nature and raising concerns +regarding their trustworthiness. Despite the abundance of literature on this +subject, a comprehensive survey specifically delving into the intersection of +large-scale generative models and their trustworthiness remains largely absent. +To bridge this gap, this paper investigates both the long-standing and emerging +threats associated with these models across four fundamental dimensions: +privacy, security, fairness, and responsibility. In this way, we construct an +extensive map outlining the trustworthiness of these models, while also +providing practical recommendations and identifying future directions. These +efforts are crucial for promoting the trustworthy deployment of these models, +ultimately benefiting society as a whole. + 
+
+ comment: Draft Version +
+
+
+
+
+ + ♻ ☆ NLP as a Lens for Causal Analysis and Perception Mining to Infer Mental + Health on Social Media + + +
+ Interactions among humans on social media often convey intentions behind +their actions, yielding a psychological language resource for Mental Health +Analysis (MHA) of online users. The success of Computational Intelligence +Techniques (CIT) for inferring mental illness from such social media resources +points to NLP as a lens for causal analysis and perception mining. However, we +argue that more consequential and explainable research is required for optimal +impact on clinical psychology practice and personalized mental healthcare. To +bridge this gap, we posit two significant dimensions: (1) Causal analysis to +illustrate a cause and effect relationship in the user generated text; (2) +Perception mining to infer psychological perspectives of social effects on +online users intentions. Within the scope of Natural Language Processing (NLP), +we further explore critical areas of inquiry associated with these two +dimensions, specifically through recent advancements in discourse analysis. +This position paper guides the community to explore solutions in this space and +advance the state of practice in developing conversational agents for inferring +mental health from social media. We advocate for a more explainable approach +toward modeling computational psychology problems through the lens of language +as we observe an increased number of research contributions in dataset and +problem formulation for causal relation extraction and perception enhancements +while inferring mental states. + +
+
+
+
+
+ + ♻ ☆ A Human-on-the-Loop Optimization Autoformalism Approach for + Sustainability + + +
+ This paper outlines a natural conversational approach to solving personalized +energy-related problems using large language models (LLMs). We focus on +customizable optimization problems that necessitate repeated solving with +slight variations in modeling and are user-specific, hence posing a challenge +to devising a one-size-fits-all model. We put forward a strategy that augments +an LLM with an optimization solver, enhancing its proficiency in understanding +and responding to user specifications and preferences while providing nonlinear +reasoning capabilities. Our approach pioneers the novel concept of human-guided +optimization autoformalism, translating a natural language task specification +automatically into an optimization instance. This enables LLMs to analyze, +explain, and tackle a variety of instance-specific energy-related problems, +pushing beyond the limits of current prompt-based techniques. + Our research encompasses various commonplace tasks in the energy sector, from +electric vehicle charging and Heating, Ventilation, and Air Conditioning (HVAC) +control to long-term planning problems such as cost-benefit evaluations for +installing rooftop solar photovoltaics (PVs) or heat pumps. This pilot study +marks an essential stride towards the context-based formulation of optimization +using LLMs, with the potential to democratize optimization processes. As a +result, stakeholders are empowered to optimize their energy consumption, +promoting sustainable energy practices customized to personal needs and +preferences. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 117 + +
+
+
+ + ☆ CHORUS: Learning Canonicalized 3D Human-Object Spatial Relations from + Unbounded Synthesized Images ICCV 2023 + + +
+ We present a method for teaching machines to understand and model the +underlying spatial common sense of diverse human-object interactions in 3D in a +self-supervised way. This is a challenging task, as there exist specific +manifolds of the interactions that can be considered human-like and natural, +but the human pose and the geometry of objects can vary even for similar +interactions. Such diversity makes the annotating task of 3D interactions +difficult and hard to scale, which limits the potential to reason about that in +a supervised way. One way of learning the 3D spatial relationship between +humans and objects during interaction is by showing multiple 2D images captured +from different viewpoints when humans interact with the same type of objects. +The core idea of our method is to leverage a generative model that produces +high-quality 2D images from an arbitrary text prompt input as an "unbounded" +data generator with effective controllability and view diversity. Despite its +imperfection of the image quality over real images, we demonstrate that the +synthesized images are sufficient to learn the 3D human-object spatial +relations. We present multiple strategies to leverage the synthesized images, +including (1) the first method to leverage a generative image model for 3D +human-object spatial relation learning; (2) a framework to reason about the 3D +spatial relations from inconsistent 2D cues in a self-supervised manner via 3D +occupancy reasoning with pose canonicalization; (3) semantic clustering to +disambiguate different types of interactions with the same object types; and +(4) a novel metric to assess the quality of 3D spatial learning of interaction. +Project Page: https://jellyheadandrew.github.io/projects/chorus + +
+
+ comment: Accepted to ICCV 2023 (Oral Presentation). Project Page: + https://jellyheadandrew.github.io/projects/chorus +
+
+
+
+
+ + ☆ A Generative Approach for Image Registration of Visible-Thermal (VT) + Cancer Faces MICCAI 2023 + + +
+ Since thermal imagery offers a unique modality to investigate pain, the U.S. +National Institutes of Health (NIH) has collected a large and diverse set of +cancer patient facial thermograms for AI-based pain research. However, +differing camera capture angles between thermal and visible sensors have +led to misalignment between Visible-Thermal (VT) images. We modernize the +classic computer vision task of image registration by applying and modifying a +generative alignment algorithm to register VT cancer faces, without the need +for a reference or alignment parameters. By registering VT faces, we +demonstrate that the quality of thermal images produced in the generative AI +downstream task of Visible-to-Thermal (V2T) image translation improves by up to +52.5% compared to translation without registration. Images in this paper have +been approved by the NIH NCI for public dissemination. + 
+
+ comment: 2nd Annual Artificial Intelligence over Infrared Images for Medical + Applications Workshop (AIIIMA) at the 26th International Conference on + Medical Image Computing and Computer Assisted Intervention (MICCAI 2023) +
+
+
+
+
+ + ☆ MolGrapher: Graph-based Visual Recognition of Chemical Structures + + +
+ The automatic analysis of chemical literature has immense potential to +accelerate the discovery of new materials and drugs. Much of the critical +information in patent documents and scientific articles is contained in +figures, depicting the molecule structures. However, automatically parsing the +exact chemical structure is a formidable challenge, due to the amount of +detailed information, the diversity of drawing styles, and the need for +training data. In this work, we introduce MolGrapher to recognize chemical +structures visually. First, a deep keypoint detector detects the atoms. Second, +we treat all candidate atoms and bonds as nodes and put them in a graph. This +construct allows a natural graph representation of the molecule. Last, we +classify atom and bond nodes in the graph with a Graph Neural Network. To +address the lack of real training data, we propose a synthetic data generation +pipeline producing diverse and realistic results. In addition, we introduce a +large-scale benchmark of annotated real molecule images, USPTO-30K, to spur +research on this critical topic. Extensive experiments on five datasets show +that our approach significantly outperforms classical and learning-based +methods in most settings. Code, models, and datasets are available. + +
+
+
+
+
+ + ☆ SPPNet: A Single-Point Prompt Network for Nuclei Image Segmentation + + +
+ Image segmentation plays an essential role in nuclei image analysis. +Recently, the segment anything model has made a significant breakthrough in +such tasks. However, the current model has two major issues for cell +segmentation: (1) the image encoder of the segment anything model involves a +large number of parameters. Retraining or even fine-tuning the model still +requires expensive computational resources. (2) in point prompt mode, points +are sampled from the center of the ground truth and more than one set of points +is expected to achieve reliable performance, which is not efficient for +practical applications. In this paper, a single-point prompt network is +proposed for nuclei image segmentation, called SPPNet. We replace the original +image encoder with a lightweight vision transformer. Also, an effective +convolutional block is added in parallel to extract the low-level semantic +information from the image and compensate for the performance degradation due +to the small image encoder. We propose a new point-sampling method based on the +Gaussian kernel. The proposed model is evaluated on the MoNuSeg-2018 dataset. +The results demonstrate that SPPNet outperforms existing U-shape architectures +and shows faster convergence in training. Compared to the segment anything +model, SPPNet shows roughly 20 times faster inference, with 1/70 parameters and +computational cost. Particularly, only one set of points is required in both +the training and inference phases, which is more reasonable for clinical +applications. The code for our work and more technical details can be found at +https://github.com/xq141839/SPPNet. + 
+
+
+
+
+ + ☆ CIParsing: Unifying Causality Properties into Multiple Human Parsing + + +
+ Existing methods of multiple human parsing (MHP) apply statistical models to +acquire underlying associations between images and labeled body parts. However, +acquired associations often contain many spurious correlations that degrade +model generalization, leading statistical models to be vulnerable to visually +contextual variations in images (e.g., unseen image styles/external +interventions). To tackle this, we present a causality inspired parsing +paradigm termed CIParsing, which follows fundamental causal principles +involving two causal properties for human parsing (i.e., the causal diversity +and the causal invariance). Specifically, we assume that an input image is +constructed by a mix of causal factors (the characteristics of body parts) and +non-causal factors (external contexts), where only the former ones cause the +generation process of human parsing. Since causal/non-causal factors are +unobservable, a human parser in proposed CIParsing is required to construct +latent representations of causal factors and learns to enforce representations +to satisfy the causal properties. In this way, the human parser is able to rely +on causal factors w.r.t relevant evidence rather than non-causal factors w.r.t +spurious correlations, thus alleviating model degradation and yielding improved +parsing ability. Notably, the CIParsing is designed in a plug-and-play fashion +and can be integrated into any existing MHP models. Extensive experiments +conducted on two widely used benchmarks demonstrate the effectiveness and +generalizability of our method. + 
+
+
+
+
+ + ☆ SG-Former: Self-guided Transformer with Evolving Token Reallocation ICCV 2023 + + +
+ Vision Transformer has demonstrated impressive success across various vision +tasks. However, its heavy computation cost, which grows quadratically with +respect to the token sequence length, largely limits its power in handling +large feature maps. To alleviate the computation cost, previous works rely +either on fine-grained self-attention restricted to small local regions, or on +global self-attention with a shortened sequence length, resulting in coarse +granularity. In this paper, we propose a novel model, termed Self-guided +Transformer (SG-Former), towards effective global self-attention with adaptive +fine granularity. At the heart of our approach is to utilize a significance +map, which is estimated through hybrid-scale self-attention and evolves itself +during training, to reallocate tokens based on the significance of each region. +Intuitively, we assign more tokens to the salient regions for achieving +fine-grained attention, while allocating fewer tokens to the minor regions in +exchange for efficiency and global receptive fields. The proposed SG-Former +achieves performance superior to the state of the art: our base-size model +achieves 84.7% Top-1 accuracy on ImageNet-1K, 51.2 box mAP on COCO, and 52.7 +mIoU on ADE20K, surpassing the Swin Transformer by +1.3% / +2.7 mAP / +3 mIoU, +with lower computation costs and fewer parameters. The code is available at +https://github.com/OliverRensu/SG-Former + +
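+ A hedged sketch of the token-reallocation idea (assumptions only, not the SG-Former code; the window size, keep_ratio, and pooling scheme are hypothetical): guided by a significance map, fine-grained tokens are kept for salient windows while tokens in minor windows are pooled into single coarse tokens, shortening the sequence where detail matters least.
+
+ import torch
+
+ def reallocate_tokens(tokens, significance, window=4, keep_ratio=0.5):
+     """tokens: (H, W, C) feature map; significance: (H, W) per-location scores."""
+     H, W, C = tokens.shape
+     wins = tokens.reshape(H // window, window, W // window, window, C).permute(0, 2, 1, 3, 4)
+     win_sig = significance.reshape(H // window, window, W // window, window).mean(dim=(1, 3))
+     flat_wins = wins.reshape(-1, window * window, C)          # (num_windows, window*window, C)
+     order = win_sig.flatten().argsort(descending=True)        # most salient windows first
+     n_keep = int(keep_ratio * flat_wins.shape[0])
+     fine = flat_wins[order[:n_keep]].reshape(-1, C)           # keep every token in salient windows
+     coarse = flat_wins[order[n_keep:]].mean(dim=1)            # one pooled token per minor window
+     return torch.cat([fine, coarse], dim=0)                   # shorter sequence, detail kept where salient
+
+ tokens, sig = torch.randn(16, 16, 32), torch.rand(16, 16)
+ print(reallocate_tokens(tokens, sig).shape)                   # fewer than the original 256 tokens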
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ CLIPN for Zero-Shot OOD Detection: Teaching CLIP to Say No ICCV 2023 + + +
+ Out-of-distribution (OOD) detection refers to training the model on an +in-distribution (ID) dataset to classify whether the input images come from +unknown classes. Considerable effort has been invested in designing various OOD +detection methods based on either convolutional neural networks or +transformers. However, zero-shot OOD detection methods driven by CLIP, which +only require class names for ID, have received less attention. This paper +presents a novel method, namely CLIP saying "no" (CLIPN), which empowers the +logic of saying "no" within CLIP. Our key motivation is to equip CLIP with the +capability of distinguishing OOD and ID samples using positive-semantic prompts +and negation-semantic prompts. Specifically, we design a novel learnable "no" +prompt and a "no" text encoder to capture negation semantics within images. +Subsequently, we introduce two loss functions: the image-text binary-opposite +loss and the text semantic-opposite loss, which we use to teach CLIPN to +associate images with "no" prompts, thereby enabling it to identify unknown +samples. Furthermore, we propose two threshold-free inference algorithms to +perform OOD detection by utilizing negation semantics from "no" prompts and the +text encoder. Experimental results on 9 benchmark datasets (3 ID datasets and 6 +OOD datasets) for the OOD detection task demonstrate that CLIPN, based on +ViT-B-16, outperforms 7 widely used algorithms by at least 2.34% and 11.64% in +terms of AUROC and FPR95 for zero-shot OOD detection on ImageNet-1K. Our CLIPN +can serve as a solid foundation for effectively leveraging CLIP in downstream +OOD tasks. The code is available at https://github.com/xmed-lab/CLIPN. + +
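+ A speculative sketch of how "no" prompts can be turned into an OOD score (a guess at the mechanism under assumptions, not the paper's inference algorithm; the temperature, the sigmoid on "no" similarities, and the combination rule are hypothetical): an image that matches a "no <class>" prompt contributes little to the in-distribution probability.
+
+ import torch
+ import torch.nn.functional as F
+
+ def ood_score(img_feat, yes_text_feats, no_text_feats, tau=0.07):
+     """img_feat: (D,); yes/no_text_feats: (K, D) per-class prompt embeddings."""
+     img = F.normalize(img_feat, dim=-1)
+     p_yes = F.softmax(img @ F.normalize(yes_text_feats, dim=-1).T / tau, dim=-1)
+     p_no = torch.sigmoid(img @ F.normalize(no_text_feats, dim=-1).T / tau)
+     p_id = (p_yes * (1.0 - p_no)).sum()     # prob. of being some ID class and not negated
+     return 1.0 - p_id                       # higher means more likely out-of-distribution
+
+ img_feat = torch.randn(512)
+ yes_feats, no_feats = torch.randn(10, 512), torch.randn(10, 512)
+ print(float(ood_score(img_feat, yes_feats, no_feats)))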
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Towards Real-Time Analysis of Broadcast Badminton Videos + + +
+ Analysis of player movements is a crucial subset of sports analysis. Existing +player movement analysis methods use recorded videos after the match is over. +In this work, we propose an end-to-end framework for player movement analysis +for badminton matches on live broadcast match videos. Unlike other approaches +that rely on multi-modal sensor data, we use only the visual cues available in +the broadcast footage. We propose a method to calculate the on-court distance +covered by both players from the video feed of a live broadcast badminton +match. To perform this analysis, we focus on the gameplay by removing replays +and other redundant parts of the broadcast match. We then perform player +tracking to identify and track the movements of both players in each frame. +Finally, we calculate the distance covered by each player and the average speed +with which they move on the court. We further show a heatmap of the areas +covered by the player on the court, which is useful for analyzing the player's +gameplay. Our proposed framework was successfully used to analyze live +broadcast matches in real time during the Premier Badminton League 2019 (PBL +2019), with commentators and broadcasters appreciating its utility. + +
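+ The distance and speed computation reads as straightforward geometry; the sketch below shows one plausible version under assumptions (the homography, frame rate, and tracked positions are placeholders, not the paper's pipeline): per-frame player positions are mapped from image space to court coordinates with a homography, then differenced and summed.
+
+ import numpy as np
+
+ def court_distance(image_points, homography, fps=25.0):
+     """image_points: (T, 2) pixel positions of one player; homography: (3, 3) image-to-court map."""
+     pts = np.hstack([image_points, np.ones((len(image_points), 1))])   # homogeneous coordinates
+     court = (homography @ pts.T).T
+     court = court[:, :2] / court[:, 2:3]                               # positions in court units (metres)
+     steps = np.linalg.norm(np.diff(court, axis=0), axis=1)
+     distance = steps.sum()
+     avg_speed = distance / (len(image_points) / fps)                   # metres per second
+     return distance, avg_speed
+
+ H = np.eye(3)                                                          # placeholder homography
+ track = np.cumsum(np.random.rand(100, 2) * 0.1, axis=0)                # fake tracked positions
+ print(court_distance(track, H))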
+
+
+
+
+ + ☆ Sign Language Translation with Iterative Prototype ICCV 2023 + + +
+ This paper presents IP-SLT, a simple yet effective framework for sign +language translation (SLT). Our IP-SLT adopts a recurrent structure and +enhances the semantic representation (prototype) of the input sign language +video via an iterative refinement manner. Our idea mimics the behavior of human +reading, where a sentence can be digested repeatedly, till reaching accurate +understanding. Technically, IP-SLT consists of feature extraction, prototype +initialization, and iterative prototype refinement. The initialization module +generates the initial prototype based on the visual feature extracted by the +feature extraction module. Then, the iterative refinement module leverages the +cross-attention mechanism to polish the previous prototype by aggregating it +with the original video feature. Through repeated refinement, the prototype +finally converges to a more stable and accurate state, leading to a fluent and +appropriate translation. In addition, to leverage the sequential dependence of +prototypes, we further propose an iterative distillation loss to compress the +knowledge of the final iteration into previous ones. As the autoregressive +decoding process is executed only once in inference, our IP-SLT is ready to +improve various SLT systems with acceptable overhead. Extensive experiments are +conducted on public benchmarks to demonstrate the effectiveness of the IP-SLT. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Tumor-Centered Patching for Enhanced Medical Image Segmentation + + +
+ The realm of medical image diagnosis has advanced significantly with the +integration of computer-aided diagnosis and surgical systems. However, +challenges persist, particularly in achieving precise image segmentation. While +deep learning techniques show potential, obstacles like limited resources, slow +convergence, and class imbalance impede their effectiveness. Traditional +patch-based methods, though common, struggle to capture intricate tumor +boundaries and often lead to redundant samples, compromising computational +efficiency and feature quality. To tackle these issues, this research +introduces an innovative approach centered on the tumor itself for patch-based +image analysis. This novel tumor-centered patching method aims to address the +class imbalance and boundary deficiencies, enabling focused and accurate tumor +segmentation. By aligning patches with the tumor's anatomical context, this +technique enhances feature extraction accuracy and reduces computational load. +Experimental results demonstrate improved class imbalance, with segmentation +scores of 0.78, 0.76, and 0.71 for whole, core, and enhancing tumors, +respectively using a lightweight simple U-Net. This approach shows potential +for enhancing medical image segmentation and improving computer-aided diagnosis +systems. + +
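+ A minimal sketch of tumor-centered patching under assumptions (not the paper's code; patch size, patch count, and sampling rule are hypothetical): training patches are sampled so that their centres fall inside the tumour bounding box, keeping the tumour and its boundary in every patch.
+
+ import numpy as np
+
+ def tumor_centered_patches(volume, tumor_mask, patch=32, n_patches=8,
+                            rng=np.random.default_rng(0)):
+     """volume, tumor_mask: (D, H, W) arrays. Returns a list of cubic patches."""
+     zs, ys, xs = np.nonzero(tumor_mask)
+     lo = np.array([zs.min(), ys.min(), xs.min()])                 # tumour bounding box
+     hi = np.array([zs.max(), ys.max(), xs.max()])
+     patches = []
+     for _ in range(n_patches):
+         center = rng.integers(lo, hi + 1)                         # centre stays inside the bbox
+         start = np.clip(center - patch // 2, 0, np.array(volume.shape) - patch)
+         z, y, x = start
+         patches.append(volume[z:z + patch, y:y + patch, x:x + patch])
+     return patches
+
+ vol = np.random.rand(64, 128, 128)
+ mask = np.zeros(vol.shape, dtype=bool)
+ mask[20:40, 50:80, 60:90] = True
+ print(len(tumor_centered_patches(vol, mask)), tumor_centered_patches(vol, mask)[0].shape)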
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ☆ NPF-200: A Multi-Modal Eye Fixation Dataset and Method for + Non-Photorealistic Videos ACM MM 2023 + + +
+ Non-photorealistic videos are in growing demand with the wave of the +metaverse, yet they still lack sufficient research studies. This work aims to +take a step forward to understand how humans perceive non-photorealistic videos +with eye fixation (i.e., saliency detection), which is critical for enhancing +media production, artistic design, and game user experience. To fill the gap +left by the lack of a suitable dataset for this line of research, we present +NPF-200, the first large-scale multi-modal dataset of purely non-photorealistic +videos with eye fixations. Our dataset has three characteristics: 1) it +contains soundtracks that are essential according to vision and psychological +studies; 2) it includes diverse semantic content and high-quality videos; 3) it +has rich motions across and within videos. We conduct a series of analyses to +gain deeper insights into this task and compare several state-of-the-art +methods to explore the gap between natural images and non-photorealistic data. +Additionally, as the human attention system tends to extract visual and audio +features with different frequencies, we propose a universal frequency-aware +multi-modal non-photorealistic saliency detection model called NPSNet, which +demonstrates state-of-the-art performance on this task. The results uncover +strengths and weaknesses of multi-modal network design and multi-domain +training, opening up promising directions for future work. Our dataset and code +can be found at https://github.com/Yangziyu/NPF200. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Multimodal Latent Emotion Recognition from Micro-expression and + Physiological Signals + + +
+ This paper discusses the benefits of incorporating multimodal data for +improving latent emotion recognition accuracy, focusing on micro-expression +(ME) and physiological signals (PS). The proposed approach presents a novel +multimodal learning framework that combines ME and PS, including a 1D separable +and mixable depthwise inception network, a standardised normal distribution +weighted feature fusion method, and depth/physiology guided attention modules +for multimodal learning. Experimental results show that the proposed approach +outperforms the benchmark method, with the weighted fusion method and guided +attention modules both contributing to enhanced performance. + +
+
+
+
+
+ + ☆ A Probabilistic Fluctuation based Membership Inference Attack for + Generative Models + + +
+ Membership Inference Attack (MIA) identifies whether a record exists in a +machine learning model's training set by querying the model. MIAs on classic +classification models have been well studied, and recent works have started to +explore how to transplant MIA onto generative models. Our investigation +indicates that existing MIAs designed for generative models mainly depend on +the overfitting in target models. However, overfitting can be avoided by +employing various regularization techniques, so existing MIAs that rely on it +perform poorly in practice. Unlike overfitting, memorization is essential for +deep learning models to attain optimal performance, making it a more prevalent +phenomenon. Memorization in generative models leads to an increasing trend in +the probability distribution of generating records around the member record. +Therefore, we propose a Probabilistic Fluctuation Assessing Membership +Inference Attack (PFAMI), a black-box MIA that infers memberships by detecting +these trends via analysis of the overall probabilistic fluctuations around +given records. We conduct extensive experiments across multiple generative +models and datasets, which demonstrate that PFAMI can improve the attack +success rate (ASR) by about 27.9% when compared with the best baseline. + +
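+ The fluctuation idea can be illustrated with a short sketch (hedged: a toy stand-in, not PFAMI; the perturbation scale, neighbour count, and threshold are hypothetical, and the target model is replaced by a black-box log-probability function): a record whose likelihood clearly peaks over its perturbed neighbourhood is treated as a likely training member.
+
+ import numpy as np
+
+ def fluctuation_score(log_prob_fn, record, n_neighbors=20, noise=0.05,
+                       rng=np.random.default_rng(0)):
+     """log_prob_fn: black-box callable returning log p(x) under the target generative model."""
+     center = log_prob_fn(record)
+     neighbors = [log_prob_fn(record + rng.normal(0, noise, size=record.shape))
+                  for _ in range(n_neighbors)]
+     return center - float(np.mean(neighbors))     # larger gap -> stronger membership signal
+
+ def is_member(score, threshold=0.5):
+     return score > threshold
+
+ log_prob = lambda x: -0.5 * float(np.sum(x ** 2))   # toy stand-in for a model's log-density
+ print(fluctuation_score(log_prob, np.zeros(4)), fluctuation_score(log_prob, 3 * np.ones(4)))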
+
+
+
+
+ + ☆ Mesh Conflation of Oblique Photogrammetric Models using Virtual Cameras + and Truncated Signed Distance Field + + +
+ Conflating/stitching 2.5D raster digital surface models (DSM) into a large +one has been a long-standing practice in geoscience applications; however, +conflating full-3D mesh models, such as those from oblique photogrammetry, is +extremely challenging. In this letter, we propose a novel approach to address +this challenge by conflating multiple full-3D oblique photogrammetric models +into a single, seamless mesh for high-resolution site modeling. Given two or +more individually collected and created photogrammetric meshes, we first +propose to create a virtual camera field (with a panoramic field of view) to +incubate virtual spaces represented by a Truncated Signed Distance Field +(TSDF), an implicit volumetric field friendly to linear 3D fusion; then we +adaptively leverage the truncated bound of meshes in TSDF to conflate them into +a single and accurate full 3D site model. With drone-based 3D meshes, we show +that our approach significantly improves upon traditional methods for model +conflation, opening up new potential to create very large and accurate full 3D +mesh models in support of geoscience and environmental applications. + +
+
+ comment: 5 Figures +
+
+
+
+
+ + ☆ Select-and-Combine (SAC): A Novel Multi-Stereo Depth Fusion Algorithm + for Point Cloud Generation via Efficient Local Markov Netlets + + +
+ Many practical systems for image-based surface reconstruction employ a +stereo/multi-stereo paradigm, due to its ability to scale for large scenes and +its ease of implementation for out-of-core operations. In this process, +multiple and abundant depth maps from stereo matching must be combined and +fused into a single, consistent, and clean point cloud. However, the noise and +outliers caused by stereo matching and the heterogeneous geometric errors of +the poses present a challenge for existing fusion algorithms, since they mostly +assume Gaussian errors and predict fused results based on data from local +spatial neighborhoods, which may inherit uncertainties from multiple depths, +resulting in lower accuracy. In this paper, we propose a novel depth fusion +paradigm that, instead of numerically fusing points from multiple depth maps, +selects the best depth map per point and combines them into a single, clean +point cloud. This paradigm, called select-and-combine (SAC), is achieved +through modeling point-level fusion using local Markov Netlets, micro-networks +over points across neighboring views for depth/view selection, followed by a +Netlet collapse process for point combination. The Markov Netlets are optimized +such that they can inherently leverage spatial consistencies among depth maps +of neighboring views, thus they can address errors beyond Gaussian ones. Our +experimental results show that our approach outperforms existing depth fusion +approaches by increasing the F1 score, which considers both accuracy and +completeness, by 2.07% compared to the best existing method. Finally, our +approach generates clearer point clouds that are 18% less redundant while +achieving higher accuracy than before fusion. + +
+
+ comment: 6 Figures +
+
+
+
+
+ + ☆ Lite-HRNet Plus: Fast and Accurate Facial Landmark Detection ICIP2023 + + +
+ Facial landmark detection is an essential technology for driver status +tracking and is in demand for real-time estimation. For landmark coordinate +prediction, heatmap-based methods are known to achieve high accuracy, and +Lite-HRNet can achieve fast estimation. However, Lite-HRNet still suffers from +the heavy computational cost of its fusion block, which connects feature maps +with different resolutions. In addition, the strong output module used in +HRNetV2 is not applied to Lite-HRNet. Given these problems, we propose a novel +architecture called Lite-HRNet Plus. Lite-HRNet Plus achieves two improvements: +a novel fusion block based on channel attention and a novel output module with +lower computational intensity using multi-resolution feature maps. Through +experiments conducted on two facial landmark datasets, we confirmed that +Lite-HRNet Plus further improved the accuracy in comparison with conventional +methods, and achieved state-of-the-art accuracy with a computational complexity +in the range of 10M FLOPs. + +
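+ An illustrative sketch of a channel-attention fusion of multi-resolution feature maps (assumptions only, not the Lite-HRNet Plus block; the squeeze-and-excitation style attention, reduction factor, and bilinear resizing are hypothetical choices): feature maps are resized to a common resolution, summed, and re-weighted per channel, which avoids heavier convolutional fusion.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class ChannelAttentionFusion(nn.Module):
+     def __init__(self, channels, reduction=4):
+         super().__init__()
+         self.fc = nn.Sequential(nn.Linear(channels, channels // reduction), nn.ReLU(),
+                                 nn.Linear(channels // reduction, channels), nn.Sigmoid())
+
+     def forward(self, feature_maps):
+         target = feature_maps[0].shape[-2:]                 # fuse at the highest resolution
+         resized = [F.interpolate(f, size=target, mode="bilinear", align_corners=False)
+                    for f in feature_maps]
+         fused = torch.stack(resized).sum(dim=0)             # (B, C, H, W)
+         attn = self.fc(fused.mean(dim=(-2, -1)))            # global pooling -> channel weights
+         return fused * attn[:, :, None, None]
+
+ fusion = ChannelAttentionFusion(32)
+ maps = [torch.randn(2, 32, 64, 64), torch.randn(2, 32, 32, 32), torch.randn(2, 32, 16, 16)]
+ print(fusion(maps).shape)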
+
+ comment: Accepted at ICIP2023 +
+
+
+
+
+ + ☆ Masking Strategies for Background Bias Removal in Computer Vision Models ICCV + + +
+ Models for fine-grained image classification tasks, where the difference +between some classes can be extremely subtle and the number of samples per +class tends to be low, are particularly prone to picking up background-related +biases and demand robust methods to handle potential examples with +out-of-distribution (OOD) backgrounds. To gain deeper insights into this +critical problem, our research investigates the impact of background-induced +bias on fine-grained image classification, evaluating standard backbone models +such as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We +explore two masking strategies to mitigate background-induced bias: Early +masking, which removes background information at the (input) image level, and +late masking, which selectively masks high-level spatial features corresponding +to the background. Extensive experiments assess the behavior of CNN and ViT +models under different masking strategies, with a focus on their generalization +to OOD backgrounds. The obtained findings demonstrate that both proposed +strategies enhance OOD performance compared to the baseline models, with early +masking consistently exhibiting the best OOD performance. Notably, a ViT +variant employing GAP-Pooled Patch token-based classification combined with +early masking achieves the highest OOD robustness. + +
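+ The two strategies can be summarised in a few lines of hedged Python (a sketch with assumed tensor shapes, not the paper's implementation): early masking zeroes background pixels before the backbone, while late masking zeroes background locations in a high-level spatial feature map.
+
+ import torch
+ import torch.nn.functional as F
+
+ def early_masking(image, fg_mask):
+     """image: (B, 3, H, W); fg_mask: (B, 1, H, W) with 1 on the object, 0 on background."""
+     return image * fg_mask
+
+ def late_masking(feature_map, fg_mask):
+     """feature_map: (B, C, h, w) backbone output; the mask is resized to match it."""
+     small = F.interpolate(fg_mask, size=feature_map.shape[-2:], mode="nearest")
+     return feature_map * small
+
+ img = torch.randn(2, 3, 224, 224)
+ mask = (torch.rand(2, 1, 224, 224) > 0.5).float()
+ feats = torch.randn(2, 256, 14, 14)
+ print(early_masking(img, mask).shape, late_masking(feats, mask).shape)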
+
+ comment: Accepted at the 2023 IEEE/CVF International Conference on Computer + Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer + Vision (OOD-CV) +
+
+
+
+
+ + ☆ The TYC Dataset for Understanding Instance-Level Semantics and Motions + of Cells in Microstructures ICCV 2023 + + +
+ Segmenting cells and tracking their motion over time is a common task in +biomedical applications. However, predicting accurate instance-wise +segmentation and cell motions from microscopy imagery remains a challenging +task. Using microstructured environments for analyzing single cells in a +constant flow of media adds additional complexity. While large-scale labeled +microscopy datasets are available, we are not aware of any large-scale dataset, +including both cells and microstructures. In this paper, we introduce the +trapped yeast cell (TYC) dataset, a novel dataset for understanding +instance-level semantics and motions of cells in microstructures. We release +$105$ dense annotated high-resolution brightfield microscopy images, including +about $19$k instance masks. We also release $261$ curated video clips composed +of $1293$ high-resolution microscopy images to facilitate unsupervised +understanding of cell motions and morphology. TYC offers ten times more +instance annotations than the previously largest dataset, including cells and +microstructures. Our effort also exceeds previous attempts in terms of +microstructure variability, resolution, complexity, and capturing device +(microscopy) variability. We facilitate a unified comparison on our novel +dataset by introducing a standardized evaluation strategy. TYC and evaluation +code are publicly available under CC BY 4.0 license. + +
+
+ comment: Accepted at ICCV 2023 Workshop on BioImage Computing. Project page + (with links to the dataset and code): + https://christophreich1996.github.io/tyc_dataset/ +
+
+
+
+
+ + ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Group sparsity in Machine Learning (ML) encourages simpler, more +interpretable models with fewer active parameter groups. This work aims to +incorporate structured group sparsity into the shared parameters of a +Multi-Task Learning (MTL) framework, to develop parsimonious models that can +effectively address multiple tasks with fewer parameters while maintaining +comparable or superior performance to a dense model. Sparsifying the model +during training helps decrease the model's memory footprint, computation +requirements, and prediction time during inference. We use channel-wise l1/l2 +group sparsity in the shared layers of the Convolutional Neural Network (CNN). +This approach not only facilitates the elimination of extraneous groups +(channels) but also imposes a penalty on the weights, thereby enhancing the +learning of all tasks. We compare the outcomes of single-task and multi-task +experiments under group sparsity on two publicly available MTL datasets, NYU-v2 +and CelebAMask-HQ. We also investigate how changing the sparsification degree +impacts both the performance of the model and the sparsity of groups. + +
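+ The channel-wise l1/l2 (group-lasso) penalty described above can be sketched as follows (a minimal illustration assuming standard PyTorch conv layers, not the paper's training code; the penalty weight is hypothetical): the l2 norm is taken per output channel, and the l1 sum over channels pushes whole channels towards zero.
+
+ import torch
+ import torch.nn as nn
+
+ def group_sparsity_penalty(module, weight=1e-4):
+     penalty = 0.0
+     for m in module.modules():
+         if isinstance(m, nn.Conv2d):
+             per_channel = m.weight.flatten(start_dim=1).norm(p=2, dim=1)  # l2 norm per output channel
+             penalty = penalty + per_channel.sum()                          # l1 sum across channels
+     return weight * penalty
+
+ shared = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Conv2d(16, 32, 3))  # shared MTL layers
+ x, target = torch.randn(4, 3, 32, 32), torch.randn(4, 32, 28, 28)
+ loss = nn.functional.mse_loss(shared(x), target) + group_sparsity_penalty(shared)
+ loss.backward()
+ print(float(loss))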
+
+ comment: Under review +
+
+
+
+
+ + ☆ Advancements in Point Cloud Data Augmentation for Deep Learning: A + Survey + + +
+ Point clouds have a wide range of applications in areas such as autonomous +driving, mapping, navigation, scene reconstruction, and medical imaging. Due to +their great potential in these applications, point cloud processing has gained +great attention in the field of computer vision. Among various point cloud +processing techniques, deep learning (DL) has become one of the mainstream and +effective methods for tasks such as detection, segmentation, and +classification. To reduce overfitting when training DL models and to improve +model performance, especially when the amount and/or diversity of training data +is limited, data augmentation is often crucial. Although various point cloud +data augmentation methods have been widely used in different point cloud +processing tasks, there are currently no published systematic surveys or +reviews of these methods. Therefore, this article surveys and discusses these +methods and categorizes them into a taxonomy framework. Through the +comprehensive evaluation and comparison of the augmentation methods, this +article identifies their potential and limitations and suggests possible future +research directions. This work helps researchers gain a holistic understanding +of the current status of point cloud data augmentation and promotes its wider +application and development. + +
+
+
+
+
+ + ☆ Generalized Continual Category Discovery + + +
+ Most Continual Learning (CL) methods push the limits of supervised learning +settings, where an agent is expected to learn new labeled tasks and not forget +previous knowledge. However, these settings are not well aligned with real-life +scenarios, where a learning agent has access to a vast amount of unlabeled data +encompassing both novel (entirely unlabeled) classes and examples from known +classes. Drawing inspiration from Generalized Category Discovery (GCD), we +introduce a novel framework that relaxes this assumption. Precisely, in any +task, we allow for the existence of novel and known classes, and a continual +version of unsupervised learning methods must be used to discover them. We call +this setting Generalized Continual Category Discovery (GCCD). It unifies CL and +GCD, bridging the gap between synthetic benchmarks and real-life scenarios. +With a series of experiments, we show that existing methods fail to accumulate +knowledge from subsequent tasks in which unlabeled samples of novel classes are +present. In light of these limitations, we propose a method that incorporates +both supervised and unsupervised signals and mitigates forgetting through the +use of centroid adaptation. Our method surpasses strong CL methods adopted for +GCD techniques and achieves superior representation learning performance. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Cross-Modality Proposal-guided Feature Mining for Unregistered + RGB-Thermal Pedestrian Detection + + +
+ RGB-Thermal (RGB-T) pedestrian detection aims to locate the pedestrians in +RGB-T image pairs to exploit the complementation between the two modalities for +improving detection robustness in extreme conditions. Most existing algorithms +assume that the RGB-T image pairs are well registered, while in the real world +they are not ideally aligned due to parallax or the different fields-of-view of +the cameras. The pedestrians in misaligned image pairs may be located at +different positions in the two images, which results in two challenges: 1) how +to achieve inter-modality complementation using spatially misaligned RGB-T +pedestrian patches, and 2) how to recognize the unpaired pedestrians at the +boundary. To deal with these issues, we propose a new paradigm for unregistered +RGB-T pedestrian detection, which predicts two separate pedestrian locations in +the RGB and thermal images, respectively. Specifically, we propose a +cross-modality proposal-guided feature mining (CPFM) mechanism to extract the +two precise fusion features for representing the pedestrian in the two +modalities, even if the RGB-T image pair is unaligned. It enables us to +effectively exploit the complementation between the two modalities. With the +CPFM mechanism, we build a two-stream dense detector; it predicts the two +pedestrian locations in the two modalities based on the corresponding fusion +feature mined by the CPFM mechanism. Besides, we design a data augmentation +method, named Homography, to simulate the discrepancy in scales and views +between images. We also investigate two non-maximum suppression (NMS) methods +for post-processing. Favorable experimental results demonstrate the +effectiveness and robustness of our method in dealing with unregistered +pedestrians with different shifts. + +
+
+
+
+
+ + ☆ DISGAN: Wavelet-informed Discriminator Guides GAN to MRI + Super-resolution with Noise Cleaning + + +
+ MRI super-resolution (SR) and denoising tasks are fundamental challenges in +the field of deep learning, which have traditionally been treated as distinct +tasks with separate paired training data. In this paper, we propose an +innovative method that addresses both tasks simultaneously using a single deep +learning model, eliminating the need for explicitly paired noisy and clean +images during training. Our proposed model is primarily trained for SR, but +also exhibits remarkable noise-cleaning capabilities in the super-resolved +images. Instead of conventional approaches that introduce frequency-related +operations into the generative process, our novel approach involves the use of +a GAN model guided by a frequency-informed discriminator. To achieve this, we +harness the power of the 3D Discrete Wavelet Transform (DWT) operation as a +frequency constraint within the GAN framework for the SR task on magnetic +resonance imaging (MRI) data. Specifically, our contributions include: 1) a 3D +generator based on residual-in-residual connected blocks; 2) the integration of +the 3D DWT with $1\times 1$ convolution into a DWT+conv unit within a 3D Unet +for the discriminator; 3) the use of the trained model for high-quality image +SR, accompanied by an intrinsic denoising process. We dub the model "Denoising +Induced Super-resolution GAN (DISGAN)" due to its dual effects of SR image +generation and simultaneous denoising. Departing from the traditional approach +of training SR and denoising tasks as separate models, our proposed DISGAN is +trained only on the SR task, but also achieves exceptional performance in +denoising. The model is trained on 3D MRI data from dozens of subjects from the +Human Connectome Project (HCP) and further evaluated on previously unseen MRI +data from subjects with brain tumours and epilepsy to assess its denoising and +SR performance. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4 + + +
+ Multimodal large language models acquire their instruction-following +capabilities through a two-stage training process: pre-training on image-text +pairs and fine-tuning on supervised vision-language instruction data. Recent +studies have shown that large language models can achieve satisfactory results +even with a limited amount of high-quality instruction-following data. In this +paper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset +comprising only 200 examples, amounting to approximately 6% of the +instruction-following data used in the alignment dataset for MiniGPT-4. We +first propose several metrics to assess the quality of multimodal instruction +data. Based on these metrics, we present a simple and effective data selector +to automatically identify and filter low-quality vision-language data. By +employing this method, InstructionGPT-4 outperforms the original MiniGPT-4 on +various evaluations (e.g., visual question answering, GPT-4 preference). +Overall, our findings demonstrate that a small amount of high-quality +instruction-tuning data is sufficient to enable multimodal large language +models to generate better output. + +
+
+
+
+
+ + ☆ SILT: Shadow-aware Iterative Label Tuning for Learning to Detect Shadows + from Noisy Labels ICCV 2023 + + +
+ Existing shadow detection datasets often contain missing or mislabeled +shadows, which can hinder the performance of deep learning models trained +directly on such data. To address this issue, we propose SILT, the Shadow-aware +Iterative Label Tuning framework, which explicitly considers noise in shadow +labels and trains the deep model in a self-training manner. Specifically, we +incorporate strong data augmentations with shadow counterfeiting to help the +network better recognize non-shadow regions and alleviate overfitting. We also +devise a simple yet effective label tuning strategy with global-local fusion +and shadow-aware filtering to encourage the network to make significant +refinements on the noisy labels. We evaluate the performance of SILT by +relabeling the test set of the SBU dataset and conducting various experiments. +Our results show that even a simple U-Net trained with SILT can outperform all +state-of-the-art methods by a large margin. When trained on SBU / UCF / ISTD, +our network can successfully reduce the Balanced Error Rate by 25.2% / 36.9% / +21.3% over the best state-of-the-art method. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using + Harvest Piles and Remote Sensing + + +
+ Small farms contribute to a large share of the productive land in developing +countries. In regions such as sub-Saharan Africa, where 80% of farms are small +(under 2 ha in size), the task of mapping smallholder cropland is an important +part of tracking sustainability measures such as crop productivity. However, +the visually diverse and nuanced appearance of small farms has limited the +effectiveness of traditional approaches to cropland mapping. Here we introduce +a new approach based on the detection of harvest piles characteristic of many +smallholder systems throughout the world. We present HarvestNet, a dataset for +mapping the presence of farms in the Ethiopian regions of Tigray and Amhara +during 2020-2023, collected using expert knowledge and satellite images, +totaling 7k hand-labeled images and 2k ground-collected labels. We also +benchmark a set of baselines, including SOTA models in remote sensing, with our +best models achieving around 80% classification performance on hand-labelled +data, and 90% and 98% accuracy on ground-truth data for Tigray and Amhara, +respectively. We also perform a visual comparison with a widely used +pre-existing coverage map and show that our model detects an extra 56,621 +hectares of cropland in Tigray. We conclude that remote sensing of harvest +piles can contribute to more timely and accurate cropland assessments in +food-insecure regions. + +
+
+ comment: 18 pages, 22 figures +
+
+
+
+
+ + ☆ Manipulating Embeddings of Stable Diffusion Prompts + + +
+ Generative text-to-image models such as Stable Diffusion allow users to +generate images based on a textual description, the prompt. Changing the prompt +is still the primary means for the user to change a generated image as desired. +However, changing the image by reformulating the prompt remains a difficult +process of trial and error, which has led to the emergence of prompt +engineering as a new field of research. We propose and analyze methods to +change the embedding of a prompt directly instead of the prompt text. It allows +for more fine-grained and targeted control that takes into account user +intentions. Our approach treats the generative text-to-image model as a +continuous function and passes gradients between the image space and the prompt +embedding space. By addressing different user interaction problems, we can +apply this idea in three scenarios: (1) Optimization of a metric defined in +image space that could measure, for example, image style. (2) Assistance of +users in creative tasks by enabling them to navigate the image space along a +selection of directions of "near" prompt embeddings. (3) Changing the embedding +of the prompt to include information that the user has seen in a particular +seed but finds difficult to describe in the prompt. Our experiments demonstrate +the feasibility of the described methods. + +
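+ The core idea of optimizing the prompt embedding by back-propagating an image-space objective can be sketched in a self-contained way (hedged: a toy stand-in generator and a mean-brightness objective replace Stable Diffusion and a real style metric; the learning rate and shapes are arbitrary):
+
+ import torch
+
+ torch.manual_seed(0)
+ generator = torch.nn.Sequential(torch.nn.Linear(77 * 8, 256), torch.nn.Tanh(),
+                                 torch.nn.Linear(256, 3 * 16 * 16))     # stand-in for the generative model
+
+ prompt_emb = torch.randn(77, 8, requires_grad=True)                    # (tokens, dim) prompt embedding
+ optimizer = torch.optim.Adam([prompt_emb], lr=1e-2)
+
+ for step in range(50):
+     image = generator(prompt_emb.flatten()).reshape(3, 16, 16)          # "generate" an image
+     objective = -image.mean()                                           # e.g. "make the image brighter"
+     optimizer.zero_grad()
+     objective.backward()                                                # gradients flow into the embedding
+     optimizer.step()
+
+ print(float(generator(prompt_emb.detach().flatten()).mean()))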
+
+
+
+
+ + ☆ DR-Tune: Improving Fine-tuning of Pretrained Visual Models by + Distribution Regularization with Semantic Calibration ICCV'2023 + + +
+ Visual models pretrained on large-scale benchmarks encode general knowledge +and prove effective in building more powerful representations for downstream +tasks. Most existing approaches follow the fine-tuning paradigm, either by +initializing or regularizing the downstream model based on the pretrained one. +The former fails to retain the knowledge in the successive fine-tuning phase +and is thereby prone to over-fitting, while the latter imposes strong +constraints on the weights or feature maps of the downstream model without +considering semantic drift, often incurring insufficient optimization. To deal +with these issues, we propose a novel fine-tuning framework, namely +distribution regularization with semantic calibration (DR-Tune). It employs +distribution regularization by enforcing the downstream task head to decrease +its classification error on the pretrained feature distribution, which prevents +it from over-fitting while enabling sufficient training of downstream encoders. +Furthermore, to alleviate the interference by semantic drift, we develop the +semantic calibration (SC) module to align the global shape and class centers of +the pretrained and downstream feature distributions. Extensive experiments on +widely used image classification datasets show that DR-Tune consistently +improves the performance when combined with various backbones under different +pretraining strategies. Code is available at: +https://github.com/weeknan/DR-Tune. + +
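+ A rough sketch of the distribution-regularization term under simplifying assumptions (not the DR-Tune code; the feature bank, its size, and the loss weight are hypothetical, and semantic calibration is omitted): the task head is also asked to classify features drawn from a bank produced by the frozen pretrained encoder, in addition to the features of the downstream encoder being fine-tuned.
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 128))      # downstream encoder (toy)
+ head = nn.Linear(128, 10)                                                # shared task head
+
+ # feature bank collected once from the frozen pretrained encoder (random stand-in here)
+ bank_feats, bank_labels = torch.randn(500, 128), torch.randint(0, 10, (500,))
+
+ def dr_style_loss(images, labels, lam=1.0, n_bank=64):
+     task_loss = F.cross_entropy(head(encoder(images)), labels)           # usual downstream loss
+     idx = torch.randint(0, len(bank_feats), (n_bank,))
+     reg_loss = F.cross_entropy(head(bank_feats[idx]), bank_labels[idx])  # head must also fit pretrained features
+     return task_loss + lam * reg_loss
+
+ loss = dr_style_loss(torch.randn(8, 3, 32, 32), torch.randint(0, 10, (8,)))
+ loss.backward()
+ print(float(loss))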
+
+ comment: Accepted by ICCV'2023 +
+
+
+
+
+ + ☆ Towards Privacy-Supporting Fall Detection via Deep Unsupervised + RGB2Depth Adaptation + + +
+ Fall detection is a vital task in health monitoring, as it allows the system +to trigger an alert and therefore enabling faster interventions when a person +experiences a fall. Although most previous approaches rely on standard RGB +video data, such detailed appearance-aware monitoring poses significant privacy +concerns. Depth sensors, on the other hand, are better at preserving privacy as +they merely capture the distance of objects from the sensor or camera, omitting +color and texture information. In this paper, we introduce a privacy-supporting +solution that makes the RGB-trained model applicable in depth domain and +utilizes depth data at test time for fall detection. To achieve cross-modal +fall detection, we present an unsupervised RGB to Depth (RGB2Depth) cross-modal +domain adaptation approach that leverages labelled RGB data and unlabelled +depth data during training. Our proposed pipeline incorporates an intermediate +domain module for feature bridging, modality adversarial loss for modality +discrimination, classification loss for pseudo-labeled depth data and labeled +source data, triplet loss that considers both source and target domains, and a +novel adaptive loss weight adjustment method for improved coordination among +various losses. Our approach achieves state-of-the-art results in the +unsupervised RGB2Depth domain adaptation task for fall detection. Code is +available at https://github.com/1015206533/privacy_supporting_fall_detection. + +
+
+
+
+
+ + ☆ Head-Tail Cooperative Learning Network for Unbiased Scene Graph + Generation + + +
+ Scene Graph Generation (SGG), a critical task in image understanding, faces +the challenge of head-biased prediction caused by the long-tail distribution of +predicates. However, current unbiased SGG methods can easily prioritize +improving the prediction of tail predicates while ignoring the substantial +sacrifice in the prediction of head predicates, leading to a shift from head +bias to tail bias. To address this issue, we propose a model-agnostic Head-Tail +Collaborative Learning (HTCL) network that includes head-prefer and tail-prefer +feature representation branches that collaborate to achieve accurate +recognition of both head and tail predicates. We also propose a self-supervised +learning approach to enhance the prediction ability of the tail-prefer feature +representation branch by constraining tail-prefer predicate features. +Specifically, self-supervised learning converges head predicate features to +their class centers while dispersing tail predicate features as much as +possible through contrastive learning and a head center loss. We demonstrate +the effectiveness of our HTCL by applying it to various SGG models on the +VG150, Open Images V6 and GQA200 datasets. The results show that our method +achieves higher mean Recall with a minimal sacrifice in Recall and achieves a +new state-of-the-art overall performance. Our code is available at +https://github.com/wanglei0618/HTCL. + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ CgT-GAN: CLIP-guided Text GAN for Image Captioning ACM MM 2023 + + +
+ The large-scale visual-language pre-trained model, Contrastive Language-Image +Pre-training (CLIP), has significantly improved image captioning for scenarios +without human-annotated image-caption pairs. Recent advanced CLIP-based image +captioning without human annotations follows a text-only training paradigm, +i.e., reconstructing text from shared embedding space. Nevertheless, these +approaches are limited by the training/inference gap or huge storage +requirements for text embeddings. Given that it is trivial to obtain images in +the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates +images into the training process to enable the model to "see" real visual +modality. Particularly, we use adversarial training to teach CgT-GAN to mimic +the phrases of an external text corpus and CLIP-based reward to provide +semantic guidance. The caption generator is jointly rewarded based on the +caption naturalness to human language calculated from the GAN's discriminator +and the semantic guidance reward computed by the CLIP-based reward module. In +addition to the cosine similarity as the semantic guidance reward (i.e., +CLIP-cos), we further introduce a novel semantic guidance reward called +CLIP-agg, which aligns the generated caption with a weighted text embedding by +attentively aggregating the entire corpus. Experimental results on three +subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms +state-of-the-art methods significantly across all metrics. Code is available at +https://github.com/Lihr747/CgtGAN. + +
+
+ comment: Accepted at ACM MM 2023 +
+
+
+
+
+ + ☆ Large Multilingual Models Pivot Zero-Shot Multimodal Learning across + Languages + + +
+ Recently there has been a significant surge in multimodal learning in terms +of both image-to-text and text-to-image generation. However, the success is +typically limited to English, leaving other languages largely behind. Building +a competitive counterpart in other languages is highly challenging due to the +low-resource nature of non-English multimodal data (i.e., lack of large-scale, +high-quality image-text data). In this work, we propose MPM, an effective +training paradigm for training large multimodal models in low-resource +languages. MPM demonstrates that Multilingual language models can Pivot +zero-shot Multimodal learning across languages. Specifically, based on a strong +multilingual large language model, multimodal models pretrained on English-only +image-text data can well generalize to other languages in a zero-shot manner +for both image-to-text and text-to-image generation, even surpassing models +trained on image-text data in native languages. Taking Chinese as a practice of +MPM, we build large multimodal models VisCPM in image-to-text and text-to-image +generation, which achieve state-of-the-art (open-source) performance in +Chinese. To facilitate future research, we open-source codes and model weights +at https://github.com/OpenBMB/VisCPM.git. + +
+
+ comment: https://github.com/OpenBMB/VisCPM.git +
+
+
+
+
+ + ☆ RefEgo: Referring Expression Comprehension Dataset from First-Person + Perception of Ego4D ICCV2023 + + +
+ Grounding textual expressions on scene objects from first-person views is a +truly demanding capability in developing agents that are aware of their +surroundings and behave following intuitive text instructions. Such a +capability is necessary for glass devices or autonomous robots to localize +referred objects in the real world. In the conventional referring expression +comprehension tasks on images, however, datasets are mostly constructed from +web-crawled data and do not reflect the diverse real-world structures involved +in grounding textual expressions on diverse objects in the real world. +Recently, Ego4D, a massive-scale egocentric video dataset, was proposed. Ego4D +covers diverse real-world scenes around the world, including numerous indoor +and outdoor situations such as shopping, cooking, walking, talking, +manufacturing, etc. Based on egocentric videos of Ego4D, we constructed a +broad-coverage video-based referring expression comprehension dataset: RefEgo. +Our dataset includes more than 12k video clips and 41 hours of video annotated +for video-based referring expression comprehension. In experiments, we combine +the state-of-the-art 2D referring expression comprehension models with an +object tracking algorithm, achieving video-wise referred object tracking even +in difficult conditions: the referred object becomes out-of-frame in the middle +of the video or multiple similar objects are presented in the video. + +
+
+ comment: 15 pages, 11 figures. ICCV2023 +
+
+
+
+
+ + ☆ Distribution-Aware Calibration for Object Detection with Noisy Bounding + Boxes + + +
+ Large-scale well-annotated datasets are of great importance for training an +effective object detector. However, obtaining accurate bounding box annotations +is laborious and demanding. Unfortunately, the resultant noisy bounding boxes +could cause corrupt supervision signals and thus diminish detection +performance. Motivated by the observation that the real ground-truth is usually +situated in the aggregation region of the proposals assigned to a noisy +ground-truth, we propose DIStribution-aware CalibratiOn (DISCO) to model the +spatial distribution of proposals for calibrating supervision signals. In +DISCO, spatial distribution modeling is performed to statistically extract the +potential locations of objects. Based on the modeled distribution, three +distribution-aware techniques, i.e., distribution-aware proposal augmentation +(DA-Aug), distribution-aware box refinement (DA-Ref), and distribution-aware +confidence estimation (DA-Est), are developed to improve classification, +localization, and interpretability, respectively. Extensive experiments on +large-scale noisy image datasets (i.e., Pascal VOC and MS-COCO) demonstrate +that DISCO can achieve state-of-the-art detection performance, especially at +high noise levels. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ☆ StofNet: Super-resolution Time of Flight Network + + +
+ Time of Flight (ToF) is a prevalent depth sensing technology in the fields of +robotics, medical imaging, and non-destructive testing. Yet, ToF sensing faces +challenges from complex ambient conditions making an inverse modelling from the +sparse temporal information intractable. This paper highlights the potential of +modern super-resolution techniques to learn varying surroundings for a reliable +and accurate ToF detection. Unlike existing models, we tailor an architecture +for sub-sample precise semi-global signal localization by combining +super-resolution with an efficient residual contraction block to balance +between fine signal details and large scale contextual information. We +consolidate research on ToF by conducting a benchmark comparison against six +state-of-the-art methods for which we employ two publicly available datasets. +This includes the release of our SToF-Chirp dataset captured by an airborne +ultrasound transducer. Results showcase the superior performance of our +proposed StofNet in terms of precision, reliability and model complexity. Our +code is available at https://github.com/hahnec/stofnet. + +
+
+ comment: pre-print +
+
+
+
+
+ + ☆ Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action + and Gesture Recognition ACM MM'23 + + +
+ RGB-D action and gesture recognition remain an interesting topic in +human-centered scene understanding, primarily due to the multiple granularities +and large variation in human motion. Although many RGB-D based action and +gesture recognition approaches have demonstrated remarkable results by +utilizing highly integrated spatio-temporal representations across multiple +modalities (i.e., RGB and depth data), they still encounter several challenges. +Firstly, vanilla 3D convolution makes it hard to capture fine-grained motion +differences between local clips under different modalities. Secondly, the +intricate nature of highly integrated spatio-temporal modeling can lead to +optimization difficulties. Thirdly, duplicate and unnecessary information can +add complexity and complicate entangled spatio-temporal modeling. To address +the above issues, we propose an innovative heuristic architecture called +Multi-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture +recognition. The proposed MFST model comprises a 3D Central Difference +Convolution Stem (CDC-Stem) module and multiple factorized spatio-temporal +stages. The CDC-Stem enriches fine-grained temporal perception, and the +multiple hierarchical spatio-temporal stages construct dimension-independent +higher-order semantic primitives. Specifically, the CDC-Stem module captures +bottom-level spatio-temporal features and passes them successively to the +following spatio-temporal factored stages to capture the hierarchical spatial +and temporal features through the Multi- Scale Convolution and Transformer +(MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans) +block. The seamless integration of these innovative designs results in a robust +spatio-temporal representation that outperforms state-of-the-art approaches on +RGB-D action and gesture recognition datasets. + +
+
+ comment: ACM MM'23 has accepted this paper +
+
+
+
+
+ + ☆ Local Distortion Aware Efficient Transformer Adaptation for Image + Quality Assessment + + +
+ Image Quality Assessment (IQA) constitutes a fundamental task within the +field of computer vision, yet it remains an unresolved challenge, owing to the +intricate distortion conditions, diverse image contents, and limited +availability of data. Recently, the community has witnessed the emergence of +numerous large-scale pretrained foundation models, which greatly benefit from +dramatically increased data and parameter capacities. However, it remains an +open problem whether the scaling law in high-level tasks is also applicable to +IQA task which is closely related to low-level clues. In this paper, we +demonstrate that with proper injection of local distortion features, a larger +pretrained and fixed foundation model performs better in IQA tasks. +Specifically, for the lack of local distortion structure and inductive bias of +vision transformer (ViT), alongside the large-scale pretrained ViT, we use +another pretrained convolution neural network (CNN), which is well known for +capturing the local structure, to extract multi-scale image features. Further, +we propose a local distortion extractor to obtain local distortion features +from the pretrained CNN and a local distortion injector to inject the local +distortion features into ViT. By only training the extractor and injector, our +method can benefit from the rich knowledge in the powerful foundation models +and achieve state-of-the-art performance on popular IQA datasets, indicating +that IQA is not only a low-level problem but also benefits from stronger +high-level features drawn from large-scale pretrained models. + +
+
+
+
+
+ + ☆ Progressive Feature Mining and External Knowledge-Assisted + Text-Pedestrian Image Retrieval + + +
+ Text-Pedestrian Image Retrieval aims to use the text describing pedestrian +appearance to retrieve the corresponding pedestrian image. This task involves +not only modality discrepancy, but also the challenge of the textual diversity +of pedestrians with the same identity. Although progress has been made in +text-pedestrian image retrieval, existing methods do not comprehensively +consider the above-mentioned problems. To address them, this paper proposes a +progressive feature mining and external knowledge-assisted feature purification +method. Specifically, we use a progressive mining mode to enable the model to +mine discriminative features from neglected information, thereby avoiding the +loss of discriminative information and improving the expression ability of +features. In addition, to further reduce the negative impact of modality +discrepancy and text diversity on cross-modal matching, we propose to use +knowledge from other samples of the same modality, i.e., external knowledge, to +enhance identity-consistent features and weaken identity-inconsistent features. +This process purifies features and alleviates the interference caused by +textual diversity and negative-sample correlation features of the same +modality. Extensive experiments on three challenging datasets demonstrate the +effectiveness and superiority of the proposed method, and the retrieval +performance even surpasses that of the large-scale model-based method on +large-scale datasets. + +
+
+
+
+
+ + ☆ RankMixup: Ranking-Based Mixup Training for Network Calibration ICCV 2023 + + +
+ Network calibration aims to accurately estimate the level of confidences, +which is particularly important for employing deep neural networks in +real-world systems. Recent approaches leverage mixup to calibrate the network's +predictions during training. However, they do not consider the problem that +mixtures of labels in mixup may not accurately represent the actual +distribution of augmented samples. In this paper, we present RankMixup, a novel +mixup-based framework alleviating the problem of the mixture of labels for +network calibration. To this end, we propose to use an ordinal ranking +relationship between raw and mixup-augmented samples as an alternative +supervisory signal to the label mixtures for network calibration. We +hypothesize that the network should estimate a higher level of confidence for +the raw samples than the augmented ones (Fig.1). To implement this idea, we +introduce a mixup-based ranking loss (MRL) that encourages lower confidences +for augmented samples compared to raw ones, maintaining the ranking +relationship. We also propose to leverage the ranking relationship among +multiple mixup-augmented samples to further improve the calibration capability. +Augmented samples with larger mixing coefficients are expected to have higher +confidences and vice versa (Fig.1). That is, the order of confidences should be +aligned with that of mixing coefficients. To this end, we introduce a novel +loss, M-NDCG, in order to reduce the number of misaligned pairs of the +coefficients and confidences. Extensive experimental results on standard +benchmarks for network calibration demonstrate the effectiveness of RankMixup. + +
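+ A small sketch of the mixup-based ranking loss (MRL) idea, under assumptions (not the RankMixup code; the margin, mixing coefficient, and the use of max-softmax as confidence are illustrative choices): the network's confidence on a raw sample should exceed its confidence on the corresponding mixup-augmented sample by a margin.
+
+ import torch
+ import torch.nn.functional as F
+
+ def mixup(x, lam=0.7):
+     perm = torch.randperm(x.size(0))
+     return lam * x + (1 - lam) * x[perm]
+
+ def mixup_ranking_loss(model, x, margin=0.1, lam=0.7):
+     conf_raw = F.softmax(model(x), dim=-1).max(dim=-1).values
+     conf_mix = F.softmax(model(mixup(x, lam)), dim=-1).max(dim=-1).values
+     # penalise cases where the augmented sample is more confident than the raw one
+     return F.relu(conf_mix - conf_raw + margin).mean()
+
+ model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
+ x = torch.randn(16, 3, 32, 32)
+ print(float(mixup_ranking_loss(model, x)))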
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Multi-Modal Multi-Task (3MT) Road Segmentation + + +
+ Multi-modal systems have the capacity of producing more reliable results than +systems with a single modality in road detection due to perceiving different +aspects of the scene. We focus on using raw sensor inputs instead of, as it is +typically done in many SOTA works, leveraging architectures that require high +pre-processing costs such as surface normals or dense depth predictions. By +using raw sensor inputs, we aim to utilize a low-cost model that minimizes both +the pre-processing and model computation costs. This study presents a +cost-effective and highly accurate solution for road segmentation by +integrating data from multiple sensors within a multi-task learning +architecture. A fusion architecture is proposed in which RGB and LiDAR depth +images constitute the inputs of the network. Another contribution of this study +is to use an IMU/GNSS (inertial measurement unit/global navigation satellite +system) inertial navigation system whose data is collected synchronously and +calibrated with a LiDAR-camera to compute aggregated dense LiDAR depth images. +It has been demonstrated by experiments on the KITTI dataset that the proposed +method offers fast and high-performance solutions. We have also shown the +performance of our method on Cityscapes where raw LiDAR data is not available. +The segmentation results obtained for both full and half resolution images are +competitive with existing methods. Therefore, we conclude that our method is +not dependent only on raw LiDAR data; rather, it can be used with different +sensor modalities. The inference times obtained in all experiments are very +promising for real-time experiments. + +
+
+
+
+
+ + ☆ Rotation-Invariant Completion Network + + +
+ Real-world point clouds usually suffer from incompleteness and display +different poses. While current point cloud completion methods excel in +reproducing complete point clouds with consistent poses as seen in the training +set, their performance tends to be unsatisfactory when handling point clouds +with diverse poses. We propose a network named Rotation-Invariant Completion +Network (RICNet), which consists of two parts: a Dual Pipeline Completion +Network (DPCNet) and an enhancing module. Firstly, DPCNet generates a coarse +complete point cloud. The feature extraction module of DPCNet can extract +consistent features, no matter if the input point cloud has undergone rotation +or translation. Subsequently, the enhancing module refines the fine-grained +details of the final generated point cloud. RICNet achieves better rotation +invariance in feature extraction and incorporates structural relationships in +man-made objects. To assess the performance of RICNet and existing methods on +point clouds with various poses, we applied random transformations to the point +clouds in the MVP dataset and conducted experiments on them. Our experiments +demonstrate that RICNet exhibits superior completion performance compared to +existing methods. + +
+
+ comment: 12 pages, accepted to PRCV 2023 (The 6th Chinese Conference on + Pattern Recognition and Computer Vision) +
+
+
+
+
+ + ☆ Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields ICCV 2023 + + +
+ Text-driven localized editing of 3D objects is particularly difficult as +locally mixing the original 3D object with the intended new object and style +effects without distorting the object's form is not a straightforward process. +To address this issue, we propose a novel NeRF-based model, Blending-NeRF, +which consists of two NeRF networks: pretrained NeRF and editable NeRF. +Additionally, we introduce new blending operations that allow Blending-NeRF to +properly edit target regions which are localized by text. By using a pretrained +vision-language aligned model, CLIP, we guide Blending-NeRF to add new objects +with varying colors and densities, modify textures, and remove parts of the +original object. Our extensive experiments demonstrate that Blending-NeRF +produces naturally and locally edited 3D objects from various text prompts. + +
+
+ comment: Accepted to ICCV 2023. The first two authors contributed equally to + this work +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ Anisotropic Hybrid Networks for liver tumor segmentation with + uncertainty quantification MICCAI + + +
+ The burden of liver tumors is substantial, as they rank as the fourth leading
+cause of cancer mortality. In the case of hepatocellular carcinoma (HCC), the
+delineation of liver and tumor on contrast-enhanced magnetic resonance imaging
+(CE-MRI) is performed to guide the treatment strategy. As this task is
+time-consuming, requires high expertise, and can be subject to inter-observer
+variability, there is a strong need for automatic tools. However, challenges
+arise from the lack of available training data, as well as the high variability
+in terms of image resolution and MRI sequence. In this work, we propose to
+compare two different pipelines based on anisotropic models to obtain the
+segmentation of the liver and tumors. The first pipeline corresponds to a
+baseline multi-class model that performs the simultaneous segmentation of the
+liver and tumor classes. In the second approach, we train two distinct binary
+models, one segmenting the liver only and the other the tumors. Our results
+show that both pipelines exhibit different strengths and weaknesses. Moreover,
+we propose an uncertainty quantification strategy allowing the identification
+of potential false positive tumor lesions. Both solutions were submitted to the
+MICCAI 2023 Atlas challenge regarding liver and tumor segmentation.
+
+ comment: Accepted for presentation at MICCAI Workshop on 2nd + Resource-Efficient Medical Image Analysis (REMIA) +
+
+
+
+
+ + ☆ Pose Modulated Avatars from Video + + +
+ It is now possible to reconstruct dynamic human motion and shape from a
+sparse set of cameras using Neural Radiance Fields (NeRF) driven by an
+underlying skeleton. However, a challenge remains in modeling the deformation
+of cloth and skin in relation to skeleton pose. Unlike existing avatar models
+that are learned implicitly or rely on a proxy surface, our approach is
+motivated by the observation that different poses necessitate unique frequency
+assignments. Neglecting this distinction yields noisy artifacts in smooth areas
+or blurs fine-grained texture and shape details in sharp regions. We develop a
+two-branch neural network that is adaptive and explicit in the frequency
+domain. The first branch is a graph neural network that models correlations
+among body parts locally, taking skeleton pose as input. The second branch
+combines these correlation features into a set of global frequencies and then
+modulates the feature encoding. Our experiments demonstrate that our network
+outperforms state-of-the-art methods in terms of preserving details and
+generalization capabilities.
+
+
+
+
+ + ☆ High-quality Image Dehazing with Diffusion Model + + +
+ Image dehazing is quite challenging in dense-haze scenarios, where very
+little of the original information remains in the hazy image. Though previous
+methods have made marvelous progress, they still suffer from information loss
+in content and color in dense-haze scenarios. The recently emerged Denoising
+Diffusion Probabilistic Model (DDPM) exhibits strong generation ability,
+showing potential for solving this problem. However, DDPM fails to consider the
+physical properties of the dehazing task, limiting its information completion
+capacity. In this work, we propose DehazeDDPM: a DDPM-based and physics-aware
+image dehazing framework that applies to complex hazy scenarios. Specifically,
+DehazeDDPM works in two stages. The former stage physically models the dehazing
+task with the Atmospheric Scattering Model (ASM), pulling the distribution
+closer to the clear data and endowing DehazeDDPM with fog-aware ability. The
+latter stage exploits the strong generation ability of DDPM to compensate for
+the huge haze-induced information loss, by working in conjunction with the
+physical modelling. Extensive experiments demonstrate that our method attains
+state-of-the-art performance on both synthetic and real-world hazy datasets.
+
+
+
+
+ + ☆ Efficient Transfer Learning in Diffusion Models via Adversarial Noise + + +
+ Diffusion Probabilistic Models (DPMs) have demonstrated substantial promise
+in image generation tasks but heavily rely on the availability of large amounts
+of training data. Previous works, such as those on GANs, have tackled the
+limited data problem by transferring pre-trained models learned with sufficient
+data. However, those methods are difficult to apply to DPMs because of the
+distinct differences between DPM-based and GAN-based methods, reflected in the
+unique iterative denoising process and the need for many timesteps of
+non-targeted noise in DPMs. In this paper, we propose a novel DPM-based
+transfer learning method, TAN, to address the limited data problem. It includes
+two strategies: similarity-guided training, which boosts transfer with a
+classifier, and adversarial noise selection, which adaptively chooses targeted
+noise based on the input image. Extensive experiments in the context of
+few-shot image generation tasks demonstrate that our method is not only
+efficient but also excels in terms of image quality and diversity when compared
+to existing GAN-based and DDPM-based methods.
+
+
+
+
+ + ☆ LongDanceDiff: Long-term Dance Generation with Conditional Diffusion + Model + + +
+ Dancing with music has always been an essential human art form for expressing
+emotion. Due to the high temporal-spatial complexity, long-term 3D realistic
+dance generation synchronized with music is challenging. Existing methods
+suffer from the freezing problem when generating long-term dances due to error
+accumulation and training-inference discrepancy. To address this, we design a
+conditional diffusion model, LongDanceDiff, for this sequence-to-sequence
+long-term dance generation, addressing the challenges of temporal coherency and
+spatial constraint. LongDanceDiff contains a transformer-based diffusion model,
+where the input is a concatenation of music, past motions, and noised future
+motions. This partial noising strategy leverages the full-attention mechanism
+and learns the dependencies among music and past motions. To enhance the
+diversity of generated dance motions and mitigate the freezing problem, we
+introduce a mutual information minimization objective that regularizes the
+dependency between past and future motions. We also address common visual
+quality issues in dance generation, such as foot sliding and unsmooth motion,
+by incorporating spatial constraints through a Global-Trajectory Modulation
+(GTM) layer and motion perceptual losses, thereby improving the smoothness and
+naturalness of motion generation. Extensive experiments demonstrate a
+significant improvement in our approach over the existing state-of-the-art
+methods. We plan to release our codes and models soon.
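The partial-noising idea described above, where only the future-motion segment of the concatenated sequence receives diffusion noise while music and past motions stay clean, can be sketched roughly as follows. The tensor shapes, the cosine-like schedule, and the frame counts are illustrative assumptions, not the authors' released code.

```python
import torch

def partially_noised_input(music, past_motion, future_motion, t, alphas_cumprod):
    """Concatenate clean conditioning (music, past motion) with a noised future segment.

    Only the future motion is diffused at timestep t; the clean tokens let full
    attention condition the denoiser on music and past motion (sketch only).
    """
    noise = torch.randn_like(future_motion)
    a_bar = alphas_cumprod[t].view(-1, 1, 1)                 # (B, 1, 1) for broadcasting
    noised_future = a_bar.sqrt() * future_motion + (1.0 - a_bar).sqrt() * noise
    # Model input: clean conditioning tokens followed by noised future tokens.
    x = torch.cat([music, past_motion, noised_future], dim=1)
    return x, noise

# Toy usage with made-up shapes: batch of 2, feature dim 64.
alphas_cumprod = torch.linspace(0.999, 0.01, steps=1000)
music = torch.randn(2, 30, 64)            # 30 music frames
past = torch.randn(2, 20, 64)             # 20 past motion frames
future = torch.randn(2, 20, 64)           # 20 future motion frames to be generated
t = torch.randint(0, 1000, (2,))
x, eps = partially_noised_input(music, past, future, t, alphas_cumprod)
print(x.shape)                            # torch.Size([2, 70, 64])
```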
+
+
+
+
+ + ☆ Boosting Diffusion Models with an Adaptive Momentum Sampler + + +
+ Diffusion probabilistic models (DPMs) have been shown to generate +high-quality images without the need for delicate adversarial training. +However, the current sampling process in DPMs is prone to violent shaking. In +this paper, we present a novel reverse sampler for DPMs inspired by the +widely-used Adam optimizer. Our proposed sampler can be readily applied to a +pre-trained diffusion model, utilizing momentum mechanisms and adaptive +updating to smooth the reverse sampling process and ensure stable generation, +resulting in outputs of enhanced quality. By implicitly reusing update +directions from early steps, our proposed sampler achieves a better balance +between high-level semantics and low-level details. Additionally, this sampler +is flexible and can be easily integrated into pre-trained DPMs regardless of +the sampler used during training. Our experimental results on multiple +benchmarks demonstrate that our proposed reverse sampler yields remarkable +improvements over different baselines. We will make the source code available. + +
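As a rough illustration of the Adam-inspired idea, the sketch below smooths a generic reverse-diffusion update with a first-moment (momentum) estimate; the `denoise_step` callable, the hyperparameters, and the way the smoothed direction is folded back in are assumptions for exposition, not the paper's exact sampler (which also uses adaptive, second-moment-style rescaling).

```python
import torch

@torch.no_grad()
def momentum_reverse_sampling(denoise_step, x_T, num_steps, beta1=0.9):
    """Reverse sampling where each raw update direction is smoothed Adam-style.

    denoise_step(x, t) is assumed to return the next, less noisy sample proposed
    by a pre-trained diffusion model; directions from earlier steps are implicitly
    reused via an exponential moving average (illustrative sketch)."""
    x = x_T
    m = torch.zeros_like(x)                    # running estimate of the update direction
    for i, t in enumerate(reversed(range(num_steps)), start=1):
        proposal = denoise_step(x, t)          # what the plain sampler would produce
        g = proposal - x                       # raw update direction at this step
        m = beta1 * m + (1.0 - beta1) * g      # momentum accumulation
        x = x + m / (1.0 - beta1 ** i)         # bias-corrected, smoothed step
    return x

# Toy usage: a stand-in "denoiser" that simply shrinks the sample towards zero.
sample = momentum_reverse_sampling(lambda x, t: 0.98 * x, torch.randn(1, 3, 8, 8), 50)
print(sample.shape)
```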
+
+
+
+
+ + ☆ Learning Bottleneck Transformer for Event Image-Voxel Feature Fusion + based Classification + + +
+ Recognizing target objects using an event-based camera has drawn increasing
+attention in recent years. Existing works usually represent the event streams
+as point clouds, voxels, images, etc., and learn the feature representations
+using various deep neural networks. Their final results may be limited by the
+following factors: monotonous modal expressions and the design of the network
+structure. To address the aforementioned challenges, this paper proposes a
+novel dual-stream framework for event representation, extraction, and fusion.
+This framework simultaneously models two common representations: event images
+and event voxels. By utilizing Transformer and Structured Graph Neural Network
+(GNN) architectures, spatial information and three-dimensional stereo
+information can be learned separately. Additionally, a bottleneck Transformer
+is introduced to facilitate the fusion of the dual-stream information.
+Extensive experiments demonstrate that our proposed framework achieves
+state-of-the-art performance on two widely used event-based classification
+datasets. The source code of this work is available at:
+https://github.com/Event-AHU/EFV_event_classification
+
+ comment: Accepted by PRCV-2023 +
+
+
+
+
+ + ☆ Synergistic Multiscale Detail Refinement via Intrinsic Supervision for + Underwater Image Enhancement + + +
+ Visual restoration of underwater scenes is crucial for visual tasks, and
+avoiding interference from underwater media has become a prominent concern. In
+this work, we present a synergistic multiscale detail refinement via intrinsic
+supervision (SMDR-IS) to recover underwater scene details. The low-degradation
+stage provides multiscale detail for the original stage, achieving synergistic
+multiscale detail refinement through feature propagation via the adaptive
+selective intrinsic supervised feature module (ASISF). ASISF is developed using
+intrinsic supervision to precisely control and guide feature transmission in
+the multi-degradation stages. ASISF improves the multiscale detail refinement
+while reducing interference from irrelevant scene information from the
+low-degradation stage. Additionally, within the multi-degradation
+encoder-decoder of SMDR-IS, we introduce a bifocal intrinsic-context attention
+module (BICA). This module is designed to effectively leverage multi-scale
+scene information found in images, using intrinsic supervision principles as
+its foundation. BICA facilitates the guidance of higher-resolution spaces by
+leveraging lower-resolution spaces, considering the significant dependency of
+underwater image restoration on spatial contextual relationships. During the
+training process, the network gains advantages from the integration of a
+multi-degradation loss function. This function serves as a constraint, enabling
+the network to effectively exploit information across various scales. When
+compared with state-of-the-art methods, SMDR-IS demonstrates its outstanding
+performance. Code will be made publicly available.
+
+
+
+
+ + ☆ OFVL-MS: Once for Visual Localization across Multiple Indoor Scenes + + +
+ In this work, we seek to predict camera poses across scenes in a multi-task
+learning manner, where we view the localization of each scene as a new task. We
+propose OFVL-MS, a unified framework that dispenses with the traditional
+practice of training a model for each individual scene and relieves gradient
+conflict induced by optimizing multiple scenes collectively, enabling efficient
+storage yet precise visual localization for all scenes. Technically, in the
+forward pass of OFVL-MS, we design a layer-adaptive sharing policy with a
+learnable score for each layer to automatically determine whether the layer is
+shared or not. Such a sharing policy empowers us to acquire task-shared
+parameters for a reduction of storage cost and task-specific parameters for
+learning scene-related features to alleviate gradient conflict. In the backward
+pass of OFVL-MS, we introduce a gradient normalization algorithm that
+homogenizes the gradient magnitude of the task-shared parameters so that all
+tasks converge at the same pace. Furthermore, a sparse penalty loss is applied
+to the learnable scores to facilitate parameter sharing for all tasks without
+performance degradation. We conduct comprehensive experiments on multiple
+benchmarks and our newly released indoor dataset LIVL, showing that OFVL-MS
+families significantly outperform the state of the art with fewer parameters.
+We also verify that OFVL-MS can generalize to a new scene with far fewer
+parameters while gaining superior localization performance.
+
+
+
+
+ + ☆ Recovering a Molecule's 3D Dynamics from Liquid-phase Electron + Microscopy Movies + + +
+ The dynamics of biomolecules are crucial for our understanding of their +functioning in living systems. However, current 3D imaging techniques, such as +cryogenic electron microscopy (cryo-EM), require freezing the sample, which +limits the observation of their conformational changes in real time. The +innovative liquid-phase electron microscopy (liquid-phase EM) technique allows +molecules to be placed in the native liquid environment, providing a unique +opportunity to observe their dynamics. In this paper, we propose TEMPOR, a +Temporal Electron MicroscoPy Object Reconstruction algorithm for liquid-phase +EM that leverages an implicit neural representation (INR) and a dynamical +variational auto-encoder (DVAE) to recover time series of molecular structures. +We demonstrate its advantages in recovering different motion dynamics from two +simulated datasets, 7bcq and Cas9. To our knowledge, our work is the first +attempt to directly recover 3D structures of a temporally-varying particle from +liquid-phase EM movies. It provides a promising new approach for studying +molecules' 3D dynamics in structural biology. + +
+
+
+
+
+ + ☆ Concept Bottleneck with Visual Concept Filtering for Explainable Medical + Image Classification MICCAI 2023 + + +
+ Interpretability is a crucial factor in building reliable models for various +medical applications. Concept Bottleneck Models (CBMs) enable interpretable +image classification by utilizing human-understandable concepts as intermediate +targets. Unlike conventional methods that require extensive human labor to +construct the concept set, recent works leveraging Large Language Models (LLMs) +for generating concepts made automatic concept generation possible. However, +those methods do not consider whether a concept is visually relevant or not, +which is an important factor in computing meaningful concept scores. Therefore, +we propose a visual activation score that measures whether the concept contains +visual cues or not, which can be easily computed with unlabeled image data. +Computed visual activation scores are then used to filter out the less visible +concepts, thus resulting in a final concept set with visually meaningful +concepts. Our experimental results show that adopting the proposed visual +activation score for concept filtering consistently boosts performance compared +to the baseline. Moreover, qualitative analyses also validate that visually +relevant concepts are successfully selected with the visual activation score. + +
+
+ comment: Accepted to MedAGI Workshop at MICCAI 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ AMSP-UOD: When Vortex Convolution and Stochastic Perturbation Meet + Underwater Object Detection + + +
+ In this paper, we present a novel Amplitude-Modulated Stochastic Perturbation +and Vortex Convolutional Network, AMSP-UOD, designed for underwater object +detection. AMSP-UOD specifically addresses the impact of non-ideal imaging +factors on detection accuracy in complex underwater environments. To mitigate +the influence of noise on object detection performance, we propose AMSP Vortex +Convolution (AMSP-VConv) to disrupt the noise distribution, enhance feature +extraction capabilities, effectively reduce parameters, and improve network +robustness. We design the Feature Association Decoupling Cross Stage Partial +(FAD-CSP) module, which strengthens the association of long and short-range +features, improving the network performance in complex underwater environments. +Additionally, our sophisticated post-processing method, based on non-maximum +suppression with aspect-ratio similarity thresholds, optimizes detection in +dense scenes, such as waterweed and schools of fish, improving object detection +accuracy. Extensive experiments on the URPC and RUOD datasets demonstrate that +our method outperforms existing state-of-the-art methods in terms of accuracy +and noise immunity. AMSP-UOD proposes an innovative solution with the potential +for real-world applications. Code will be made publicly available. + +
+
+
+
+
+ + ☆ LFS-GAN: Lifelong Few-Shot Image Generation ICCV 2023 + + +
+ We address a challenging lifelong few-shot image generation task for the +first time. In this situation, a generative model learns a sequence of tasks +using only a few samples per task. Consequently, the learned model encounters +both catastrophic forgetting and overfitting problems at a time. Existing +studies on lifelong GANs have proposed modulation-based methods to prevent +catastrophic forgetting. However, they require considerable additional +parameters and cannot generate high-fidelity and diverse images from limited +data. On the other hand, the existing few-shot GANs suffer from severe +catastrophic forgetting when learning multiple tasks. To alleviate these +issues, we propose a framework called Lifelong Few-Shot GAN (LFS-GAN) that can +generate high-quality and diverse images in lifelong few-shot image generation +task. Our proposed framework learns each task using an efficient task-specific +modulator - Learnable Factorized Tensor (LeFT). LeFT is rank-constrained and +has a rich representation ability due to its unique reconstruction technique. +Furthermore, we propose a novel mode seeking loss to improve the diversity of +our model in low-data circumstances. Extensive experiments demonstrate that the +proposed LFS-GAN can generate high-fidelity and diverse images without any +forgetting and mode collapse in various domains, achieving state-of-the-art in +lifelong few-shot image generation task. Surprisingly, we find that our LFS-GAN +even outperforms the existing few-shot GANs in the few-shot image generation +task. The code is available at Github. + +
+
+ comment: 20 pages, 19 figures, 14 tables, ICCV 2023 Poster +
+
+
+
+
+ + ☆ Semantic-Aware Implicit Template Learning via Part Deformation + Consistency ICCV + + +
+ Learning implicit templates as neural fields has recently shown impressive +performance in unsupervised shape correspondence. Despite the success, we +observe current approaches, which solely rely on geometric information, often +learn suboptimal deformation across generic object shapes, which have high +structural variability. In this paper, we highlight the importance of part +deformation consistency and propose a semantic-aware implicit template learning +framework to enable semantically plausible deformation. By leveraging semantic +prior from a self-supervised feature extractor, we suggest local conditioning +with novel semantic-aware deformation code and deformation consistency +regularizations regarding part deformation, global deformation, and global +scaling. Our extensive experiments demonstrate the superiority of the proposed +method over baselines in various tasks: keypoint transfer, part label transfer, +and texture transfer. More interestingly, our framework shows a larger +performance gain under more challenging settings. We also provide qualitative +analyses to validate the effectiveness of semantic-aware deformation. The code +is available at https://github.com/mlvlab/PDC. + +
+
+ comment: ICCV camera-ready version +
+
+
+
+
+ + ☆ ACLS: Adaptive and Conditional Label Smoothing for Network Calibration ICCV 2023 + + +
+ We address the problem of network calibration, i.e., adjusting the
+miscalibrated confidences of deep neural networks. Many approaches to network
+calibration adopt a regularization-based method that exploits a regularization
+term to smooth the miscalibrated confidences. Although these approaches have
+shown their effectiveness in calibrating networks, there is still a lack of
+understanding of the underlying principles of regularization in terms of
+network calibration. We present in this paper an in-depth analysis of existing
+regularization-based methods, providing a better understanding of how they
+affect network calibration. Specifically, we have observed that 1) the
+regularization-based methods can be interpreted as variants of label smoothing,
+and 2) they do not always behave desirably. Based on the analysis, we introduce
+a novel loss function, dubbed ACLS, that unifies the merits of existing
+regularization methods while avoiding their limitations. We show extensive
+experimental results for image classification and semantic segmentation on
+standard benchmarks, including CIFAR10, Tiny-ImageNet, ImageNet, and PASCAL
+VOC, demonstrating the effectiveness of our loss function.
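The abstract frames existing calibration regularizers as variants of label smoothing; a minimal baseline of that family (uniform label smoothing in the cross-entropy target) is sketched below for reference. This is the generic technique being analyzed, not the proposed ACLS loss.

```python
import torch
import torch.nn.functional as F

def label_smoothing_ce(logits, targets, epsilon=0.1):
    """Cross-entropy against a smoothed target distribution.

    The one-hot target is mixed with a uniform distribution; this is the vanilla
    regularizer that many calibration methods can be seen as modifying.
    """
    num_classes = logits.size(-1)
    log_probs = F.log_softmax(logits, dim=-1)
    smooth_targets = torch.full_like(log_probs, epsilon / (num_classes - 1))
    smooth_targets.scatter_(-1, targets.unsqueeze(-1), 1.0 - epsilon)
    return -(smooth_targets * log_probs).sum(dim=-1).mean()

logits = torch.randn(4, 10)
targets = torch.randint(0, 10, (4,))
print(label_smoothing_ce(logits, targets).item())
```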
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data + + +
+ Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial
+dependence between different brain regions, and the graph pooling operator in
+GCNs is key to enhancing the representation learning capability and acquiring
+abnormal brain maps. However, the majority of existing research designs graph
+pooling operators only from the perspective of nodes while disregarding the
+original edge features, which not only confines the application scenarios of
+graph pooling, but also diminishes its ability to capture critical
+substructures. In this study, a clustering graph pooling method that first
+supports multidimensional edge features, called Edge-aware hard clustering
+graph pooling (EHCPool), is developed. EHCPool proposes the first
+'Edge-to-node' score evaluation criterion based on edge features to assess node
+feature significance. To more effectively capture the critical subgraphs, a
+novel Iteration n-top strategy is further designed to adaptively learn sparse
+hard clustering assignments for graphs. Subsequently, an innovative N-E
+Aggregation strategy is presented to aggregate node and edge feature
+information in each independent subgraph. The proposed model was evaluated on
+public multi-site brain imaging datasets and yielded state-of-the-art
+performance. We believe this method is the first deep learning tool with the
+potential to probe different types of abnormal functional brain networks from a
+data-driven perspective.
+
+
+
+
+ + ☆ Rethinking Data Perturbation and Model Stabilization for Semi-supervised + Medical Image Segmentation + + +
+ Studies on semi-supervised medical image segmentation (SSMIS) have seen fast
+progress recently. Due to the limited labelled data, SSMIS methods mainly focus
+on effectively leveraging unlabeled data to enhance the segmentation
+performance. However, despite their promising performance, current
+state-of-the-art methods often prioritize integrating complex techniques and
+loss terms rather than addressing the core challenges of semi-supervised
+scenarios directly. We argue that the key to SSMIS lies in generating
+substantial and appropriate prediction disagreement on unlabeled data. To this
+end, we emphasize the critical importance of data perturbation and model
+stabilization in semi-supervised segmentation, and propose a simple yet
+effective approach to boost SSMIS performance significantly, dubbed DPMS.
+Specifically, we first revisit SSMIS from three distinct perspectives: the
+data, the model, and the loss, and conduct a comprehensive study of
+corresponding strategies to examine their effectiveness. Based on these
+examinations, we then propose DPMS, which adopts a plain teacher-student
+framework with a standard supervised loss and unsupervised consistency loss. To
+produce appropriate prediction disagreements, DPMS perturbs the unlabeled data
+via strong augmentations to enlarge prediction disagreements considerably. On
+the other hand, using an EMA teacher when strong augmentation is applied does
+not necessarily improve performance. DPMS further utilizes forwarding-twice and
+momentum updating strategies for normalization statistics to stabilize the
+training on unlabeled data effectively. Despite its simplicity, DPMS can obtain
+new state-of-the-art performance on the public 2D ACDC and 3D LA datasets
+across various semi-supervised settings, e.g., obtaining a remarkable 22.62%
+improvement against previous SOTA on ACDC with 5% labels.
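A minimal teacher-student consistency step of the kind the abstract describes (supervised loss on labeled data, consistency against pseudo-labels on strongly augmented unlabeled data) might look like the following. The augmentation, the loss weighting, and the classification stand-in for per-pixel segmentation are placeholders; the forwarding-twice normalization trick is only indicated in a comment.

```python
import torch
import torch.nn.functional as F

def dpms_like_step(student, teacher, labeled, labels, unlabeled, strong_aug, lam=1.0):
    """One training step: supervised CE plus consistency between the teacher on the
    original unlabeled batch and the student on a strongly augmented copy (sketch)."""
    # Supervised branch on labeled data.
    sup_loss = F.cross_entropy(student(labeled), labels)

    # Unsupervised branch: teacher pseudo-labels, student on the perturbed view.
    with torch.no_grad():
        pseudo = teacher(unlabeled).argmax(dim=1)
    cons_loss = F.cross_entropy(student(strong_aug(unlabeled)), pseudo)
    # (DPMS additionally stabilises normalization statistics, e.g. by forwarding
    #  the unlabeled batch twice with momentum updates; omitted here.)
    return sup_loss + lam * cons_loss

# Toy usage with a tiny 5-class classifier standing in for a segmentation model.
net = torch.nn.Linear(16, 5)
loss = dpms_like_step(net, net, torch.randn(8, 16), torch.randint(0, 5, (8,)),
                      torch.randn(8, 16), lambda x: x + 0.3 * torch.randn_like(x))
print(loss.item())
```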
+
+ comment: Code and logs are available at https://github.com/ZhenZHAO/DPMS +
+
+
+
+
+ + ☆ Camera-Driven Representation Learning for Unsupervised Domain Adaptive + Person Re-identification ICCV 2023 + + +
+ We present a novel unsupervised domain adaptation method for person
+re-identification (reID) that generalizes a model trained on a labeled source
+domain to an unlabeled target domain. We introduce a camera-driven curriculum
+learning (CaCL) framework that leverages camera labels of person images to
+transfer knowledge from source to target domains progressively. To this end, we
+divide the target domain dataset into multiple subsets based on the camera
+labels, and initially train our model with a single subset (i.e., images
+captured by a single camera). We then gradually exploit more subsets for
+training, according to a curriculum sequence obtained with a camera-driven
+scheduling rule. The scheduler considers maximum mean discrepancies (MMD)
+between each subset and the source domain dataset, such that the subset closer
+to the source domain is exploited earlier within the curriculum. For each
+curriculum sequence, we generate pseudo labels of person images in a target
+domain to train a reID model in a supervised way. We have observed that the
+pseudo labels are highly biased toward cameras, suggesting that person images
+obtained from the same camera are likely to have the same pseudo labels, even
+for different IDs. To address the camera bias problem, we also introduce a
+camera-diversity (CD) loss encouraging person images of the same pseudo label,
+but captured across various cameras, to contribute more to discriminative
+feature learning, providing person representations robust to inter-camera
+variations. Experimental results on standard benchmarks, including real-to-real
+and synthetic-to-real scenarios, demonstrate the effectiveness of our
+framework.
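The camera-driven scheduling rule orders per-camera subsets by their maximum mean discrepancy to the source features. A plain RBF-kernel MMD and the resulting ordering could be sketched as below; the feature extractor, the kernel choice, and the bandwidth are assumptions, and the actual curriculum training loop is outside the snippet.

```python
import torch

def rbf_mmd(x, y, sigma=1.0):
    """Biased MMD^2 estimate with an RBF kernel between two feature sets."""
    def kernel(a, b):
        d2 = torch.cdist(a, b).pow(2)
        return torch.exp(-d2 / (2 * sigma ** 2))
    return kernel(x, x).mean() + kernel(y, y).mean() - 2 * kernel(x, y).mean()

def curriculum_order(source_feats, camera_subsets):
    """Return camera ids sorted so subsets closest to the source domain come first."""
    mmds = {cam: rbf_mmd(source_feats, feats).item() for cam, feats in camera_subsets.items()}
    return sorted(mmds, key=mmds.get)

# Toy usage: 3 target cameras with random 32-d features, increasingly shifted.
src = torch.randn(100, 32)
subsets = {c: torch.randn(50, 32) + 0.1 * c for c in range(3)}
print(curriculum_order(src, subsets))
```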
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ HashReID: Dynamic Network with Binary Codes for Efficient Person + Re-identification WACV 2024 + + +
+ Biometric applications, such as person re-identification (ReID), are often
+deployed on energy-constrained devices. While recent ReID methods prioritize
+high retrieval performance, they often come with large computational costs and
+high search time, rendering them less practical in real-world settings. In this
+work, we propose an input-adaptive network with multiple exit blocks that can
+terminate computation early if the retrieval is straightforward or noisy,
+saving substantial computation. To assess the complexity of the input, we
+introduce a temporal-based classifier driven by a new training strategy.
+Furthermore, we adopt a binary hash code generation approach instead of relying
+on continuous-valued features, which significantly improves the search process
+by a factor of 20. To ensure similarity preservation, we utilize a new ranking
+regularizer that bridges the gap between continuous and binary features.
+Extensive analysis of our proposed method is conducted on three datasets:
+Market1501, MSMT17 (Multi-Scene Multi-Time), and the BGC1 (BRIAR Government
+Collection). Using our approach, more than 70% of the samples with compact hash
+codes exit early on the Market1501 dataset, saving 80% of the network's
+computational cost and improving over other hash-based methods by 60%. These
+results demonstrate a significant improvement over dynamic networks and
+showcase comparable accuracy performance to conventional ReID methods. Code
+will be made available.
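The binary-code retrieval that gives the claimed search speed-up amounts to sign-binarizing embeddings and ranking the gallery by Hamming distance; a small NumPy sketch with made-up dimensions follows. The actual hash head, early-exit blocks, and ranking regularizer are not shown.

```python
import numpy as np

def to_binary_codes(features):
    """Binarize continuous embeddings into packed bit codes via sign thresholding."""
    bits = (features > 0).astype(np.uint8)
    return np.packbits(bits, axis=1)

def hamming_rank(query_code, gallery_codes):
    """Rank gallery entries by Hamming distance to the query (smaller = closer)."""
    xor = np.bitwise_xor(gallery_codes, query_code)
    dists = np.unpackbits(xor, axis=1).sum(axis=1)
    return np.argsort(dists)

# Toy usage: 1 query against 1000 gallery items with 128-bit codes.
rng = np.random.default_rng(0)
gallery = to_binary_codes(rng.standard_normal((1000, 128)))
query = to_binary_codes(rng.standard_normal((1, 128)))
print(hamming_rank(query, gallery)[:5])   # indices of the 5 nearest gallery items
```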
+
+ comment: WACV 2024 +
+
+
+
+
+ + ☆ Exploring the Optimization Objective of One-Class Classification for + Anomaly Detection + + +
+ One-class classification (OCC) is a longstanding method for anomaly
+detection. With the powerful representation capability of the pre-trained
+backbone, OCC methods have witnessed significant performance improvements.
+Typically, most of these OCC methods employ transfer learning to enhance the
+discriminative nature of the pre-trained backbone's features, thus achieving
+remarkable efficacy. While most current approaches emphasize feature transfer
+strategies, we argue that the optimization objective space within OCC methods
+could also be an underlying critical factor influencing performance. In this
+work, we conducted a thorough investigation into the optimization objective of
+OCC. Through rigorous theoretical analysis and derivation, we unveil a key
+insight: any space with a suitable norm can serve as an equivalent substitute
+for the hypersphere center, without relying on the distribution assumption of
+training samples. Further, we provide guidelines for determining the feasible
+domain of norms for the OCC optimization objective. This novel insight sparks a
+simple and data-agnostic deep one-class classification method. Our method is
+straightforward, with a single 1x1 convolutional layer as a trainable projector
+and any space with a suitable norm as the optimization objective. Extensive
+experiments validate the reliability and efficacy of our findings and the
+corresponding methodology, resulting in state-of-the-art performance in both
+one-class classification and industrial vision anomaly detection and
+segmentation tasks.
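One plausible, deliberately simplified reading of "a single 1x1 convolutional layer as a trainable projector and any space with a suitable norm as the optimization objective" is sketched below in the style of deep one-class objectives; the particular norm, output width, and pooling are assumptions rather than the paper's exact configuration.

```python
import torch
import torch.nn as nn

class OneClassHead(nn.Module):
    """1x1 conv projector on top of frozen backbone features; training simply
    minimizes a norm of the projected features (generic recipe, sketch only)."""
    def __init__(self, in_channels, out_channels=64):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, feats):
        z = self.proj(feats)                               # (B, C', H, W)
        return z.flatten(1).norm(p=2, dim=1).mean()        # scalar norm-based loss

feats = torch.randn(4, 256, 14, 14)                        # pretend backbone features
head = OneClassHead(256)
print(head(feats).item())
```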
+
+
 comment: 15 pages, 10 figures
+
+
+
+
+ + ☆ Age Prediction From Face Images Via Contrastive Learning + + +
+ This paper presents a novel approach for accurately estimating age from face +images, which overcomes the challenge of collecting a large dataset of +individuals with the same identity at different ages. Instead, we leverage +readily available face datasets of different people at different ages and aim +to extract age-related features using contrastive learning. Our method +emphasizes these relevant features while suppressing identity-related features +using a combination of cosine similarity and triplet margin losses. We +demonstrate the effectiveness of our proposed approach by achieving +state-of-the-art performance on two public datasets, FG-NET and MORPH-II. + +
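One plausible reading of "a combination of cosine similarity and triplet margin losses" is sketched below: a cosine term pulls same-age embeddings together while a triplet margin term keeps different-age samples further from the anchor. The pair/triplet mining strategy and loss weights are assumptions.

```python
import torch
import torch.nn.functional as F

def age_contrastive_loss(anchor, pos_same_age, neg_diff_age, lam=1.0, margin=0.2):
    """Cosine term emphasizes age-related similarity; a triplet margin term pushes
    different-age (identity-confounded) samples away from the anchor (sketch)."""
    cos_term = 1.0 - F.cosine_similarity(anchor, pos_same_age, dim=1).mean()
    triplet = F.triplet_margin_loss(anchor, pos_same_age, neg_diff_age, margin=margin)
    return cos_term + lam * triplet

# Toy usage with random, L2-normalized 128-d embeddings.
emb = lambda n: F.normalize(torch.randn(n, 128), dim=1)
print(age_contrastive_loss(emb(8), emb(8), emb(8)).item())
```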
+
+ comment: MVA2023 +
+
+
+
+
+ + ☆ Does Physical Adversarial Example Really Matter to Autonomous Driving? + Towards System-Level Effect of Adversarial Object Evasion Attack ICCV 2023 + + +
+ In autonomous driving (AD), accurate perception is indispensable to achieving
+safe and secure driving. Due to its safety-criticality, the security of AD
+perception has been widely studied. Among different attacks on AD perception,
+the physical adversarial object evasion attacks are especially severe. However,
+we find that the existing literature only evaluates attack effects at the
+targeted AI component level but not at the system level, i.e., with the entire
+system semantics and context such as the full AD pipeline. This raises a
+critical research question: can these existing attack designs effectively
+achieve system-level attack effects (e.g., traffic rule violations) in the
+real-world AD context? In this work, we conduct the first measurement study on
+whether and how effectively the existing designs can lead to system-level
+effects, especially for the STOP sign-evasion attacks due to their popularity
+and severity. Our evaluation results show that all the representative prior
+works cannot achieve any system-level effects. We observe two design
+limitations in the prior works: 1) physical model-inconsistent object size
+distribution in pixel sampling and 2) lack of vehicle plant model and AD system
+model consideration. Then, we propose SysAdv, a novel system-driven attack
+design in the AD context, and our evaluation results show that the system-level
+effects can be significantly improved, i.e., the violation rate increases by
+around 70%.
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ A Unified Framework for 3D Point Cloud Visual Grounding + + +
+ 3D point cloud visual grounding plays a critical role in 3D scene
+comprehension, encompassing 3D referring expression comprehension (3DREC) and
+segmentation (3DRES). We argue that 3DREC and 3DRES should be unified in one
+framework, which is also a natural progression in the community. To explain,
+3DREC can help 3DRES locate the referent, while 3DRES can also facilitate 3DREC
+via more fine-grained language-visual alignment. To achieve this, this paper
+takes the initial step of integrating 3DREC and 3DRES into a unified framework,
+termed 3D Referring Transformer (3DRefTR). Its key idea is to build upon a
+mature 3DREC model and leverage the ready query embeddings and visual tokens
+from the 3DREC model to construct a dedicated mask branch. Specifically, we
+propose the Superpoint Mask Branch, which serves a dual purpose: i) By
+leveraging the heterogeneous CPU-GPU parallelism, while the GPU is occupied
+generating visual tokens, the CPU concurrently produces superpoints,
+equivalently accomplishing the upsampling computation; ii) By harnessing the
+inherent association between the superpoints and the point cloud, it eliminates
+the heavy computational overhead on the high-resolution visual features for
+upsampling. This elegant design enables 3DRefTR to achieve both well-performing
+3DRES and 3DREC capacities with only a 6% additional latency compared to the
+original 3DREC model. Empirical evaluations affirm the superiority of 3DRefTR.
+Specifically, on the ScanRefer dataset, 3DRefTR surpasses the state-of-the-art
+3DRES method by 12.43% in mIoU and improves upon the SOTA 3DREC method by 0.6%
+Acc@0.25IoU.
+
+
+
+
+ + ☆ SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal + Targets ICCV 2023 + + +
+ Scene understanding using multi-modal data is necessary in many applications, +e.g., autonomous navigation. To achieve this in a variety of situations, +existing models must be able to adapt to shifting data distributions without +arduous data annotation. Current approaches assume that the source data is +available during adaptation and that the source consists of paired multi-modal +data. Both these assumptions may be problematic for many applications. Source +data may not be available due to privacy, security, or economic concerns. +Assuming the existence of paired multi-modal data for training also entails +significant data collection costs and fails to take advantage of widely +available freely distributed pre-trained uni-modal models. In this work, we +relax both of these assumptions by addressing the problem of adapting a set of +models trained independently on uni-modal data to a target domain consisting of +unlabeled multi-modal data, without having access to the original source +dataset. Our proposed approach solves this problem through a switching +framework which automatically chooses between two complementary methods of +cross-modal pseudo-label fusion -- agreement filtering and entropy weighting -- +based on the estimated domain gap. We demonstrate our work on the semantic +segmentation problem. Experiments across seven challenging adaptation scenarios +verify the efficacy of our approach, achieving results comparable to, and in +some cases outperforming, methods which assume access to source data. Our +method achieves an improvement in mIoU of up to 12% over competing baselines. +Our code is publicly available at https://github.com/csimo005/SUMMIT. + +
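The switching framework chooses between agreement filtering and entropy weighting when fusing the two uni-modal predictions into pseudo-labels. The minimal fusion logic below illustrates both options on per-pixel softmax outputs; the domain-gap estimate that drives the switch, the confidence threshold, and the ignore-label convention are placeholders.

```python
import torch

def fuse_pseudo_labels(prob_a, prob_b, use_agreement, conf_thresh=0.9):
    """Fuse two modality-specific softmax maps (B, C, H, W) into pseudo-labels.

    Agreement filtering keeps only pixels where both models agree confidently;
    entropy weighting mixes the two distributions, trusting the lower-entropy one.
    Ignored pixels are marked with label -1 (illustrative sketch only).
    """
    if use_agreement:
        pred_a, pred_b = prob_a.argmax(1), prob_b.argmax(1)
        conf = torch.maximum(prob_a.max(1).values, prob_b.max(1).values)
        return torch.where((pred_a == pred_b) & (conf > conf_thresh), pred_a,
                           torch.full_like(pred_a, -1))
    ent_a = -(prob_a * prob_a.clamp_min(1e-8).log()).sum(1, keepdim=True)
    ent_b = -(prob_b * prob_b.clamp_min(1e-8).log()).sum(1, keepdim=True)
    w_a = ent_b / (ent_a + ent_b + 1e-8)          # lower entropy -> higher weight
    fused = w_a * prob_a + (1.0 - w_a) * prob_b
    return fused.argmax(1)

# Toy usage with 19-class softmax maps on an 8x8 grid.
p = lambda: torch.softmax(torch.randn(1, 19, 8, 8), dim=1)
print(fuse_pseudo_labels(p(), p(), use_agreement=True).shape)
```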
+
+ comment: 12 pages, 5 figures, 9 tables, ICCV 2023 +
+
+
+
+
+ + ☆ Integrated Image and Location Analysis for Wound Classification: A Deep + Learning Approach + + +
+ The global burden of acute and chronic wounds presents a compelling case for +enhancing wound classification methods, a vital step in diagnosing and +determining optimal treatments. Recognizing this need, we introduce an +innovative multi-modal network based on a deep convolutional neural network for +categorizing wounds into four categories: diabetic, pressure, surgical, and +venous ulcers. Our multi-modal network uses wound images and their +corresponding body locations for more precise classification. A unique aspect +of our methodology is incorporating a body map system that facilitates accurate +wound location tagging, improving upon traditional wound image classification +techniques. A distinctive feature of our approach is the integration of models +such as VGG16, ResNet152, and EfficientNet within a novel architecture. This +architecture includes elements like spatial and channel-wise +Squeeze-and-Excitation modules, Axial Attention, and an Adaptive Gated +Multi-Layer Perceptron, providing a robust foundation for classification. Our +multi-modal network was trained and evaluated on two distinct datasets +comprising relevant images and corresponding location information. Notably, our +proposed network outperformed traditional methods, reaching an accuracy range +of 74.79% to 100% for Region of Interest (ROI) without location +classifications, 73.98% to 100% for ROI with location classifications, and +78.10% to 100% for whole image classifications. This marks a significant +enhancement over previously reported performance metrics in the literature. Our +results indicate the potential of our multi-modal network as an effective +decision-support tool for wound image classification, paving the way for its +application in various clinical contexts. + +
+
+
+
+
+ + ☆ Motion-to-Matching: A Mixed Paradigm for 3D Single Object Tracking + + +
+ 3D single object tracking with LiDAR points is an important task in the
+computer vision field. Previous methods usually adopt the matching-based or
+motion-centric paradigms to estimate the current target status. However, the
+former is sensitive to similar distractors and the sparseness of point clouds
+because it relies on appearance matching, while the latter usually focuses on
+short-term motion clues (e.g., two frames) and ignores the long-term motion
+pattern of the target. To address these issues, we propose a mixed paradigm
+with two stages, named MTM-Tracker, which combines motion modeling with feature
+matching into a single network. Specifically, in the first stage, we exploit
+the continuous historical boxes as a motion prior and propose an
+encoder-decoder structure to locate the target coarsely. Then, in the second
+stage, we introduce a feature interaction module to extract motion-aware
+features from consecutive point clouds and match them to refine target movement
+as well as regress other target states. Extensive experiments validate that our
+paradigm achieves competitive performance on large-scale datasets (70.9% in
+KITTI and 51.70% in NuScenes). The code will be open soon at
+https://github.com/LeoZhiheng/MTM-Tracker.git.
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ☆ Semi-Supervised Learning via Weight-aware Distillation under Class + Distribution Mismatch ICCV 2023 + + +
+ Semi-Supervised Learning (SSL) under class distribution mismatch aims to +tackle a challenging problem wherein unlabeled data contain lots of unknown +categories unseen in the labeled ones. In such mismatch scenarios, traditional +SSL suffers severe performance damage due to the harmful invasion of the +instances with unknown categories into the target classifier. In this study, by +strict mathematical reasoning, we reveal that the SSL error under class +distribution mismatch is composed of pseudo-labeling error and invasion error, +both of which jointly bound the SSL population risk. To alleviate the SSL +error, we propose a robust SSL framework called Weight-Aware Distillation (WAD) +that, by weights, selectively transfers knowledge beneficial to the target task +from unsupervised contrastive representation to the target classifier. +Specifically, WAD captures adaptive weights and high-quality pseudo labels to +target instances by exploring point mutual information (PMI) in representation +space to maximize the role of unlabeled data and filter unknown categories. +Theoretically, we prove that WAD has a tight upper bound of population risk +under class distribution mismatch. Experimentally, extensive results +demonstrate that WAD outperforms five state-of-the-art SSL approaches and one +standard baseline on two benchmark datasets, CIFAR10 and CIFAR100, and an +artificial cross-dataset. The code is available at +https://github.com/RUC-DWBI-ML/research/tree/main/WAD-master. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ CoC-GAN: Employing Context Cluster for Unveiling a New Pathway in Image + Generation + + +
+ Image generation tasks are traditionally undertaken using Convolutional +Neural Networks (CNN) or Transformer architectures for feature aggregating and +dispatching. Despite the frequent application of convolution and attention +structures, these structures are not fundamentally required to solve the +problem of instability and the lack of interpretability in image generation. In +this paper, we propose a unique image generation process premised on the +perspective of converting images into a set of point clouds. In other words, we +interpret an image as a set of points. As such, our methodology leverages +simple clustering methods named Context Clustering (CoC) to generate images +from unordered point sets, which defies the convention of using convolution or +attention mechanisms. Hence, we exclusively depend on this clustering +technique, combined with the multi-layer perceptron (MLP) in a generative +model. Furthermore, we implement the integration of a module termed the 'Point +Increaser' for the model. This module is just an MLP tasked with generating +additional points for clustering, which are subsequently integrated within the +paradigm of the Generative Adversarial Network (GAN). We introduce this model +with the novel structure as the Context Clustering Generative Adversarial +Network (CoC-GAN), which offers a distinctive viewpoint in the domain of +feature aggregating and dispatching. Empirical evaluations affirm that our +CoC-GAN, devoid of convolution and attention mechanisms, exhibits outstanding +performance. Its interpretability, endowed by the CoC module, also allows for +visualization in our experiments. The promising results underscore the +feasibility of our method and thus warrant future investigations of applying +Context Clustering to more novel and interpretable image generation. + +
+
+
+
+
+ + ☆ Compressed Models Decompress Race Biases: What Quantized Models Forget + for Fair Face Recognition + + +
+ With the ever-growing complexity of deep learning models for face
+recognition, it becomes hard to deploy these systems in real life. Researchers
+have two options: 1) use smaller models; 2) compress their current models.
+Since the usage of smaller models might lead to concerning biases, compression
+gains relevance. However, compression might also be responsible for an increase
+in the bias of the final model. We investigate the overall performance, the
+performance on each ethnicity subgroup, and the racial bias of a
+state-of-the-art quantization approach when used with synthetic and real data.
+This analysis provides a few more details on the potential benefits of
+performing quantization with synthetic data, for instance, the reduction of
+biases on the majority of test scenarios. We tested five distinct architectures
+and three different training datasets. The models were evaluated on a fourth
+dataset, which was collected to infer and compare the performance of face
+recognition models across different ethnicities.
+
+ comment: Accepted for Oral at BIOSIG 2023 +
+
+
+
+
+ + ♻ ☆ Randomized Quantization: A Generic Augmentation for Data Agnostic + Self-supervised Learning ICCV 2023 + + +
+ Self-supervised representation learning follows a paradigm of withholding
+some part of the data and tasking the network to predict it from the remaining
+part. Among many techniques, data augmentation lies at the core for creating
+the information gap. Towards this end, masking has emerged as a generic and
+powerful tool where content is withheld along the sequential dimension, e.g.,
+spatial in images, temporal in audio, and syntactic in language. In this paper,
+we explore the orthogonal channel dimension for generic data augmentation by
+exploiting precision redundancy. The data for each channel is quantized through
+a non-uniform quantizer, with the quantized value sampled randomly within
+randomly sampled quantization bins. From another perspective, quantization is
+analogous to channel-wise masking, as it removes the information within each
+bin, but preserves the information across bins. Our approach significantly
+surpasses existing generic data augmentation methods, while showing on par
+performance against modality-specific augmentations. We comprehensively
+evaluate our approach on vision, audio, 3D point clouds, as well as the DABS
+benchmark which is comprised of various data modalities. The code is available
+at https://github.com/microsoft/random_quantize.
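The channel-wise randomized quantization augmentation can be approximated by sampling random bin boundaries per channel and snapping each value to a random point inside its bin; the sketch below follows that description with assumed value ranges and bin counts, and is not the released implementation linked above.

```python
import torch

def randomized_quantize(x, num_bins=8):
    """Per-channel non-uniform quantization with randomly placed bin edges.

    x: (C, N) values per channel. Each value is replaced by a random sample drawn
    from within its (randomly sized) bin, removing fine detail within bins while
    keeping coarse cross-bin structure (data-agnostic augmentation sketch).
    """
    out = torch.empty_like(x)
    for c in range(x.size(0)):
        lo, hi = x[c].min(), x[c].max()
        # Random interior edges define non-uniform bins spanning [lo, hi].
        edges = torch.sort(torch.rand(num_bins - 1) * (hi - lo) + lo).values
        edges = torch.cat([lo.reshape(1), edges, hi.reshape(1)])
        idx = torch.bucketize(x[c], edges[1:-1])                 # bin index per element
        left, right = edges[idx], edges[idx + 1]
        out[c] = left + torch.rand_like(x[c]) * (right - left)   # random value in bin
    return out

# Toy usage on a flattened 3-channel image.
img = torch.rand(3, 32 * 32)
print(randomized_quantize(img).shape)
```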
+
+
 comment: Accepted by ICCV 2023. The code is available at
+ https://github.com/microsoft/random_quantize
+
+
+
+
+
+ + ♻ ☆ Back to Optimization: Diffusion-based Zero-Shot 3D Human Pose Estimation + + +
+ Learning-based methods have dominated the 3D human pose estimation (HPE) +tasks with significantly better performance in most benchmarks than traditional +optimization-based methods. Nonetheless, 3D HPE in the wild is still the +biggest challenge of learning-based models, whether with 2D-3D lifting, +image-to-3D, or diffusion-based methods, since the trained networks implicitly +learn camera intrinsic parameters and domain-based 3D human pose distributions +and estimate poses by statistical average. On the other hand, the +optimization-based methods estimate results case-by-case, which can predict +more diverse and sophisticated human poses in the wild. By combining the +advantages of optimization-based and learning-based methods, we propose the +Zero-shot Diffusion-based Optimization (ZeDO) pipeline for 3D HPE to solve the +problem of cross-domain and in-the-wild 3D HPE. Our multi-hypothesis ZeDO +achieves state-of-the-art (SOTA) performance on Human3.6M as minMPJPE $51.4$mm +without training with any 2D-3D or image-3D pairs. Moreover, our +single-hypothesis ZeDO achieves SOTA performance on 3DPW dataset with PA-MPJPE +$42.6$mm on cross-dataset evaluation, which even outperforms learning-based +methods trained on 3DPW. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Selective Labeling for More Effective Semi-Supervised + Learning ECCV 2022 + + +
+ Given an unlabeled dataset and an annotation budget, we study how to +selectively label a fixed number of instances so that semi-supervised learning +(SSL) on such a partially labeled dataset is most effective. We focus on +selecting the right data to label, in addition to usual SSL's propagating +labels from labeled data to the rest unlabeled data. This instance selection +task is challenging, as without any labeled data we do not know what the +objective of learning should be. Intuitively, no matter what the downstream +task is, instances to be labeled must be representative and diverse: The former +would facilitate label propagation to unlabeled data, whereas the latter would +ensure coverage of the entire dataset. We capture this idea by selecting +cluster prototypes, either in a pretrained feature space, or along with feature +optimization, both without labels. Our unsupervised selective labeling +consistently improves SSL methods over state-of-the-art active learning given +labeled data, by 8 to 25 times in label efficiency. For example, it boosts +FixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%) +labeled data, demonstrating that small computation spent on selecting what data +to label brings significant gain especially under a low annotation budget. Our +work sets a new standard for practical and efficient SSL. + +
+
+ comment: Accepted by ECCV 2022; Fixed a few typos +
+
+
+
+
+ + ♻ ☆ Methods and datasets for segmentation of minimally invasive surgical + instruments in endoscopic images and videos: A review of the state of the art + + +
+ In the field of computer- and robot-assisted minimally invasive surgery, +enormous progress has been made in recent years based on the recognition of +surgical instruments in endoscopic images and videos. In particular, the +determination of the position and type of instruments is of great interest. +Current work involves both spatial and temporal information, with the idea that +predicting the movement of surgical tools over time may improve the quality of +the final segmentations. The provision of publicly available datasets has +recently encouraged the development of new methods, mainly based on deep +learning. In this review, we identify and characterize datasets used for method +development and evaluation and quantify their frequency of use in the +literature. We further present an overview of the current state of research +regarding the segmentation and tracking of minimally invasive surgical +instruments in endoscopic images and videos. The paper focuses on methods that +work purely visually, without markers of any kind attached to the instruments, +considering both single-frame semantic and instance segmentation approaches, as +well as those that incorporate temporal information. The publications analyzed +were identified through the platforms Google Scholar, Web of Science, and +PubMed. The search terms used were "instrument segmentation", "instrument +tracking", "surgical tool segmentation", and "surgical tool tracking", +resulting in a total of 741 articles published between 01/2015 and 07/2023, of +which 123 were included using systematic selection criteria. A discussion of +the reviewed literature is provided, highlighting existing shortcomings and +emphasizing the available potential for future developments. + +
+
+ comment: 29 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Learning from Semantic Alignment between Unpaired Multiviews for + Egocentric Video Recognition ICCV + + +
+ We are concerned with a challenging scenario in unpaired multiview video +learning. In this case, the model aims to learn comprehensive multiview +representations while the cross-view semantic information exhibits variations. +We propose Semantics-based Unpaired Multiview Learning (SUM-L) to tackle this +unpaired multiview learning problem. The key idea is to build cross-view +pseudo-pairs and do view-invariant alignment by leveraging the semantic +information of videos. To facilitate the data efficiency of multiview learning, +we further perform video-text alignment for first-person and third-person +videos, to fully leverage the semantic knowledge to improve video +representations. Extensive experiments on multiple benchmark datasets verify +the effectiveness of our framework. Our method also outperforms multiple +existing view-alignment methods, under the more challenging scenario than +typical paired or unpaired multimodal or multiview learning. Our code is +available at https://github.com/wqtwjt1996/SUM-L. + +
+
+ comment: Proceedings of IEEE International Conference on Computer Vision + (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Advancing Volumetric Medical Image Segmentation via Global-Local Masked + Autoencoder + + +
+ Masked autoencoder (MAE) is a promising self-supervised pre-training +technique that can improve the representation learning of a neural network +without human intervention. However, applying MAE directly to volumetric +medical images poses two challenges: (i) a lack of global information that is +crucial for understanding the clinical context of the holistic data, (ii) no +guarantee of stabilizing the representations learned from randomly masked +inputs. To address these limitations, we propose the +\textbf{G}lobal-\textbf{L}ocal \textbf{M}asked \textbf{A}uto\textbf{E}ncoder +(GL-MAE), a simple yet effective self-supervised pre-training strategy. In +addition to reconstructing masked local views, as in previous methods, GL-MAE +incorporates global context learning by reconstructing masked global views. +Furthermore, a complete global view is integrated as an anchor to guide the +reconstruction and stabilize the learning process through global-to-global +consistency learning and global-to-local consistency learning. Finetuning +results on multiple datasets demonstrate the superiority of our method over +other state-of-the-art self-supervised algorithms, highlighting its +effectiveness on versatile volumetric medical image segmentation tasks, even +when annotations are scarce. Our codes and models will be released upon +acceptance. + +
+
+
+
+
+ + ♻ ☆ Label-Efficient Online Continual Object Detection in Streaming Video ICCV 2023 + + +
+ Humans can watch a continuous video stream and effortlessly perform continual
+acquisition and transfer of new knowledge with minimal supervision yet
+retaining previously learnt experiences. In contrast, existing continual
+learning (CL) methods require fully annotated labels to effectively learn from
+individual frames in a video stream. Here, we examine a more realistic and
+challenging problem: Label-Efficient Online Continual Object Detection
+(LEOCOD) in streaming video. We propose a plug-and-play module, Efficient-CLS,
+that can be easily inserted into and improve existing continual learners for
+object detection in video streams with reduced data annotation costs and model
+retraining time. We show that our method has achieved significant improvement
+with minimal forgetting across all supervision levels on two challenging CL
+benchmarks for streaming real-world videos. Remarkably, with only 25% annotated
+video frames, our method still outperforms the base CL learners, which are
+trained with 100% annotations on all video frames. The data and source code
+will be publicly available at https://github.com/showlab/Efficient-CLS.
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Non-Exemplar Online Class-incremental Continual Learning via + Dual-prototype Self-augment and Refinement + + +
+ This paper investigates a new, practical, but challenging problem named +Non-exemplar Online Class-incremental continual Learning (NO-CL), which aims to +preserve the discernibility of base classes without buffering data examples and +efficiently learn novel classes continuously in a single-pass (i.e., online) +data stream. The challenges of this task are mainly two-fold: (1) Both base and +novel classes suffer from severe catastrophic forgetting as no previous samples +are available for replay. (2) As the online data can only be observed once, +there is no way to fully re-train the whole model, e.g., re-calibrate the +decision boundaries via prototype alignment or feature distillation. In this +paper, we propose a novel Dual-prototype Self-augment and Refinement method +(DSR) for NO-CL problem, which consists of two strategies: 1) Dual class +prototypes: vanilla and high-dimensional prototypes are exploited to utilize +the pre-trained information and obtain robust quasi-orthogonal representations +rather than example buffers for both privacy preservation and memory reduction. +2) Self-augment and refinement: Instead of updating the whole network, we +optimize high-dimensional prototypes alternatively with the extra projection +module based on self-augment vanilla prototypes, through a bi-level +optimization problem. Extensive experiments demonstrate the effectiveness and +superiority of the proposed DSR in NO-CL. + +
+
+
+
+
+ + ♻ ☆ Radar-Camera Fusion for Object Detection and Semantic Segmentation in + Autonomous Driving: A Comprehensive Review + + +
+ Driven by deep learning techniques, perception technology in autonomous +driving has developed rapidly in recent years, enabling vehicles to accurately +detect and interpret the surrounding environment for safe and efficient navigation. +To achieve accurate and robust perception capabilities, autonomous vehicles are +often equipped with multiple sensors, making sensor fusion a crucial part of +the perception system. Among these fused sensors, radars and cameras enable a +complementary and cost-effective perception of the surrounding environment +regardless of lighting and weather conditions. This review aims to provide a +comprehensive guideline for radar-camera fusion, particularly concentrating on +perception tasks related to object detection and semantic segmentation. Based on +the principles of the radar and camera sensors, we delve into the data +processing pipeline and representations, followed by an in-depth analysis and +summary of radar-camera fusion datasets. In the review of methodologies in +radar-camera fusion, we address interrogative questions, including "why to +fuse", "what to fuse", "where to fuse", "when to fuse", and "how to fuse", +subsequently discussing various challenges and potential research directions +within this domain. To ease the retrieval and comparison of datasets and fusion +methods, we also provide an interactive website: +https://radar-camera-fusion.github.io. + +
+
+ comment: Accepted by IEEE Transactions on Intelligent Vehicles (T-IV) +
+
+
+
+
+ + ♻ ☆ Black-box Source-free Domain Adaptation via Two-stage Knowledge + Distillation IJCAI 1 + + +
+ Source-free domain adaptation aims to adapt deep neural networks using only +pre-trained source models and target data. However, accessing the source model +still raises concerns about leaking the source data, which may reveal +patient privacy. In this paper, we study the challenging but practical +problem of black-box source-free domain adaptation, where only the outputs of the +source model and target data are available. We propose a simple but effective +two-stage knowledge distillation method. In Stage I, we train the target model from scratch +with soft pseudo-labels generated by the source model in a knowledge +distillation manner. In Stage II, we +initialize another model as the new student model to avoid the error +accumulation caused by noisy pseudo-labels. We feed images with weak +augmentation to the teacher model to guide the learning of the student model. +Our method is simple and flexible, and achieves surprising results on three +cross-domain segmentation tasks. + +
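[Editor's note] A hedged sketch of a two-stage black-box distillation loop of the kind the abstract outlines. It assumes `source_api(x)` returns only softmax outputs of the black-box source model; the model classes, augmentation, epochs, and temperature are placeholders, not the paper's exact settings.

import torch
import torch.nn.functional as F

def soft_kd_loss(student_logits, teacher_probs, T=1.0):
    # KL divergence between student predictions and soft teacher probabilities.
    log_p = F.log_softmax(student_logits / T, dim=1)
    return F.kl_div(log_p, teacher_probs, reduction="batchmean") * (T * T)

def stage_one(student, source_api, loader, opt, epochs=10):
    # Stage I: train the target model from scratch on soft pseudo-labels from the source outputs.
    for _ in range(epochs):
        for x in loader:
            with torch.no_grad():
                pseudo = source_api(x)                  # black-box: probabilities only
            loss = soft_kd_loss(student(x), pseudo)
            opt.zero_grad(); loss.backward(); opt.step()
    return student

def stage_two(new_student, teacher, loader, opt, weak_aug, epochs=10):
    # Stage II: a re-initialized student distills from the stage-one model on weakly
    # augmented images, limiting error accumulation from noisy pseudo-labels.
    teacher.eval()
    for _ in range(epochs):
        for x in loader:
            with torch.no_grad():
                t_probs = F.softmax(teacher(weak_aug(x)), dim=1)
            loss = soft_kd_loss(new_student(x), t_probs)
            opt.zero_grad(); loss.backward(); opt.step()
    return new_student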
+
+ comment: The short version was accepted at the IJCAI 1st International Workshop on + Generalizing from Limited Resources in the Open World. (This is the long + version.) +
+
+
+
+
+ + ♻ ☆ Learning Interpretable Dynamics from Images of a Freely Rotating 3D + Rigid Body + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies, such as satellites, may be available when low-dimensional measurements +are not. However, the high-dimensionality of image data precludes the use of +classical estimation techniques to learn the dynamics and a lack of +interpretability reduces the usefulness of standard deep learning methods. In +this work, we present a physics-informed neural network model to estimate and +predict 3D rotational dynamics from image sequences. We achieve this using a +multi-stage prediction pipeline that maps individual images to a latent +representation homeomorphic to $\mathbf{SO}(3)$, computes angular velocities +from latent pairs, and predicts future latent states using the Hamiltonian +equations of motion with a learned representation of the Hamiltonian. We +demonstrate the efficacy of our approach on a new rotating rigid-body dataset +with sequences of rotating cubes and rectangular prisms with uniform and +non-uniform density. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ On the link between generative semi-supervised learning and generative + open-set recognition + + +
+ This study investigates the relationship between semi-supervised learning +(SSL, which trains on partially labelled datasets) and open-set +recognition (OSR, which is classification with simultaneous novelty detection) +in the context of generative adversarial networks (GANs). Although no +previous study has formally linked SSL and OSR, their respective methods share +striking similarities. Specifically, SSL-GANs and OSR-GANs require their +generators to produce 'bad-looking' samples which are used to regularise their +classifier networks. We hypothesise that the definitions of bad-looking samples +in SSL and OSR represent the same concept and realise the same goal. More +formally, bad-looking samples lie in the complementary space, which is the area +between and around the boundaries of the labelled categories within the +classifier's embedding space. By regularising a classifier with samples in the +complementary space, classifiers achieve improved generalisation for SSL and +also generalise the open space for OSR. To test this hypothesis, we compare a +foundational SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR +experimental conditions. Our results show that SSL-GANs achieve near-identical +results to OSR-GANs, supporting the SSL-OSR link. Subsequently, to further this +new research path, we compare several SSL-GANs across various SSL-OSR setups, +establishing the first benchmark results for this combined setting. A combined framework of SSL-OSR would +improve the practicality and cost-efficiency of classifier training, and so +further theoretical and application studies are also discussed. + +
+
+
+
+
+ + ♻ ☆ Backdooring Textual Inversion for Concept Censorship + + +
+ Recent years have witnessed success in AIGC (AI Generated Content). People +can make use of a pre-trained diffusion model to generate images of high +quality or freely modify existing pictures with only prompts in natural +language. More excitingly, the emerging personalization techniques make it +feasible to create specific desired images with only a few images as +references. However, this induces severe threats if such advanced techniques +are misused by malicious users, such as spreading fake news or defaming +individual reputations. Thus, it is necessary to regulate personalization +models (i.e., concept censorship) for their development and advancement. + In this paper, we focus on the personalization technique dubbed Textual +Inversion (TI), which is becoming prevalent for its lightweight nature and +excellent performance. TI crafts the word embedding that contains detailed +information about a specific object. Users can easily download the word +embedding from public websites like Civitai and add it to their own stable +diffusion model without fine-tuning for personalization. To achieve the concept +censorship of a TI model, we propose leveraging the backdoor technique for good +by injecting backdoors into the Textual Inversion embeddings. Briefly, we +select some sensitive words as triggers during the training of TI, which will +be censored for normal use. In the subsequent generation stage, if the triggers +are combined with personalized embeddings as final prompts, the model will +output a pre-defined target image rather than images including the desired +malicious concept. + To demonstrate the effectiveness of our approach, we conduct extensive +experiments on Stable Diffusion, a prevalent open-source text-to-image model. +Our code, data, and results are available at +https://concept-censorship.github.io. + +
+
+
+
+
+ + ♻ ☆ Knowledge-Aware Federated Active Learning with Non-IID Data ICCV23 + + +
+ Federated learning enables multiple decentralized clients to learn +collaboratively without sharing the local training data. However, the expensive +annotation cost to acquire data labels on local clients remains an obstacle in +utilizing local data. In this paper, we propose a federated active learning +paradigm to efficiently learn a global model with a limited annotation budget +while protecting data privacy in a decentralized manner. The main +challenge faced by federated active learning is the mismatch between the active +sampling goal of the global model on the server and that of the asynchronous +local clients. This becomes even more significant when data is distributed +non-IID across local clients. To address the aforementioned challenge, we +propose Knowledge-Aware Federated Active Learning (KAFAL), which consists of +Knowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory +Federated Update (KCFU). KSAS is a novel active sampling method tailored for +the federated active learning problem. It deals with the mismatch challenge by +sampling actively based on the discrepancies between local and global models. +KSAS intensifies specialized knowledge in local clients, ensuring that the sampled +data are informative for both the local clients and the global model. KCFU, +in the meantime, deals with the client heterogeneity caused by limited data and +non-IID data distributions. It compensates for each client's ability on weak +classes with the assistance of the global model. Extensive experiments and +analyses are conducted to show the superiority of KSAS over the +state-of-the-art active learning methods and the efficiency of KCFU under the +federated active learning framework. + +
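[Editor's note] A minimal sketch of discrepancy-based active sampling in the spirit of KSAS, under the assumption that the discrepancy is measured as a symmetric KL divergence between local and global model predictions; KAFAL's actual scoring and the KCFU compensation step are more involved. The loader interface and budget are illustrative.

import torch
import torch.nn.functional as F

@torch.no_grad()
def select_for_annotation(local_model, global_model, unlabeled_loader, budget):
    # unlabeled_loader is assumed to yield (sample_index, batch) pairs.
    scores, indices = [], []
    for idx, x in unlabeled_loader:
        p_local = F.softmax(local_model(x), dim=1)
        p_global = F.softmax(global_model(x), dim=1)
        # Symmetric KL between local and global predictions as a disagreement score.
        kl_lg = (p_local * (p_local.clamp_min(1e-8).log() - p_global.clamp_min(1e-8).log())).sum(dim=1)
        kl_gl = (p_global * (p_global.clamp_min(1e-8).log() - p_local.clamp_min(1e-8).log())).sum(dim=1)
        scores.append(kl_lg + kl_gl)
        indices.append(idx)
    scores = torch.cat(scores)
    indices = torch.cat(indices)
    top = scores.topk(budget).indices
    return indices[top]          # sample ids to send for labeling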
+
+ comment: 14 pages, 12 figures, ICCV23 +
+
+
+
+
+ + ♻ ☆ Zero-Shot In-Distribution Detection in Multi-Object Settings Using + Vision-Language Foundation Models + + +
+ Extracting in-distribution (ID) images from noisy images scraped from the +Internet is an important preprocessing step for constructing datasets, which has +traditionally been done manually. Automating this preprocessing with deep +learning techniques presents two key challenges. First, images should be +collected using only the name of the ID class, without training on the ID data. +Second, as the motivation behind datasets such as COCO illustrates, it is crucial to identify images +containing not only ID objects alone but also both ID and out-of-distribution (OOD) +objects as ID images, in order to create robust recognizers. In this paper, we propose a +novel problem setting called zero-shot in-distribution (ID) detection, where we +identify images containing ID objects as ID images (even if they contain OOD +objects), and images lacking ID objects as OOD images, without any training. To +solve this problem, we leverage the powerful zero-shot capability of CLIP and +present a simple and effective approach, Global-Local Maximum Concept Matching +(GL-MCM), based on both global and local visual-text alignments of CLIP +features. Extensive experiments demonstrate that GL-MCM outperforms comparison +methods on both multi-object datasets and single-object ImageNet benchmarks. +The code will be available via https://github.com/AtsuMiyai/GL-MCM. + +
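[Editor's note] A hedged sketch of a global-local maximum concept matching score of the kind GL-MCM describes. It assumes precomputed CLIP features: `img_global` (D,), `img_local` (H*W, D) patch features, and `text_feats` (num_ID_classes, D); the temperature, the sum of the two scores, and how local features are obtained are illustrative assumptions, not the exact GL-MCM procedure.

import torch
import torch.nn.functional as F

def gl_mcm_score(img_global, img_local, text_feats, temperature=0.01):
    img_global = F.normalize(img_global, dim=-1)
    img_local = F.normalize(img_local, dim=-1)
    text_feats = F.normalize(text_feats, dim=-1)
    # Global matching: max softmax over ID class prompts for the whole image.
    global_sim = img_global @ text_feats.t() / temperature           # (num_classes,)
    global_score = F.softmax(global_sim, dim=-1).max()
    # Local matching: best-matching local region over all ID classes.
    local_sim = img_local @ text_feats.t() / temperature             # (H*W, num_classes)
    local_score = F.softmax(local_sim, dim=-1).max()
    return global_score + local_score                                # larger => more likely ID

# An image would be flagged as in-distribution when gl_mcm_score(...) exceeds a chosen threshold.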
+
+ comment: v3: I fixed some typos from v2 +
+
+
+
+
+ + ♻ ☆ Multimodal Garment Designer: Human-Centric Latent Diffusion Models for + Fashion Image Editing ICCV 2023 + + +
+ Fashion illustration is used by designers to communicate their vision and to +bring the design idea from conceptualization to realization, showing how +clothes interact with the human body. In this context, computer vision can thus +be used to improve the fashion design process. Differently from previous works +that mainly focused on the virtual try-on of garments, we propose the task of +multimodal-conditioned fashion image editing, guiding the generation of +human-centric fashion images by following multimodal prompts, such as text, +human body poses, and garment sketches. We tackle this problem by proposing a +new architecture based on latent diffusion models, an approach that has not +been used before in the fashion domain. Given the lack of existing datasets +suitable for the task, we also extend two existing fashion datasets, namely +Dress Code and VITON-HD, with multimodal annotations collected in a +semi-automatic manner. Experimental results on these new datasets demonstrate +the effectiveness of our proposal, both in terms of realism and coherence with +the given multimodal inputs. Source code and collected multimodal annotations +are publicly available at: +https://github.com/aimagelab/multimodal-garment-designer. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Learnable Differencing Center for Nighttime Depth Perception + + +
+ Depth completion is the task of recovering dense depth maps from sparse ones, +usually with the help of color images. Existing image-guided methods perform +well on daytime depth perception self-driving benchmarks, but struggle in +nighttime scenarios with poor visibility and complex illumination. To address +these challenges, we propose a simple yet effective framework called LDCNet. +Our key idea is to use Recurrent Inter-Convolution Differencing (RICD) and +Illumination-Affinitive Intra-Convolution Differencing (IAICD) to enhance the +nighttime color images and reduce the negative effects of the varying +illumination, respectively. RICD explicitly estimates global illumination by +differencing two convolutions with different kernels, treating the +small-kernel-convolution feature as the center of the large-kernel-convolution +feature from a new perspective. IAICD softly alleviates local relative light +intensity by differencing a single convolution, where the center is dynamically +aggregated based on neighboring pixels and the estimated illumination map in +RICD. On both nighttime depth completion and depth estimation tasks, extensive +experiments demonstrate the effectiveness of our LDCNet, achieving state-of-the-art +performance. + +
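[Editor's note] A rough sketch of the "convolution differencing" idea described for RICD, subtracting a small-kernel response (local center) from a large-kernel response to expose slowly varying, illumination-like context. Kernel sizes, channel counts, and how the difference is used downstream are illustrative assumptions only.

import torch
import torch.nn as nn

class ConvDifferencing(nn.Module):
    """Difference of a large-kernel and a small-kernel convolution over the same input.

    The small-kernel response acts as a local 'center'; subtracting it from the
    large-kernel response emphasizes slowly varying (illumination-like) context.
    """
    def __init__(self, channels, small_k=3, large_k=7):
        super().__init__()
        self.small = nn.Conv2d(channels, channels, small_k, padding=small_k // 2)
        self.large = nn.Conv2d(channels, channels, large_k, padding=large_k // 2)

    def forward(self, x):
        return self.large(x) - self.small(x)

# Example: process a nighttime feature map of shape (B, C, H, W).
if __name__ == "__main__":
    feat = torch.randn(2, 16, 64, 64)
    illum = ConvDifferencing(16)(feat)   # illumination-style global context estimate
    print(illum.shape)                   # torch.Size([2, 16, 64, 64])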
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ HiFace: High-Fidelity 3D Face Reconstruction by Learning Static and + Dynamic Details ICCV 2023 + + +
+ 3D Morphable Models (3DMMs) demonstrate great potential for reconstructing +faithful and animatable 3D facial surfaces from a single image. The facial +surface is influenced by the coarse shape, as well as the static detail (e.g., +person-specific appearance) and dynamic detail (e.g., expression-driven +wrinkles). Previous work struggles to decouple the static and dynamic details +through image-level supervision, leading to reconstructions that are not +realistic. In this paper, we aim at high-fidelity 3D face reconstruction and +propose HiFace to explicitly model the static and dynamic details. +Specifically, the static detail is modeled as the linear combination of a +displacement basis, while the dynamic detail is modeled as the linear +interpolation of two displacement maps with polarized expressions. We exploit +several loss functions to jointly learn the coarse shape and fine details with +both synthetic and real-world datasets, which enable HiFace to reconstruct +high-fidelity 3D shapes with animatable details. Extensive quantitative and +qualitative experiments demonstrate that HiFace presents state-of-the-art +reconstruction quality and faithfully recovers both the static and dynamic +details. Our project page can be found at https://project-hiface.github.io. + +
+
+ comment: Accepted to ICCV 2023, camera-ready version; Project page: + https://project-hiface.github.io/ +
+
+
+
+
+ + ♻ ☆ Deep Image Fingerprint: Towards Low Budget Synthetic Image Detection and + Model Lineage Analysis + + +
+ The generation of high-quality images has become widely accessible and is a +rapidly evolving process. As a result, anyone can generate images that are +indistinguishable from real ones. This leads to a wide range of applications, +including malicious usage with deceptive intentions. Despite advances in +detection techniques for generated images, a robust detection method still +eludes us. Furthermore, model personalization techniques might affect the +detection capabilities of existing methods. In this work, we utilize the +architectural properties of convolutional neural networks (CNNs) to develop a +new detection method. Our method can detect images from a known generative +model and enable us to establish relationships between fine-tuned generative +models. We tested the method on images produced by both Generative Adversarial +Networks (GANs) and recent large text-to-image models (LTIMs) that rely on +Diffusion Models. Our approach outperforms others trained under identical +conditions and achieves comparable performance to state-of-the-art pre-trained +detection methods on images generated by Stable Diffusion and MidJourney, with +significantly fewer required train samples. + +
+
+
+
+
+ + ♻ ☆ GridMM: Grid Memory Map for Vision-and-Language Navigation ICCV 2023 + + +
+ Vision-and-language navigation (VLN) enables the agent to navigate to a +remote location following natural language instructions in 3D environments. +To represent the previously visited environment, most approaches for VLN +implement memory using recurrent states, topological maps, or top-down semantic +maps. In contrast to these approaches, we build the top-down egocentric and +dynamically growing Grid Memory Map (i.e., GridMM) to structure the visited +environment. From a global perspective, historical observations are projected +into a unified grid map in a top-down view, which can better represent the +spatial relations of the environment. From a local perspective, we further +propose an instruction relevance aggregation method to capture fine-grained +visual clues in each grid region. Extensive experiments are conducted on +the REVERIE, R2R, and SOON datasets in discrete environments, as well as the R2R-CE +dataset in continuous environments, showing the superiority of our proposed +method. + +
+
+ comment: Accepted by ICCV 2023. The code is available at + https://github.com/MrZihan/GridMM +
+
+
+
+
+ + ♻ ☆ UTRNet: High-Resolution Urdu Text Recognition In Printed Documents ICDAR 2023 + + +
+ In this paper, we propose a novel approach to address the challenges of +printed Urdu text recognition using high-resolution, multi-scale semantic +feature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model, +demonstrates state-of-the-art performance on benchmark datasets. To address the +limitations of previous works, which struggle to generalize to the intricacies +of the Urdu script and suffer from a lack of sufficient annotated real-world data, we +introduce UTRSet-Real, a large-scale annotated real-world dataset +comprising over 11,000 lines, and UTRSet-Synth, a synthetic dataset with 20,000 +lines that closely resembles real-world data. We have also corrected the ground truth of +the existing IIITH dataset, making it a more reliable resource for future +research. We also provide UrduDoc, a benchmark dataset for Urdu text line +detection in scanned documents. Additionally, we have developed an online tool +for end-to-end Urdu OCR from printed documents by integrating UTRNet with a +text detection model. Our work not only addresses the current limitations of +Urdu OCR but also paves the way for future research in this area and +facilitates the continued advancement of Urdu OCR technology. The project page +with source code, datasets, annotations, trained models, and online tool is +available at abdur75648.github.io/UTRNet. + +
+
+ comment: Accepted at The 17th International Conference on Document Analysis + and Recognition (ICDAR 2023) +
+
+
+
+
+ + ♻ ☆ EDO-Net: Learning Elastic Properties of Deformable Objects from Graph + Dynamics + + +
+ We study the problem of learning graph dynamics of deformable objects that +generalizes to unknown physical properties. Our key insight is to leverage a +latent representation of elastic physical properties of cloth-like deformable +objects that can be extracted, for example, from a pulling interaction. In this +paper we propose EDO-Net (Elastic Deformable Object - Net), a model of graph +dynamics trained on a large variety of samples with different elastic +properties that does not rely on ground-truth labels of the properties. EDO-Net +jointly learns an adaptation module, and a forward-dynamics module. The former +is responsible for extracting a latent representation of the physical +properties of the object, while the latter leverages the latent representation +to predict future states of cloth-like objects represented as graphs. We +evaluate EDO-Net both in simulation and real world, assessing its capabilities +of: 1) generalizing to unknown physical properties, 2) transferring the learned +representation to new downstream tasks. + +
+
+
+
+
+ + ♻ ☆ Learning to Generalize towards Unseen Domains via a Content-Aware Style + Invariant Model for Disease Detection from Chest X-rays + + +
+ Performance degradation due to source domain mismatch is a longstanding +challenge in deep learning-based medical image analysis, particularly for chest +X-rays (CXRs). Several methods (e.g., adversarial training, multi-domain +mixups) have been proposed to extract domain-invariant high-level features to +address this domain shift. However, these methods do not explicitly regularize +the content and style characteristics of the extracted domain-invariant +features. Recent studies have demonstrated that CNN models exhibit a strong +bias toward styles (e.g., uninformative textures) rather than content (e.g., +shape), in stark contrast to the human-vision system. Radiologists tend to +learn visual cues from CXRs and thus perform well across multiple domains. +Therefore, in medical imaging for pathology diagnosis from CXR images, models +should extract domain-invariant features that are style-invariant and +content-biased. Motivated by this, we employ novel style randomization +modules (SRMs) at both image and feature levels that work together +hierarchically to create rich style-perturbed features on the fly while keeping +the content intact. In addition, we leverage consistency regularization on both +global semantic features and predicted probability distributions between +style-perturbed and unperturbed versions of the same CXR +image, tuning the model's sensitivity toward content markers for accurate +predictions. Extensive experiments with three large-scale thoracic disease +datasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed +framework is more robust in the presence of domain shift and achieves +state-of-the-art performance. + +
+
+
+
+
+ + ♻ ☆ Neural Spherical Harmonics for structurally coherent continuous + representation of diffusion MRI signal MICCAI 2023 + + +
+ We present a novel way to model diffusion magnetic resonance imaging (dMRI) +datasets, that benefits from the structural coherence of the human brain while +only using data from a single subject. Current methods model the dMRI signal in +individual voxels, disregarding the intervoxel coherence that is present. We +use a neural network to parameterize a spherical harmonics series (NeSH) to +represent the dMRI signal of a single subject from the Human Connectome Project +dataset, continuous in both the angular and spatial domain. The reconstructed +dMRI signal using this method shows a more structurally coherent representation +of the data. Noise in gradient images is removed and the fiber orientation +distribution functions show a smooth change in direction along a fiber tract. +We showcase how the reconstruction can be used to calculate mean diffusivity, +fractional anisotropy, and total apparent fiber density. These results can be +achieved with a single model architecture, tuning only one hyperparameter. In +this paper we also demonstrate how upsampling in both the angular and spatial +domain yields reconstructions that are on par or better than existing methods. + +
+
+ comment: 12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023 + Updated to fix typo in author name (Villanova -> Vilanova) +
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information +from multiple views of a shared context. Recent works intuitively apply +contrastive learning to different views in a pairwise manner, which still has +several limitations: view-specific noise is not filtered in learning view-shared +representations; fake negative pairs, where the negative terms are actually +within the same class as the positive, are treated the same as real negative pairs; +and evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works study the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the perspective of information theory and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided hierarchical Progressive +Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the +distribution between views to reduce view-specific noise. In the set-tier, IPMC +constructs self-adjusted contrasting pools, which are adaptively modified by a +view filter. Lastly, in the instance-tier, we adopt a designed unified loss to +learn representations and reduce the gradient interference. Theoretically and +empirically, we demonstrate the superiority of IPMC over state-of-the-art +methods. + +
+
+ comment: This paper was accepted by the journal Neural Networks (Elsevier) + in 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344 +
+
+
+
+
+ + ♻ ☆ BallGAN: 3D-aware Image Synthesis with a Spherical Background ICCV 2023 + + +
+ 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be +rendered from arbitrary perspectives to produce images. Although previous methods +produce realistic images, they suffer from unstable training or degenerate +solutions where the 3D geometry is unnatural. We hypothesize that the 3D +geometry is underdetermined due to an insufficient constraint, i.e., being +classified as a real image by the discriminator is not enough. To solve this +problem, we propose to approximate the background as a spherical surface and +represent a scene as a union of the foreground placed in the sphere and the +thin spherical background. It reduces the degree of freedom in the background +field. Accordingly, we modify the volume rendering equation and incorporate +dedicated constraints to design a novel 3D-aware GAN framework named BallGAN. +BallGAN has multiple advantages as follows. 1) It produces more reasonable 3D +geometry; the images of a scene across different viewpoints have better +photometric consistency and fidelity than the state-of-the-art methods. 2) The +training becomes much more stable. 3) The foreground can be separately rendered +on top of different arbitrary backgrounds. + +
+
+ comment: ICCV 2023, Project Page: https://minjung-s.github.io/ballgan +
+
+
+
+
+ + ♻ ☆ Foundation Model-oriented Robustness: Robust Image Model Evaluation with + Pretrained Models + + +
+ Machine learning has demonstrated remarkable performance over finite +datasets, yet whether the scores over the fixed benchmarks can sufficiently +indicate the model's performance in the real world is still in discussion. In +reality, an ideal robust model will probably behave similarly to the oracle +(e.g., the human users), thus a good evaluation protocol is probably to +evaluate the models' behaviors in comparison to the oracle. In this paper, we +introduce a new robustness measurement that directly measures the image +classification model's performance compared with a surrogate oracle (i.e., a +foundation model). Besides, we design a simple method that can accomplish the +evaluation beyond the scope of the benchmarks. Our method extends the image +datasets with new samples that are sufficiently perturbed to be distinct from +the ones in the original sets, but are still bounded within the same +image-label structure the original test image represents, constrained by a +foundation model pretrained with a large amount of samples. As a result, our +new method will offer us a new way to evaluate the models' robustness +performance, free of limitations of fixed benchmarks or constrained +perturbations, although scoped by the power of the oracle. In addition to the +evaluation results, we also leverage our generated data to understand the +behaviors of the model and our new evaluation strategies. + +
+
+
+
+
+ + ♻ ☆ AutoPoster: A Highly Automatic and Content-aware Design System for + Advertising Poster Generation ACM MM 2023 + + +
+ Advertising posters, a form of information presentation, combine visual and +linguistic modalities. Creating a poster involves multiple steps and +necessitates design experience and creativity. This paper introduces +AutoPoster, a highly automatic and content-aware system for generating +advertising posters. With only product images and titles as inputs, AutoPoster +can automatically produce posters of varying sizes through four key stages: +image cleaning and retargeting, layout generation, tagline generation, and +style attribute prediction. To ensure visual harmony of posters, two +content-aware models are incorporated for layout and tagline generation. +Moreover, we propose a novel multi-task Style Attribute Predictor (SAP) to +jointly predict visual style attributes. Meanwhile, to our knowledge, we +propose the first poster generation dataset that includes visual attribute +annotations for over 76k posters. Qualitative and quantitative outcomes from +user studies and experiments substantiate the efficacy of our system and the +aesthetic superiority of the generated posters compared to other poster +generation methods. + +
+
+ comment: Accepted for ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Learning Multiscale Consistency for Self-supervised Electron Microscopy + Instance Segmentation + + +
+ Instance segmentation in electron microscopy (EM) volumes poses a significant +challenge due to the complex morphology of instances and insufficient +annotations. Self-supervised learning has recently emerged as a promising +solution, enabling the acquisition of prior knowledge of cellular tissue +structures that are essential for EM instance segmentation. However, existing +pretraining methods often lack the ability to capture complex visual patterns +and relationships between voxels, which results in the acquired prior knowledge +being insufficient for downstream EM analysis tasks. In this paper, we propose +a novel pretraining framework that leverages multiscale visual representations +to capture both voxel-level and feature-level consistency in EM volumes. +Specifically, our framework enforces voxel-level consistency between the +outputs of a Siamese network by a reconstruction function, and incorporates a +cross-attention mechanism for soft feature matching to achieve fine-grained +feature-level consistency. Moreover, we propose a contrastive learning scheme +on the feature pyramid to extract discriminative features across multiple +scales. We extensively pretrain our method on four large-scale EM datasets, +achieving promising performance improvements in representative tasks of neuron +and mitochondria instance segmentation. + +
+
+
+
+
+ + ♻ ☆ Iteratively Coupled Multiple Instance Learning from Instance to Bag + Classifier for Whole Slide Image Classification + + +
+ Whole Slide Image (WSI) classification remains a challenge due to their +extremely high resolution and the absence of fine-grained labels. Presently, +WSI classification is usually regarded as a Multiple Instance Learning (MIL) +problem when only slide-level labels are available. MIL methods involve a patch +embedding module and a bag-level classification module, but they are +prohibitively expensive to be trained in an end-to-end manner. Therefore, +existing methods usually train them separately, or directly skip the training +of the embedder. Such schemes hinder the patch embedder's access to slide-level +semantic labels, resulting in inconsistency within the entire MIL pipeline. To +overcome this issue, we propose a novel framework called Iteratively Coupled +MIL (ICMIL), which bridges the loss back-propagation process from the bag-level +classifier to the patch embedder. In ICMIL, we use category information in the +bag-level classifier to guide the patch-level fine-tuning of the patch feature +extractor. The refined embedder then generates better instance representations +for achieving a more accurate bag-level classifier. By coupling the patch +embedder and bag classifier at a low cost, our proposed framework enables +information exchange between the two modules, benefiting the entire MIL +classification model. We tested our framework on two datasets using three +different backbones, and our experimental results demonstrate consistent +performance improvements over state-of-the-art MIL methods. The code is +available at: https://github.com/Dootmaan/ICMIL. + +
+
+
+
+
+ + ♻ ☆ Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog + Navigation + + +
+ This report details the method of the winning entry of the AVDN Challenge in +ICCV 2023. The competition addresses the Aerial Navigation from Dialog History +(ANDH) task, which requires a drone agent to associate dialog history with +aerial observations to reach the destination. For better cross-modal grounding +abilities of the drone agent, we propose a Target-Grounded Graph-Aware +Transformer (TG-GAT) framework. Concretely, TG-GAT first leverages a +graph-aware transformer to capture spatiotemporal dependency, which benefits +navigation state tracking and robust action planning. In addition, an auxiliary +visual grounding task is devised to boost the agent's awareness of referred +landmarks. Moreover, a hybrid augmentation strategy based on large language +models is utilized to mitigate data scarcity limitations. Our TG-GAT framework +won the AVDN Challenge 2023, with 2.2% and 3.0% absolute improvements over the +baseline on SPL and SR metrics, respectively. The code is available at +https://github.com/yifeisu/avdn-challenge. + +
+
+
+
+
+ + ♻ ☆ BHSD: A 3D Multi-Class Brain Hemorrhage Segmentation Dataset + + +
+ Intracranial hemorrhage (ICH) is a pathological condition characterized by +bleeding inside the skull or brain, which can be attributed to various factors. +Identifying, localizing and quantifying ICH has important clinical +implications, in a bleed-dependent manner. While deep learning techniques are +widely used in medical image segmentation and have been applied to the ICH +segmentation task, existing public ICH datasets do not support the multi-class +segmentation problem. To address this, we develop the Brain Hemorrhage +Segmentation Dataset (BHSD), which provides a 3D multi-class ICH dataset +containing 192 volumes with pixel-level annotations and 2200 volumes with +slice-level annotations across five categories of ICH. To demonstrate the +utility of the dataset, we formulate a series of supervised and semi-supervised +ICH segmentation tasks. We provide experimental results with state-of-the-art +models as reference benchmarks for further model developments and evaluations +on this dataset. + +
+
+ comment: Accepted by MLMI 2023 +
+
+
+
+
+ + ♻ ☆ Positive Label Is All You Need for Multi-Label Classification + + +
+ Multi-label classification (MLC) suffers from the inevitable label noise in +training data due to the difficulty in annotating various semantic labels in +each image. To mitigate the influence of noisy labels, existing methods are mainly +devoted to identifying and correcting label mistakes via a trained MLC +model. However, these methods still involve noisy labels in training, +which can result in imprecise recognition of noisy labels and weaken the +performance. In this paper, considering that the negative labels are +substantially more than positive labels, and most noisy labels are from the +negative labels, we directly discard all the negative labels in the dataset, +and propose a new method dubbed positive and unlabeled multi-label +classification (PU-MLC). By extending positive-unlabeled learning to the MLC +task, our method trains the model with only positive labels and unlabeled data, and +introduces an adaptive re-balance factor and an adaptive temperature coefficient in +the loss function to alleviate the catastrophic imbalance in label distribution +and over-smoothing of probabilities in training. Furthermore, to capture both +local and global dependencies in the image, we also introduce a local-global +convolution module, which supplements global information into existing +convolution layers with no retraining of the backbone required. Our PU-MLC is +simple and effective, and it is applicable to both MLC and MLC with partial +labels (MLC-PL) tasks. Extensive experiments on MS-COCO and PASCAL VOC datasets +demonstrate that our PU-MLC achieves significant improvements in both MLC and +MLC-PL settings with even fewer annotations. Code will be released. + +
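[Editor's note] A minimal sketch of a positive-unlabeled multi-label objective in the spirit of PU-MLC. It assumes `labels` contains 1 for observed positives and 0 for everything else (treated as unlabeled, not negative); the fixed `rebalance` and `temperature` values stand in for the adaptive schedules described in the abstract.

import torch

def pu_mlc_loss(logits, labels, rebalance=0.1, temperature=1.0):
    probs = torch.sigmoid(logits / temperature)
    pos_mask = labels.float()
    unl_mask = 1.0 - pos_mask
    # Positive term: standard binary cross-entropy on the observed positive labels only.
    pos_loss = -(pos_mask * torch.log(probs.clamp_min(1e-8))).sum() / pos_mask.sum().clamp_min(1.0)
    # Unlabeled term: weak push toward "not present", down-weighted to reflect label uncertainty.
    unl_loss = -(unl_mask * torch.log((1.0 - probs).clamp_min(1e-8))).sum() / unl_mask.sum().clamp_min(1.0)
    return pos_loss + rebalance * unl_loss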
+
+
+
+
+ + ♻ ☆ AltDiffusion: A Multilingual Text-to-Image Diffusion Model + + +
+ Large Text-to-Image (T2I) diffusion models have shown a remarkable capability +to produce photorealistic and diverse images based on text inputs. However, +existing works only support limited language input, e.g., English, Chinese, and +Japanese, leaving users beyond these languages underserved and blocking the +global expansion of T2I models. Therefore, this paper presents AltDiffusion, a +novel multilingual T2I diffusion model that supports eighteen different +languages. Specifically, we first train a multilingual text encoder based on +knowledge distillation. Then we plug it into a pretrained English-only +diffusion model and train the model with a two-stage schema to enhance the +multilingual capability, including concept alignment and quality improvement +stages on a large-scale multilingual dataset. Furthermore, we introduce a new +benchmark, which includes the Multilingual-General-18 (MG-18) and +Multilingual-Cultural-18 (MC-18) datasets, to evaluate the capabilities of T2I +diffusion models for generating high-quality images and capturing +culture-specific concepts in different languages. Experimental results on both +MG-18 and MC-18 demonstrate that AltDiffusion outperforms current +state-of-the-art T2I models, e.g., Stable Diffusion, in multilingual +understanding, especially with respect to culture-specific concepts, while +still having comparable capability for generating high-quality images. All +source code and checkpoints can be found at +https://github.com/superhero-7/AltDiffuson. + +
+
+ comment: 15 pages; 17 figures +
+
+
+
+
+ + ♻ ☆ Chain-of-Thought Prompt Distillation for Multimodal Named Entity + Recognition and Multimodal Relation Extraction + + +
+ Multimodal Named Entity Recognition (MNER) and Multimodal Relation Extraction +(MRE) necessitate the fundamental reasoning capacity for intricate linguistic +and multimodal comprehension. In this study, we explore distilling the +reasoning ability of large language models (LLMs) into a more compact student +model by generating a \textit{chain of thought} (CoT) -- a sequence of +intermediate reasoning steps. Specifically, we commence by exemplifying the +elicitation of such reasoning ability from LLMs through CoT prompts covering +multi-grain (noun, sentence, multimodality) and data-augmentation (style, +entity, image) dimensions. Subsequently, we present a novel conditional prompt +distillation method to assimilate the commonsense reasoning ability from LLMs, +thereby enhancing the utility of the student model in addressing text-only +inputs without the requisite addition of image and CoT knowledge. Extensive +experiments reveal that our approach attains state-of-the-art accuracy and +manifests a plethora of advantages concerning interpretability, data +efficiency, and cross-domain generalization on MNER and MRE datasets. + +
+
+ comment: modification +
+
+
+
+
+ + ♻ ☆ MSECNet: Accurate and Robust Normal Estimation for 3D Point Clouds by + Multi-Scale Edge Conditioning ACM MM 2023 + + +
+ Estimating surface normals from 3D point clouds is critical for various +applications, including surface reconstruction and rendering. While existing +methods for normal estimation perform well in regions where normals change +slowly, they tend to fail where normals vary rapidly. To address this issue, we +propose a novel approach called MSECNet, which improves estimation in normal +varying regions by treating normal variation modeling as an edge detection +problem. MSECNet consists of a backbone network and a multi-scale edge +conditioning (MSEC) stream. The MSEC stream achieves robust edge detection +through multi-scale feature fusion and adaptive edge detection. The detected +edges are then combined with the output of the backbone network using the edge +conditioning module to produce edge-aware representations. Extensive +experiments show that MSECNet outperforms existing methods on both synthetic +(PCPNet) and real-world (SceneNN) datasets while running significantly faster. +We also conduct various analyses to investigate the contribution of each +component in the MSEC stream. Finally, we demonstrate the effectiveness of our +approach in surface reconstruction. + +
+
+ comment: Accepted for ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Spherical Space Feature Decomposition for Guided Depth Map + Super-Resolution ICCV 2023 + + +
+ Guided depth map super-resolution (GDSR), as a hot topic in multi-modal image +processing, aims to upsample low-resolution (LR) depth maps with additional +information involved in high-resolution (HR) RGB images from the same scene. +The critical step of this task is to effectively extract domain-shared and +domain-private RGB/depth features. In addition, three detailed issues, namely +blurry edges, noisy surfaces, and over-transferred RGB texture, need to be +addressed. In this paper, we propose the Spherical Space feature Decomposition +Network (SSDNet) to solve the above issues. To better model cross-modality +features, Restormer block-based RGB/depth encoders are employed for extracting +local-global features. Then, the extracted features are mapped to the spherical +space to complete the separation of private features and the alignment of +shared features. Shared features of RGB are fused with the depth features to +complete the GDSR task. Subsequently, a spherical contrast refinement (SCR) +module is proposed to further address the detail issues. Patches that are +classified according to imperfect categories are input into the SCR module, +where the patch features are pulled closer to the ground truth and pushed away +from the corresponding imperfect samples in the spherical feature space via +contrastive learning. Extensive experiments demonstrate that our method can +achieve state-of-the-art results on four test datasets, as well as successfully +generalize to real-world scenes. The code is available at +\url{https://github.com/Zhaozixiang1228/GDSR-SSDNet}. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SEAM: Searching Transferable Mixed-Precision Quantization Policy through + Large Margin Regularization + + +
+ Mixed-precision quantization (MPQ) suffers from the time-consuming process of +searching the optimal bit-width allocation (i.e., the policy) for each layer, +especially when using large-scale datasets such as ILSVRC-2012. This limits the +practicality of MPQ in real-world deployment scenarios. To address this issue, +this paper proposes a novel method for efficiently searching for effective MPQ +policies using a small proxy dataset instead of the large-scale dataset used +for training the model. Deviating from the established norm of employing a +consistent dataset for both model training and MPQ policy search stages, our +approach yields a substantial enhancement in the efficiency of MPQ +exploration. Nonetheless, using discrepant datasets poses challenges in +searching for a transferable MPQ policy. Driven by the observation that +quantization noise of a sub-optimal policy exerts a detrimental influence on the +discriminability of feature representations -- manifesting as diminished class +margins and ambiguous decision boundaries -- our method aims to identify +policies that uphold the discriminative nature of feature representations, +i.e., intra-class compactness and inter-class separation. This general and +dataset-independent property allows us to search for the MPQ policy over a rather +small-scale proxy dataset, and the policy can then be directly used to quantize +the model trained on a large-scale dataset. Our method offers several +advantages, including high proxy data utilization, no excessive hyper-parameter +tuning, and high searching efficiency. We search high-quality MPQ policies with +a proxy dataset that has only 4% of the data scale compared to the +large-scale target dataset, achieving the same accuracy as searching directly +on the latter and improving MPQ searching efficiency by up to 300 times. + +
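[Editor's note] A hedged sketch of scoring a candidate mixed-precision policy by the discriminability of the features it produces on a small proxy set. The compactness/separation measure below is a simple stand-in for the large-margin regularization the abstract refers to, and the feature extraction from a quantized model is assumed to happen elsewhere.

import torch

@torch.no_grad()
def discriminability_score(features, labels):
    # features: (N, D) embeddings from the quantized model on the proxy set; labels: (N,)
    classes = labels.unique()
    centers = torch.stack([features[labels == c].mean(dim=0) for c in classes])
    # Intra-class compactness: mean distance of samples to their class center (smaller is better).
    intra = torch.stack([
        (features[labels == c] - centers[i]).norm(dim=1).mean() for i, c in enumerate(classes)
    ]).mean()
    # Inter-class separation: mean pairwise distance between class centers (larger is better).
    inter = torch.pdist(centers).mean()
    return inter - intra     # higher score => larger class margins => preferred policy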
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge +generative models and have sparked a revolutionary impact on various aspects of +human life. However, the practical implementation of these models has also +exposed inherent risks, highlighting their dual nature and raising concerns +regarding their trustworthiness. Despite the abundance of literature on this +subject, a comprehensive survey specifically delving into the intersection of +large-scale generative models and their trustworthiness remains largely absent. +To bridge this gap, this paper investigates both the long-standing and emerging +threats associated with these models across four fundamental dimensions: +privacy, security, fairness, and responsibility. In this way, we construct an +extensive map outlining the trustworthiness of these models, while also +providing practical recommendations and identifying future directions. These +efforts are crucial for promoting the trustworthy deployment of these models, +ultimately benefiting society as a whole. + +
+
+ comment: Draft Version +
+
+
+
+
+ + ♻ ☆ CLIP2Point: Transfer CLIP to Point Cloud Classification with Image-Depth + Pre-training ICCV2023 + + +
+ Pre-training across 3D vision and language remains under development because +of limited training data. Recent works attempt to transfer vision-language +pre-training models to 3D vision. PointCLIP converts point cloud data to +multi-view depth maps, adopting CLIP for shape classification. However, its +performance is restricted by the domain gap between rendered depth maps and +images, as well as the diversity of depth distributions. To address this issue, +we propose CLIP2Point, an image-depth pre-training method by contrastive +learning to transfer CLIP to the 3D domain, and adapt it to point cloud +classification. We introduce a new depth rendering setting that forms a better +visual effect, and then render 52,460 pairs of images and depth maps from +ShapeNet for pre-training. The pre-training scheme of CLIP2Point combines +cross-modality learning to enforce the depth features for capturing expressive +visual and textual features and intra-modality learning to enhance the +invariance of depth aggregation. Additionally, we propose a novel Dual-Path +Adapter (DPA) module, i.e., a dual-path structure with simplified adapters for +few-shot learning. The dual-path structure allows the joint use of CLIP and +CLIP2Point, and the simplified adapter can well fit few-shot tasks without +post-search. Experimental results show that CLIP2Point is effective in +transferring CLIP knowledge to 3D vision. Our CLIP2Point outperforms PointCLIP +and other self-supervised 3D networks, achieving state-of-the-art results on +zero-shot and few-shot classification. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Diffusion-Based 3D Human Pose Estimation with Multi-Hypothesis + Aggregation ICCV 2023 + + +
+ In this paper, a novel Diffusion-based 3D Pose estimation (D3DP) method with +Joint-wise reProjection-based Multi-hypothesis Aggregation (JPMA) is proposed +for probabilistic 3D human pose estimation. On the one hand, D3DP generates +multiple possible 3D pose hypotheses for a single 2D observation. It gradually +diffuses the ground truth 3D poses to a random distribution, and learns a +denoiser conditioned on 2D keypoints to recover the uncontaminated 3D poses. +The proposed D3DP is compatible with existing 3D pose estimators and supports +users to balance efficiency and accuracy during inference through two +customizable parameters. On the other hand, JPMA is proposed to assemble +multiple hypotheses generated by D3DP into a single 3D pose for practical use. +It reprojects 3D pose hypotheses to the 2D camera plane, selects the best +hypothesis joint-by-joint based on the reprojection errors, and combines the +selected joints into the final pose. The proposed JPMA conducts aggregation at +the joint level and makes use of the 2D prior information, both of which have +been overlooked by previous approaches. Extensive experiments on Human3.6M and +MPI-INF-3DHP datasets show that our method outperforms the state-of-the-art +deterministic and probabilistic approaches by 1.5% and 8.9%, respectively. Code +is available at https://github.com/paTRICK-swk/D3DP. + +
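[Editor's note] A simplified sketch of joint-wise reprojection-based aggregation of multiple pose hypotheses, as described for JPMA. It assumes `project(pose3d)` maps (J, 3) camera-space joints to (J, 2) pixel coordinates with known intrinsics; generating the hypotheses with the diffusion model is outside this snippet.

import torch

def aggregate_hypotheses(hypotheses_3d, keypoints_2d, project):
    # hypotheses_3d: (H, J, 3) candidate 3D poses; keypoints_2d: (J, 2) detected 2D joints.
    reproj = torch.stack([project(h) for h in hypotheses_3d])        # (H, J, 2)
    err = (reproj - keypoints_2d.unsqueeze(0)).norm(dim=-1)          # (H, J) per-joint error
    best = err.argmin(dim=0)                                         # best hypothesis for each joint
    joints = torch.arange(hypotheses_3d.shape[1])
    return hypotheses_3d[best, joints]                               # (J, 3) assembled final pose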
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Novel Class Discovery for Long-tailed Recognition + + +
+ While the novel class discovery has recently made great progress, existing +methods typically focus on improving algorithms on class-balanced benchmarks. +However, in real-world recognition tasks, the class distributions of their +corresponding datasets are often imbalanced, which leads to serious performance +degeneration of those methods. In this paper, we consider a more realistic +setting for novel class discovery where the distributions of novel and known +classes are long-tailed. One main challenge of this new problem is to discover +imbalanced novel classes with the help of long-tailed known classes. To tackle +this problem, we propose an adaptive self-labeling strategy based on an +equiangular prototype representation of classes. Our method infers high-quality +pseudo-labels for the novel classes by solving a relaxed optimal transport +problem and effectively mitigates the class biases in learning the known and +novel classes. We perform extensive experiments on CIFAR100, ImageNet100, +Herbarium19 and large-scale iNaturalist18 datasets, and the results demonstrate +the superiority of our method. Our code is available at +https://github.com/kleinzcy/NCDLR. + +
+
+ comment: TMLR2023, Final version +
+
+
+
+
+ + ♻ ☆ On the Choice of Perception Loss Function for Learned Video Compression + + +
+ We study causal, low-latency, sequential video compression when the output is +subjected to both a mean squared-error (MSE) distortion loss as well as a +perception loss to target realism. Motivated by prior approaches, we consider +two different perception loss functions (PLFs). The first, PLF-JD, considers +the joint distribution (JD) of all the video frames up to the current one, +while the second metric, PLF-FMD, considers the framewise marginal +distributions (FMD) between the source and reconstruction. Using information +theoretic analysis and deep-learning based experiments, we demonstrate that the +choice of PLF can have a significant effect on the reconstruction, especially +at low-bit rates. In particular, while the reconstruction based on PLF-JD can +better preserve the temporal correlation across frames, it also imposes a +significant penalty in distortion compared to PLF-FMD and further makes it more +difficult to recover from errors made in the earlier output frames. Although +the choice of PLF decisively affects reconstruction quality, we also +demonstrate that it may not be essential to commit to a particular PLF during +encoding and the choice of PLF can be delegated to the decoder. In particular, +encoded representations generated by training a system to minimize the MSE +(without requiring either PLF) can be {\em near universal} and can generate +close to optimal reconstructions for either choice of PLF at the decoder. We +validate our results using (one-shot) information-theoretic analysis, detailed +study of the rate-distortion-perception tradeoff of the Gauss-Markov source +model as well as deep-learning based experiments on moving MNIST and KTH +datasets. + +
+
+
+
+
+ + ♻ ☆ SERE: Exploring Feature Self-relation for Self-supervised Transformer + + +
+ Learning representations with self-supervision for convolutional networks +(CNN) has been validated to be effective for vision tasks. As an alternative to +CNN, vision transformers (ViT) have strong representation ability with spatial +self-attention and channel-level feedforward networks. Recent works reveal that +self-supervised learning helps unleash the great potential of ViT. Still, most +works follow self-supervised strategies designed for CNN, e.g., instance-level +discrimination of samples, but they ignore the properties of ViT. We observe +that relational modeling on spatial and channel dimensions distinguishes ViT +from other networks. To enforce this property, we explore the feature +SElf-RElation (SERE) for training self-supervised ViT. Specifically, instead of +conducting self-supervised learning solely on feature embeddings from multiple +views, we utilize the feature self-relations, i.e., spatial/channel +self-relations, for self-supervised learning. Self-relation based learning +further enhances the relation modeling ability of ViT, resulting in stronger +representations that stably improve performance on multiple downstream tasks. +Our source code will be made publicly available. + +
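[Editor's note] A small sketch of the feature self-relation idea (spatial and channel token-to-token similarities used as learning targets across views). The token shape (B, N, C), the softmax temperature, and matching relations with a plain MSE between two augmented views are illustrative assumptions, not the paper's exact recipe.

import torch
import torch.nn.functional as F

def spatial_self_relation(tokens, tau=0.1):
    t = F.normalize(tokens, dim=-1)
    rel = torch.bmm(t, t.transpose(1, 2)) / tau          # (B, N, N) token-to-token similarity
    return F.softmax(rel, dim=-1)

def channel_self_relation(tokens, tau=0.1):
    t = F.normalize(tokens, dim=1)
    rel = torch.bmm(t.transpose(1, 2), t) / tau          # (B, C, C) channel-to-channel similarity
    return F.softmax(rel, dim=-1)

def self_relation_loss(tokens_view1, tokens_view2):
    # Align the self-relations of two augmented views instead of the raw embeddings.
    loss_s = F.mse_loss(spatial_self_relation(tokens_view1), spatial_self_relation(tokens_view2))
    loss_c = F.mse_loss(channel_self_relation(tokens_view1), channel_self_relation(tokens_view2))
    return loss_s + loss_c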
+
+
+
+
+ + ♻ ☆ GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel + Image Segmentation + + +
+ Vessel image segmentation plays a pivotal role in medical diagnostics, aiding +in the early detection and treatment of vascular diseases. While segmentation +based on deep learning has shown promising results, effectively segmenting +small structures and maintaining connectivity between them remains challenging. +To address these limitations, we propose GAEI-UNet, a novel model that combines +global attention and elastic interaction-based techniques. GAEI-UNet leverages +global spatial and channel context information to enhance high-level semantic +understanding within the U-Net architecture, enabling precise segmentation of +small vessels. Additionally, we adopt an elastic interaction-based loss +function to improve connectivity among these fine structures. By capturing the +forces generated by misalignment between target and predicted shapes, our model +effectively learns to preserve the correct topology of vessel networks. +Evaluation on the retinal vessel dataset DRIVE demonstrates the superior +performance of GAEI-UNet in terms of SE and connectivity of small structures, +without significantly increasing computational complexity. This research aims +to advance the field of vessel image segmentation, providing more accurate and +reliable diagnostic tools for the medical community. The implementation code is +available on Code. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2004.03696 by other authors +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Learning from Negative User Feedback and Measuring Responsiveness for + Sequential Recommenders RecSys 2023 + + +
+ Sequential recommenders have been widely used in industry due to their +strength in modeling user preferences. While these models excel at learning a +user's positive interests, less attention has been paid to learning from +negative user feedback. Negative user feedback is an important lever of user +control, and comes with an expectation that recommenders should respond quickly +and reduce similar recommendations to the user. However, negative feedback +signals are often ignored in the training objective of sequential retrieval +models, which primarily aim at predicting positive user interactions. In this +work, we incorporate explicit and implicit negative user feedback into the +training objective of sequential recommenders in the retrieval stage using a +"not-to-recommend" loss function that optimizes for the log-likelihood of not +recommending items with negative feedback. We demonstrate the effectiveness of +this approach using live experiments on a large-scale industrial recommender +system. Furthermore, we address a challenge in measuring recommender +responsiveness to negative feedback by developing a counterfactual simulation +framework to compare recommender responses between different user actions, +showing improved responsiveness from the modeling change. + +
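[Editor's note] A hedged sketch of a "not-to-recommend" objective for a sequential retrieval model, as described in the abstract: the usual log-likelihood for the positively engaged item plus a term maximizing the log-likelihood of not recommending items with negative feedback. The softmax-over-vocabulary setup, weighting, and how negative-feedback items are collected are simplified assumptions relative to the production system.

import torch
import torch.nn.functional as F

def retrieval_loss(logits, positive_item, negative_feedback_items, neg_weight=1.0):
    # logits: (num_items,) next-item scores for one user state;
    # positive_item: scalar index; negative_feedback_items: LongTensor of disliked item indices.
    probs = F.softmax(logits, dim=-1)
    # Standard objective: maximize log-likelihood of the positively engaged item.
    pos_loss = -torch.log(probs[positive_item].clamp_min(1e-8))
    # "Not-to-recommend" term: maximize log-likelihood of NOT recommending disliked items.
    neg_probs = probs[negative_feedback_items]
    neg_loss = -torch.log((1.0 - neg_probs).clamp_min(1e-8)).sum()
    return pos_loss + neg_weight * neg_loss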
+
+ comment: RecSys 2023 Industry Track +
+
+
+
+
+ + ☆ LLMRec: Benchmarking Large Language Models on Recommendation Task + + +
+ Recently, the fast development of Large Language Models (LLMs) such as +ChatGPT has significantly advanced NLP tasks by enhancing the capabilities of +conversational models. However, the application of LLMs in the recommendation +domain has not been thoroughly investigated. To bridge this gap, we propose +LLMRec, an LLM-based recommender system designed for benchmarking LLMs on +various recommendation tasks. Specifically, we benchmark several popular +off-the-shelf LLMs, such as ChatGPT, LLaMA, and ChatGLM, on five recommendation +tasks, including rating prediction, sequential recommendation, direct +recommendation, explanation generation, and review summarization. Furthermore, +we investigate the effectiveness of supervised finetuning to improve LLMs' +instruction compliance ability. The benchmark results indicate that LLMs +display only moderate proficiency in accuracy-based tasks such as sequential +and direct recommendation. However, they demonstrate performance comparable to +state-of-the-art methods in explainability-based tasks. We also conduct +qualitative evaluations to further assess the quality of content generated +by different models, and the results show that LLMs can truly understand the +provided information and generate clearer and more reasonable results. We +hope that this benchmark will serve as an inspiration for researchers to +delve deeper into the potential of LLMs in enhancing recommendation +performance. Our code, processed data, and benchmark results are available at +https://github.com/williamliujl/LLMRec. +
+
+
+
+
+ + ☆ Counterfactual Graph Augmentation for Consumer Unfairness Mitigation in + Recommender Systems CIKM 2023 + + +
+ In recommendation literature, explainability and fairness are becoming two +prominent perspectives to consider. However, prior works have mostly addressed +them separately, for instance by explaining to consumers why a certain item was +recommended or mitigating disparate impacts in recommendation utility. None of +them has leveraged explainability techniques to inform unfairness mitigation. +In this paper, we propose an approach that relies on counterfactual +explanations to augment the set of user-item interactions, such that using them +while inferring recommendations leads to fairer outcomes. Modeling user-item +interactions as a bipartite graph, our approach augments the latter by +identifying new user-item edges that not only can explain the original +unfairness by design, but can also mitigate it. Experiments on two public data +sets show that our approach effectively leads to a better trade-off between +fairness and recommendation utility compared with state-of-the-art mitigation +procedures. We further analyze the characteristics of added edges to highlight +key unfairness patterns. Source code available at +https://github.com/jackmedda/RS-BGExplainer/tree/cikm2023. + +
+
+ comment: Accepted as a short paper at CIKM 2023 +
+
+
+
+
+ + ☆ Hybrid Retrieval and Multi-stage Text Ranking Solution at TREC 2022 Deep + Learning Track + + +
+ Large-scale text retrieval technology has been widely used in various +practical business scenarios. This paper presents our systems for the TREC 2022 +Deep Learning Track. We explain the hybrid text retrieval and multi-stage text +ranking method adopted in our solution. The retrieval stage combined the two +structures of traditional sparse retrieval and neural dense retrieval. In the +ranking stage, in addition to the full interaction-based ranking model built on +a large pre-trained language model, we also propose a lightweight sub-ranking +module to further enhance the final text ranking performance. Evaluation +results demonstrate the effectiveness of our proposed approach. Our models +achieve the 1st and 4th ranks on the test sets of passage ranking and document +ranking, respectively. +
+
+ comment: TREC 2022 Deep Learning Track +
+
+
+
+
+ + ☆ LKPNR: LLM and KG for Personalized News Recommendation Framework + + +
+ Accurately recommending candidate news articles to users is a basic challenge +faced by personalized news recommendation systems. Traditional methods usually +struggle to grasp the complex semantic information in news texts, +resulting in unsatisfactory recommendation results. Besides, these traditional +methods are more friendly to active users with rich historical behaviors. +However, they cannot effectively solve the "long tail problem" of inactive +users. To address these issues, this research presents a novel general +framework that combines Large Language Models (LLM) and Knowledge Graphs (KG) +with the semantic representations of traditional methods. In order to improve +semantic understanding of complex news texts, we use LLMs' powerful text +understanding ability to generate news representations containing rich semantic +information. In addition, our method combines the information about news +entities and mines high-order structural information through multiple hops in +the KG, thus alleviating the challenge of the long-tail distribution. Experimental +results demonstrate that, compared with various traditional models, the +framework significantly improves recommendation performance. The successful +integration of LLM and KG in our framework has established a feasible path for +achieving more accurate personalized recommendations in the news field. Our +code is available at https://github.com/Xuan-ZW/LKPNR. +
+
+
+
+
+ + ☆ Economic Recommender Systems -- A Systematic Review + + +
+ Many of today's online services provide personalized recommendations to their +users. Such recommendations are typically designed to serve certain user needs, +e.g., to quickly find relevant content in situations of information overload. +Correspondingly, the academic literature in the field largely focuses on the +value of recommender systems for the end user. In this context, one underlying +assumption is that the improved service that is achieved through the +recommendations will in turn positively impact the organization's goals, e.g., +in the form of higher customer retention or loyalty. However, in reality, +recommender systems can be used to target organizational economic goals more +directly by incorporating monetary considerations such as price awareness and +profitability aspects into the underlying recommendation models. In this work, +we survey the existing literature on what we call Economic Recommender Systems +based on a systematic review approach that helped us identify 133 relevant +papers. We first categorize existing works along different dimensions and then +review the most important technical approaches from the literature. +Furthermore, we discuss common methodologies to evaluate such systems and +finally outline the limitations of today's research and future directions. + +
+
+
+
+
+ + ☆ Integrating the Wikidata Taxonomy into YAGO + + +
+ Wikidata is one of the largest public general-purpose Knowledge Bases (KBs). +Yet, due to its collaborative nature, its schema and taxonomy have become +convoluted. For the YAGO 4 KB, we combined Wikidata with the ontology from +Schema.org, which reduced and cleaned up the taxonomy and constraints and made +it possible to run automated reasoners on the data. However, it also cut away +large parts of the Wikidata taxonomy. In this paper, we present our effort to +merge the entire Wikidata taxonomy into the YAGO KB as much as possible. We pay +particular attention to logical constraints and a careful distinction of +classes and instances. Our work creates YAGO 4.5, which adds a rich layer of +informative classes to YAGO, while at the same time keeping the KB logically +consistent. + +
+
+
+
+
+ + ♻ ☆ Task Relation-aware Continual User Representation Learning KDD 2023 + + +
+ User modeling, which learns to represent users into a low-dimensional +representation space based on their past behaviors, got a surge of interest +from the industry for providing personalized services to users. Previous +efforts in user modeling mainly focus on learning a task-specific user +representation that is designed for a single task. However, since learning +task-specific user representations for every task is infeasible, recent studies +introduce the concept of universal user representation, which is a more +generalized representation of a user that is relevant to a variety of tasks. +Despite their effectiveness, existing approaches for learning universal user +representations are impractical in real-world applications due to the data +requirement, catastrophic forgetting and the limited learning capability for +continually added tasks. In this paper, we propose a novel continual user +representation learning method, called TERACON, whose learning capability is +not limited as the number of learned tasks increases while capturing the +relationship between the tasks. The main idea is to introduce an embedding for +each task, i.e., task embedding, which is utilized to generate task-specific +soft masks that not only allow the entire model parameters to be updated until +the end of training sequence, but also facilitate the relationship between the +tasks to be captured. Moreover, we introduce a novel knowledge retention module +with pseudo-labeling strategy that successfully alleviates the long-standing +problem of continual learning, i.e., catastrophic forgetting. Extensive +experiments on public and proprietary real-world datasets demonstrate the +superiority and practicality of TERACON. Our code is available at +https://github.com/Sein-Kim/TERACON. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ A Tale of Two Graphs: Freezing and Denoising Graph Structures for + Multimodal Recommendation + + +
+ Multimodal recommender systems utilizing multimodal features (e.g., images +and textual descriptions) typically show better recommendation accuracy than +general recommendation models based solely on user-item interactions. +Generally, prior work fuses multimodal features into item ID embeddings to +enrich item representations, thus failing to capture the latent semantic +item-item structures. In this context, LATTICE proposes to learn the latent +structure between items explicitly and achieves state-of-the-art performance +for multimodal recommendations. However, we argue the latent graph structure +learning of LATTICE is both inefficient and unnecessary. Experimentally, we +demonstrate that freezing its item-item structure before training can also +achieve competitive performance. Based on this finding, we propose a simple yet +effective model, dubbed as FREEDOM, that FREEzes the item-item graph and +DenOises the user-item interaction graph simultaneously for Multimodal +recommendation. Theoretically, we examine the design of FREEDOM through a graph +spectral perspective and demonstrate that it possesses a tighter upper bound on +the graph spectrum. In denoising the user-item interaction graph, we devise a +degree-sensitive edge pruning method, which rejects possibly noisy edges with a +high probability when sampling the graph. We evaluate the proposed model on +three real-world datasets and show that FREEDOM can significantly outperform +current strongest baselines. Compared with LATTICE, FREEDOM achieves an average +improvement of 19.07% in recommendation accuracy while reducing its memory cost +up to 6$\times$ on large graphs. The source code is available at: +https://github.com/enoche/FREEDOM. + +
+
+ comment: Accepted to ACM Multimedia (MM) 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 128 + +
+
+
+ + ☆ D4: Improving LLM Pretraining via Document De-Duplication and + Diversification + + +
+ Over recent years, an increasing amount of compute and data has been poured +into training large language models (LLMs), usually by doing one-pass learning +on as many tokens as possible randomly selected from large-scale web corpora. +While training on ever-larger portions of the internet leads to consistent +performance improvements, the size of these improvements diminishes with scale, +and there has been little work exploring the effect of data selection on +pre-training and downstream performance beyond simple de-duplication methods +such as MinHash. Here, we show that careful data selection (on top of +de-duplicated data) via pre-trained model embeddings can speed up training (20% +efficiency gains) and improve average downstream accuracy on 16 NLP tasks (up +to 2%) at the 6.7B model scale. Furthermore, we show that repeating data +intelligently consistently outperforms baseline training (while repeating +random data performs worse than baseline training). Our results indicate that +clever data selection can significantly improve LLM pre-training, call into +question the common practice of training for a single epoch on as much data as +possible, and demonstrate a path to keep improving our models past the limits +of randomly sampling web data. +
+
+
+
+
+ + ☆ Extended Linear Regression: A Kalman Filter Approach for Minimizing Loss + via Area Under the Curve + + +
+ This research enhances linear regression models by integrating a Kalman +filter and analysing curve areas to minimize loss. The goal is to develop an +optimal linear regression equation using stochastic gradient descent (SGD) for +weight updating. Our approach involves a stepwise process, starting with +user-defined parameters. The linear regression model is trained using SGD, +tracking weights and losses separately and finally pairing them. A Kalman filter +is then trained on the weight and loss arrays to predict the next +consolidated weights. Predictions result from multiplying input averages with +the weights, which are evaluated for loss to form a weight-versus-loss curve. The +curve's equation is derived using the two-point formula, and the area under the +curve is calculated via integration. The linear regression equation with minimum +area becomes the optimal curve for prediction. Benefits include avoiding constant +weight updates via gradient descent and working with partial datasets, unlike +methods needing the entire set. However, computational complexity should be +considered. The Kalman filter's accuracy might diminish beyond a certain +prediction range. +
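As a toy illustration of the curve step described above: with two (weight, loss) points, the two-point formula gives a line, and the area under it over the weight interval follows from elementary integration. Everything below is a simplified sketch of that single step under assumed toy values, not the full pipeline.

```python
def line_from_two_points(w1, l1, w2, l2):
    """Two-point formula: loss = m * weight + b."""
    m = (l2 - l1) / (w2 - w1)
    b = l1 - m * w1
    return m, b

def area_under_line(m, b, w1, w2):
    """Integral of (m*w + b) dw from w1 to w2."""
    antiderivative = lambda w: 0.5 * m * w ** 2 + b * w
    return antiderivative(w2) - antiderivative(w1)

# Example: the candidate whose weight-versus-loss curve has the smallest area
# would be selected as the "optimal" regression, as the abstract suggests.
m, b = line_from_two_points(0.2, 1.5, 0.8, 0.9)
print(area_under_line(m, b, 0.2, 0.8))
```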
+
+
+
+
+ + ☆ On-Manifold Projected Gradient Descent + + +
+ This work provides a computable, direct, and mathematically rigorous +approximation to the differential geometry of class manifolds for +high-dimensional data, along with nonlinear projections from input space onto +these class manifolds. The tools are applied to the setting of neural network +image classifiers, where we generate novel, on-manifold data samples, and +implement a projected gradient descent algorithm for on-manifold adversarial +training. The susceptibility of neural networks (NNs) to adversarial attack +highlights the brittle nature of NN decision boundaries in input space. +Introducing adversarial examples during training has been shown to reduce the +susceptibility of NNs to adversarial attack; however, it has also been shown to +reduce the accuracy of the classifier if the examples are not valid examples +for that class. Realistic "on-manifold" examples have been previously generated +from class manifolds in the latent space of an autoencoder. Our work explores these +phenomena in a geometric and computational setting that is much closer to the +raw, high-dimensional input space than can be provided by a VAE or other black +box dimensionality reductions. We employ conformally invariant diffusion maps +(CIDM) to approximate class manifolds in diffusion coordinates, and develop the +Nystr\"{o}m projection to project novel points onto class manifolds in this +setting. On top of the manifold approximation, we leverage the spectral +exterior calculus (SEC) to determine geometric quantities such as tangent +vectors of the manifold. We use these tools to obtain adversarial examples that +reside on a class manifold, yet fool a classifier. These misclassifications +then become explainable in terms of human-understandable manipulations within +the data, by expressing the on-manifold adversary in the semantic basis on the +manifold. +
+
+
+
+
+ + ☆ Language Reward Modulation for Pretraining Reinforcement Learning + + +
+ Using learned reward functions (LRFs) as a means to solve sparse-reward +reinforcement learning (RL) tasks has yielded some steady progress in +task-complexity through the years. In this work, we question whether today's +LRFs are best-suited as a direct replacement for task rewards. Instead, we +propose leveraging the capabilities of LRFs as a pretraining signal for RL. +Concretely, we propose $\textbf{LA}$nguage Reward $\textbf{M}$odulated +$\textbf{P}$retraining (LAMP) which leverages the zero-shot capabilities of +Vision-Language Models (VLMs) as a $\textit{pretraining}$ utility for RL as +opposed to a downstream task reward. LAMP uses a frozen, pretrained VLM to +scalably generate noisy, albeit shaped exploration rewards by computing the +contrastive alignment between a highly diverse collection of language +instructions and the image observations of an agent in its pretraining +environment. LAMP optimizes these rewards in conjunction with standard +novelty-seeking exploration rewards with reinforcement learning to acquire a +language-conditioned, pretrained policy. Our VLM pretraining approach, which is +a departure from previous attempts to use LRFs, can warmstart sample-efficient +learning on robot manipulation tasks in RLBench. + +
+
+ comment: Code available at https://github.com/ademiadeniji/lamp +
+
+
+
+
+ + ☆ FECoM: A Step towards Fine-Grained Energy Measurement for Deep Learning + + +
+ With the increasing usage, scale, and complexity of Deep Learning (DL) +models, their rapidly growing energy consumption has become a critical concern. +Promoting green development and energy awareness at different granularities is +the need of the hour to limit carbon emissions of DL systems. However, the lack +of standard and repeatable tools to accurately measure and optimize energy +consumption at a fine granularity (e.g., at method level) hinders progress in +this area. In this paper, we introduce FECoM (Fine-grained Energy Consumption +Meter), a framework for fine-grained DL energy consumption measurement. +Specifically, FECoM provides researchers and developers a mechanism to profile +DL APIs. FECoM addresses the challenges of measuring energy consumption at +fine-grained level by using static instrumentation and considering various +factors, including computational load and temperature stability. We assess +FECoM's capability to measure fine-grained energy consumption for one of the +most popular open-source DL frameworks, namely TensorFlow. Using FECoM, we also +investigate the impact of parameter size and execution time on energy +consumption, enriching our understanding of TensorFlow APIs' energy profiles. +Furthermore, we elaborate on the considerations, issues, and challenges that +one needs to consider while designing and implementing a fine-grained energy +consumption measurement tool. We hope this work will facilitate further +advances in DL energy measurement and the development of energy-aware practices +for DL systems. + +
+
+
+
+
+ + ☆ Learning from Negative User Feedback and Measuring Responsiveness for + Sequential Recommenders RecSys 2023 + + +
+ Sequential recommenders have been widely used in industry due to their +strength in modeling user preferences. While these models excel at learning a +user's positive interests, less attention has been paid to learning from +negative user feedback. Negative user feedback is an important lever of user +control, and comes with an expectation that recommenders should respond quickly +and reduce similar recommendations to the user. However, negative feedback +signals are often ignored in the training objective of sequential retrieval +models, which primarily aim at predicting positive user interactions. In this +work, we incorporate explicit and implicit negative user feedback into the +training objective of sequential recommenders in the retrieval stage using a +"not-to-recommend" loss function that optimizes for the log-likelihood of not +recommending items with negative feedback. We demonstrate the effectiveness of +this approach using live experiments on a large-scale industrial recommender +system. Furthermore, we address a challenge in measuring recommender +responsiveness to negative feedback by developing a counterfactual simulation +framework to compare recommender responses between different user actions, +showing improved responsiveness from the modeling change. + +
+
+ comment: RecSys 2023 Industry Track +
+
+
+
+
+ + ☆ How Safe Am I Given What I See? Calibrated Prediction of Safety Chances + for Image-Controlled Autonomy + + +
+ End-to-end learning has emerged as a major paradigm for developing autonomous +systems. Unfortunately, with its performance and convenience comes an even +greater challenge of safety assurance. A key factor of this challenge is the +absence of the notion of a low-dimensional and interpretable dynamical state, +around which traditional assurance methods revolve. Focusing on the online +safety prediction problem, this paper proposes a configurable family of +learning pipelines based on generative world models, which do not require +low-dimensional states. To implement these pipelines, we overcome the +challenges of learning safety-informed latent representations and missing +safety labels under prediction-induced distribution shift. These pipelines come +with statistical calibration guarantees on their safety chance predictions +based on conformal prediction. We perform an extensive evaluation of the +proposed learning pipelines on two case studies of image-controlled systems: a +racing car and a cartpole. + +
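The statistical calibration guarantee mentioned above comes from conformal prediction; in its simplest split form, a threshold is chosen on a held-out calibration set so that runs that actually became unsafe are flagged with the desired probability. The sketch below shows that generic calibration step only, not the paper's world-model pipeline; all names are illustrative.

```python
import numpy as np

def conformal_threshold(cal_scores, cal_unsafe, alpha=0.1):
    """cal_scores: predicted chance of being unsafe for calibration runs (array).
    cal_unsafe: boolean array, whether each run actually became unsafe.
    Returns a score threshold such that, with probability about 1 - alpha,
    a truly unsafe run is flagged (standard split-conformal quantile)."""
    s = np.sort(np.asarray(cal_scores)[np.asarray(cal_unsafe)])
    n = len(s)
    k = int(np.floor(alpha * (n + 1)))
    if k == 0:
        return -np.inf  # too little calibration data for this alpha: flag everything
    return s[k - 1]

def flag_unsafe(score, threshold):
    return score >= threshold
```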
+
+
+
+
+ + ☆ How to Protect Copyright Data in Optimization of Large Language Models? + + +
+ Large language models (LLMs) and generative AI have played a transformative +role in computer research and applications. Controversy has arisen as to +whether these models output copyrighted data, which can occur if the data the +models are trained on is copyrighted. LLMs are built on the transformer neural +network architecture, which in turn relies on a mathematical computation called +Attention that uses the softmax function. + In this paper, we show that large language model training and optimization +can be seen as a softmax regression problem. We then establish a method of +efficiently performing softmax regression, in a way that prevents the +regression function from generating copyright data. This establishes a +theoretical method of training large language models in a way that avoids +generating copyright data. + +
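To make the softmax-regression framing above concrete, a plain softmax regression problem fits parameters $x$ so that $\mathrm{softmax}(Ax)$ matches a target distribution $b$; the sketch below writes that objective directly. It is a generic illustration only; the paper's copyright-avoiding regularization is not reproduced here.

```python
import numpy as np

def softmax(v):
    z = v - v.max()
    e = np.exp(z)
    return e / e.sum()

def softmax_regression_loss(x, A, b):
    """|| softmax(A x) - b ||^2 for one training example (illustrative form)."""
    return float(np.sum((softmax(A @ x) - b) ** 2))

# Gradient descent on a loss of this shape is the basic training loop the
# paper analyzes; constraints/regularizers are where copyright protection enters.
```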
+
+
+
+
+ + ☆ Multi-Objective Optimization for Sparse Deep Neural Network Training + + +
+ Different conflicting optimization criteria arise naturally in various Deep +Learning scenarios. These can address different main tasks (i.e., in the +setting of Multi-Task Learning), but also main and secondary tasks such as loss +minimization versus sparsity. The usual approach is a simple weighting of the +criteria, which formally only works in the convex setting. In this paper, we +present a Multi-Objective Optimization algorithm using a modified Weighted +Chebyshev scalarization for training Deep Neural Networks (DNNs) with respect +to several tasks. By employing this scalarization technique, the algorithm can +identify all optimal solutions of the original problem while reducing its +complexity to a sequence of single-objective problems. The simplified problems +are then solved using an Augmented Lagrangian method, enabling the use of +popular optimization techniques such as Adam and Stochastic Gradient Descent, +while efficaciously handling constraints. Our work aims to address the +(economical and also ecological) sustainability issue of DNN models, with a +particular focus on Deep Multi-Task models, which are typically designed with a +very large number of weights to perform equally well on multiple tasks. Through +experiments conducted on two Machine Learning datasets, we demonstrate the +possibility of adaptively sparsifying the model during training without +significantly impacting its performance, if we are willing to apply +task-specific adaptations to the network weights. Code is available at +https://github.com/salomonhotegni/MDMTN. + +
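For readers unfamiliar with the scalarization mentioned above, a weighted Chebyshev scalarization turns several objectives $f_i$ into one by minimizing the largest weighted deviation from a reference point $z^*$. The tiny sketch below shows the plain form of that scalar objective; the paper's modification and the Augmented Lagrangian constraint handling are not reproduced, and all values are illustrative.

```python
import numpy as np

def weighted_chebyshev(fs, weights, z_star):
    """Plain weighted Chebyshev scalarization:  max_i  w_i * (f_i(x) - z*_i).
    fs: objective values at the current x, weights: positive weights,
    z_star: utopia/reference point (all illustrative)."""
    fs, weights, z_star = map(np.asarray, (fs, weights, z_star))
    return np.max(weights * (fs - z_star))

# e.g. trading off task loss against a sparsity measure of the network weights:
print(weighted_chebyshev(fs=[0.35, 0.10], weights=[0.7, 0.3], z_star=[0.0, 0.0]))
```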
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Critical Learning Periods Emerge Even in Deep Linear Networks + + +
+ Critical learning periods are periods early in development where temporary +sensory deficits can have a permanent effect on behavior and learned +representations. Despite the radical differences between biological and +artificial networks, critical learning periods have been empirically observed +in both systems. This suggests that critical periods may be fundamental to +learning and not an accident of biology. Yet, why exactly critical periods +emerge in deep networks is still an open question, and in particular it is +unclear whether the critical periods observed in both systems depend on +particular architectural or optimization details. To isolate the key underlying +factors, we focus on deep linear network models, and show that, surprisingly, +such networks also display much of the behavior seen in biology and artificial +networks, while being amenable to analytical treatment. We show that critical +periods depend on the depth of the model and structure of the data +distribution. We also show analytically and in simulations that the learning of +features is tied to competition between sources. Finally, we extend our +analysis to multi-task learning to show that pre-training on certain tasks can +damage the transfer performance on new tasks, and show how this depends on the +relationship between tasks and the duration of the pre-training stage. To the +best of our knowledge, our work provides the first analytically tractable model +that sheds light into why critical learning periods emerge in biological and +artificial networks. + +
+
+
+
+
+ + ☆ Diffusion Language Models Can Perform Many Tasks with Scaling and + Instruction-Finetuning + + +
+ The recent surge of generative AI has been fueled by the generative power of +diffusion probabilistic models and the scalable capabilities of large language +models. Despite their potential, it remains elusive whether diffusion language +models can solve general language tasks comparable to their autoregressive +counterparts. This paper demonstrates that scaling diffusion models w.r.t. +data, sizes, and tasks can effectively make them strong language learners. We +build competent diffusion language models at scale by first acquiring knowledge +from massive data via masked language modeling pretraining thanks to their +intrinsic connections. We then reprogram pretrained masked language models into +diffusion language models via diffusive adaptation, wherein task-specific +finetuning and instruction finetuning are explored to unlock their versatility +in solving general language tasks. Experiments show that scaling diffusion +language models consistently improves performance across downstream language +tasks. We further discover that instruction finetuning can elicit zero-shot and +few-shot in-context learning abilities that help tackle many unseen tasks by +following natural language instructions, and show promise in advanced and +challenging abilities such as reasoning + +
+
+
+
+
+ + ☆ The Challenges of Machine Learning for Trust and Safety: A Case Study on + Misinformation Detection + + +
+ We examine the disconnect between scholarship and practice in applying +machine learning to trust and safety problems, using misinformation detection +as a case study. We systematize literature on automated detection of +misinformation across a corpus of 270 well-cited papers in the field. We then +examine subsets of papers for data and code availability, design missteps, +reproducibility, and generalizability. We find significant shortcomings in the +literature that call into question claimed performance and practicality. +Detection tasks are often meaningfully distinct from the challenges that online +services actually face. Datasets and model evaluation are often +non-representative of real-world contexts, and evaluation frequently is not +independent of model training. Data and code availability is poor. Models do +not generalize well to out-of-domain data. Based on these results, we offer +recommendations for evaluating machine learning applications to trust and +safety problems. Our aim is for future work to avoid the pitfalls that we +identify. + +
+
+
+
+
+ + ☆ Learning to Learn Financial Networks for Optimising Momentum Strategies + + +
+ Network momentum provides a novel type of risk premium, which exploits the +interconnections among assets in a financial network to predict future returns. +However, the current process of constructing financial networks relies heavily +on expensive databases and financial expertise, limiting accessibility for +small-sized and academic institutions. Furthermore, the traditional approach +treats network construction and portfolio optimisation as separate tasks, +potentially hindering optimal portfolio performance. To address these +challenges, we propose L2GMOM, an end-to-end machine learning framework that +simultaneously learns financial networks and optimises trading signals for +network momentum strategies. The L2GMOM model is a neural network with a +highly interpretable forward-propagation architecture derived from +algorithm unrolling. L2GMOM is flexible and can be trained with diverse +loss functions for portfolio performance, e.g., the negative Sharpe ratio. +Backtesting on 64 continuous futures contracts demonstrates a significant +improvement in portfolio profitability and risk control, with a Sharpe ratio of +1.74 across a 20-year period. +
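The abstract notes that L2GMOM can be trained with portfolio-level losses such as the negative Sharpe ratio; the snippet below is a generic version of that loss on a series of portfolio returns. The annualization factor and the absence of transaction-cost terms are assumptions, not details taken from the paper.

```python
import numpy as np

def negative_sharpe_loss(portfolio_returns, periods_per_year=252, eps=1e-8):
    """Negative (annualized) Sharpe ratio of a return series, to be minimized."""
    r = np.asarray(portfolio_returns)
    sharpe = np.sqrt(periods_per_year) * r.mean() / (r.std() + eps)
    return -sharpe

# Example: daily returns of a network-momentum portfolio
print(negative_sharpe_loss([0.001, -0.002, 0.003, 0.0005]))
```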
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ ULDP-FL: Federated Learning with Across Silo User-Level Differential + Privacy + + +
+ Differentially Private Federated Learning (DP-FL) has garnered attention as a +collaborative machine learning approach that ensures formal privacy. Most DP-FL +approaches ensure DP at the record-level within each silo for cross-silo FL. +However, a single user's data may extend across multiple silos, and the desired +user-level DP guarantee for such a setting remains unknown. In this study, we +present ULDP-FL, a novel FL framework designed to guarantee user-level DP in +cross-silo FL where a single user's data may belong to multiple silos. Our +proposed algorithm directly ensures user-level DP through per-user weighted +clipping, departing from group-privacy approaches. We provide a theoretical +analysis of the algorithm's privacy and utility. Additionally, we enhance the +algorithm's utility and showcase its private implementation using cryptographic +building blocks. Empirical experiments on real-world datasets show substantial +improvements in our methods in privacy-utility trade-offs under user-level DP +compared to baseline methods. To the best of our knowledge, our work is the +first FL framework that effectively provides user-level DP in the general +cross-silo FL setting. + +
+
+
+
+
+ + ☆ Curriculum Learning with Adam: The Devil Is in the Wrong Details + + +
+ Curriculum learning (CL) posits that machine learning models -- similar to +humans -- may learn more efficiently from data that match their current +learning progress. However, CL methods are still poorly understood and, in +particular for natural language processing (NLP), have achieved only limited +success. In this paper, we explore why. Starting from an attempt to replicate +and extend a number of recent curriculum methods, we find that their results +are surprisingly brittle when applied to NLP. A deep dive into the +(in)effectiveness of the curricula in some scenarios shows us why: when +curricula are employed in combination with the popular Adam optimisation +algorithm, they oftentimes learn to adapt to suboptimally chosen optimisation +parameters for this algorithm. We present a number of different case studies +with different common hand-crafted and automated CL approaches to illustrate +this phenomenon, and we find that none of them outperforms optimisation with +only Adam with well-chosen hyperparameters. As such, our results contribute to +understanding why CL methods work, but at the same time urge caution when +claiming positive results. + +
+
+
+
+
+ + ☆ Self-Supervised Knowledge-Driven Deep Learning for 3D Magnetic Inversion + + +
+ The magnetic inversion method is one of the non-destructive geophysical +methods, which aims to estimate the subsurface susceptibility distribution from +surface magnetic anomaly data. Recently, supervised deep learning methods have +been widely utilized in many geophysical fields, including magnetic +inversion. However, these methods rely heavily on synthetic training data, and +their performance is limited since the synthetic data does not follow the same +distribution as the field data. Thus, we propose to realize +magnetic inversion by self-supervised deep learning. The proposed +self-supervised knowledge-driven 3D magnetic inversion method (SSKMI) learns on +the target field data by a closed loop of the inversion and forward models. +Given that the parameters of the forward model are preset, SSKMI can optimize +the inversion model by minimizing the mean absolute error between observed and +re-estimated surface magnetic anomalies. Besides, there is a knowledge-driven +module in the proposed inversion model, which makes the deep learning method +more explainable. Meanwhile, comparative experiments demonstrate that the +knowledge-driven module can accelerate the training of the proposed method and +achieve better results. Since magnetic inversion is an ill-posed task, SSKMI +constrains the inversion model with a guideline in the auxiliary loop. +The experimental results demonstrate that the proposed method is a reliable +magnetic inversion method with outstanding performance. +
+
+ comment: 11 pages, 14 figures +
+
+
+
+
+ + ☆ Robustness Analysis of Continuous-Depth Models with Lagrangian + Techniques + + +
+ This paper presents, in a unified fashion, deterministic as well as +statistical Lagrangian-verification techniques. They formally quantify the +behavioral robustness of any time-continuous process, formulated as a +continuous-depth model. To this end, we review LRT-NG, SLR, and GoTube, +algorithms for constructing a tight reachtube, that is, an over-approximation +of the set of states reachable within a given time-horizon, and provide +guarantees for the reachtube bounds. We compare the usage of the variational +equations, associated to the system equations, the mean value theorem, and the +Lipschitz constants, in achieving deterministic and statistical guarantees. In +LRT-NG, the Lipschitz constant is used as a bloating factor of the initial +perturbation, to compute the radius of an ellipsoid in an optimal metric, which +over-approximates the set of reachable states. In SLR and GoTube, we get +statistical guarantees, by using the Lipschitz constants to compute local balls +around samples. These are needed to calculate the probability of having found +an upper bound, of the true maximum perturbation at every timestep. Our +experiments demonstrate the superior performance of Lagrangian techniques, when +compared to LRT, Flow*, and CAPD, and illustrate their use in the robustness +analysis of various continuous-depth models. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2107.08467 +
+
+
+
+
+ + ☆ Development and external validation of a lung cancer risk estimation + tool using gradient-boosting + + +
+ Lung cancer is a significant cause of mortality worldwide, emphasizing the +importance of early detection for improved survival rates. In this study, we +propose a machine learning (ML) tool trained on data from the PLCO Cancer +Screening Trial and validated on the NLST to estimate the likelihood of lung +cancer occurrence within five years. The study utilized two datasets, the PLCO +(n=55,161) and NLST (n=48,595), consisting of comprehensive information on risk +factors, clinical measurements, and outcomes related to lung cancer. Data +preprocessing involved removing patients who were not current or former smokers +and those who had died of causes unrelated to lung cancer. Additionally, a +focus was placed on mitigating bias caused by censored data. Feature selection, +hyper-parameter optimization, and model calibration were performed using +XGBoost, an ensemble learning algorithm that combines gradient boosting and +decision trees. The ML model was trained on the pre-processed PLCO dataset and +tested on the NLST dataset. The model incorporated features such as age, +gender, smoking history, medical diagnoses, and family history of lung cancer. +The model was well-calibrated (Brier score=0.044). ROC-AUC was 82% on the PLCO +dataset and 70% on the NLST dataset. PR-AUC was 29% and 11% respectively. When +compared to the USPSTF guidelines for lung cancer screening, our model provided +the same recall with a precision of 13.1% vs. 9.3% on the PLCO dataset and 3.2% +vs. 3.1% on the NLST dataset. The developed ML tool provides a freely available +web application for estimating the likelihood of developing lung cancer within +five years. By utilizing risk factors and clinical data, individuals can assess +their risk and make informed decisions regarding lung cancer screening. This +research contributes to the efforts in early detection and prevention +strategies, aiming to reduce lung cancer-related mortality rates. + +
+
+ comment: 14 pages, 4 figures, 4 tables, 1 Github repository, see + http://github.com/plbenveniste/LungCancerRisk +
+
+
+
+
+ + ☆ Unsupervised anomalies detection in IIoT edge devices networks using + federated learning + + +
+ In a network of many IoT devices that each collect data, training a machine +learning model would normally involve transmitting the data to a central +server, which requires strict privacy rules. However, some owners are reluctant +to make their data available outside the company due to data security concerns. +Federated learning (FL), as a distributed machine learning approach, performs +training of a machine learning model on the device that gathered the data +itself. In this scenario, data is not shared over the network for training +purposes. FedAvg, as one of the FL algorithms, permits a model to be copied to +participating devices during a training session. The devices can be chosen at +random, and a device can be aborted. The resulting models are sent to the +coordinating server, which then averages the models from the devices that finished +training. The process is repeated until a desired model accuracy is achieved. +By doing this, the FL approach solves the privacy problem for IoT/IIoT devices +that hold sensitive data for their owners. In this paper, we leverage the +benefits of FL and implement the FedAvg algorithm on a recent dataset that +represents modern IoT/IIoT device networks. The results were almost the +same as with the centralized machine learning approach. We also evaluated some +shortcomings of FedAvg, such as the unfairness that arises during training when +struggling devices do not participate in every stage of training. This +inefficient training of the local or global model could lead to a high number of +false alarms in intrusion detection systems for IoT/IIoT gadgets developed +using FedAvg. Hence, after evaluating the FedAvg deep autoencoder against a +centralized deep autoencoder, we further propose and design a Fair +FedAvg algorithm that will be evaluated in future work. +
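Since the paper builds on FedAvg, a compact reminder of that aggregation step may help: each participating device trains locally, and the server takes a data-size-weighted average of the returned parameters. This is the textbook FedAvg update written generically, not the paper's anomaly-detection code.

```python
import numpy as np

def fedavg_aggregate(client_weights, client_sizes):
    """client_weights: list of parameter vectors from the devices that finished
    local training; client_sizes: number of local samples each device used."""
    sizes = np.asarray(client_sizes, dtype=float)
    coeffs = sizes / sizes.sum()
    return sum(c * w for c, w in zip(coeffs, client_weights))

# One communication round with three edge devices (toy parameters):
global_w = fedavg_aggregate(
    [np.array([0.1, 0.2]), np.array([0.3, 0.1]), np.array([0.2, 0.4])],
    client_sizes=[100, 50, 150],
)
```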
+
+ comment: Accepted for publication in machine learning journals +
+
+
+
+
+ + ☆ Data-driven decision-focused surrogate modeling + + +
+ We introduce the concept of decision-focused surrogate modeling for solving +computationally challenging nonlinear optimization problems in real-time +settings. The proposed data-driven framework seeks to learn a simpler, e.g. +convex, surrogate optimization model that is trained to minimize the decision +prediction error, which is defined as the difference between the optimal +solutions of the original and the surrogate optimization models. The learning +problem, formulated as a bilevel program, can be viewed as a data-driven +inverse optimization problem to which we apply a decomposition-based solution +algorithm from previous work. We validate our framework through numerical +experiments involving the optimization of common nonlinear chemical processes +such as chemical reactors, heat exchanger networks, and material blending +systems. We also present a detailed comparison of decision-focused surrogate +modeling with standard data-driven surrogate modeling methods and demonstrate +that our approach is significantly more data-efficient while producing simple +surrogate models with high decision prediction accuracy. + +
+
+
+
+
+ + ☆ A Probabilistic Fluctuation based Membership Inference Attack for + Generative Models + + +
+ Membership Inference Attack (MIA) identifies whether a record exists in a +machine learning model's training set by querying the model. MIAs on the +classic classification models have been well-studied, and recent works have +started to explore how to transplant MIA onto generative models. Our +investigation indicates that existing MIAs designed for generative models +mainly depend on the overfitting in target models. However, overfitting can be +avoided by employing various regularization techniques, so existing MIAs +demonstrate poor performance in practice. Unlike overfitting, memorization is +essential for deep learning models to attain optimal performance, making it a +more prevalent phenomenon. Memorization in generative models leads to an +increasing trend in the probability distribution of generating records around +the member record. Therefore, we propose a Probabilistic Fluctuation Assessing +Membership Inference Attack (PFAMI), a black-box MIA that infers memberships by +detecting these trends via analyzing the overall probabilistic fluctuations +around given records. We conduct extensive experiments across multiple +generative models and datasets, which demonstrate that PFAMI can improve the attack +success rate (ASR) by about 27.9% when compared with the best baseline. +
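A rough sketch of the fluctuation test described above: query the generative model's (approximate) likelihood at the candidate record and at small perturbations of it; if the record sits on a local probability peak relative to its neighbourhood, that is taken as evidence of membership. The perturbation scheme, scoring function, and threshold below are assumptions for illustration, not the paper's exact procedure.

```python
import numpy as np

def fluctuation_score(log_prob_fn, record, n_neighbors=20, sigma=0.05, rng=None):
    """log_prob_fn: black-box estimate of the model's log-probability of a record.
    record: numpy array representing the candidate record.
    Returns how much the record's probability exceeds its perturbed neighbours'."""
    if rng is None:
        rng = np.random.default_rng(0)
    center = log_prob_fn(record)
    neighbors = [log_prob_fn(record + sigma * rng.standard_normal(record.shape))
                 for _ in range(n_neighbors)]
    return center - np.mean(neighbors)   # large positive value => suspected member

def infer_membership(log_prob_fn, record, threshold=0.5):
    return fluctuation_score(log_prob_fn, record) > threshold
```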
+
+
+
+
+ + ☆ Masking Strategies for Background Bias Removal in Computer Vision Models ICCV + + +
+ Models for fine-grained image classification tasks, where the difference +between some classes can be extremely subtle and the number of samples per +class tends to be low, are particularly prone to picking up background-related +biases and demand robust methods to handle potential examples with +out-of-distribution (OOD) backgrounds. To gain deeper insights into this +critical problem, our research investigates the impact of background-induced +bias on fine-grained image classification, evaluating standard backbone models +such as Convolutional Neural Network (CNN) and Vision Transformers (ViT). We +explore two masking strategies to mitigate background-induced bias: Early +masking, which removes background information at the (input) image level, and +late masking, which selectively masks high-level spatial features corresponding +to the background. Extensive experiments assess the behavior of CNN and ViT +models under different masking strategies, with a focus on their generalization +to OOD backgrounds. The obtained findings demonstrate that both proposed +strategies enhance OOD performance compared to the baseline models, with early +masking consistently exhibiting the best OOD performance. Notably, a ViT +variant employing GAP-Pooled Patch token-based classification combined with +early masking achieves the highest OOD robustness. + +
+
+ comment: Accepted at the 2023 IEEE/CVF International Conference on Computer + Vision Workshop (ICCVW) on Out Of Distribution Generalization in Computer + Vision (OOD-CV) +
+
+
+
+
+ + ☆ An Accelerated Block Proximal Framework with Adaptive Momentum for + Nonconvex and Nonsmooth Optimization + + +
+ We propose an accelerated block proximal linear framework with adaptive +momentum (ABPL$^+$) for nonconvex and nonsmooth optimization. We analyze the +potential causes of the extrapolation step failing in some algorithms, and +resolve this issue by enhancing the comparison process that evaluates the +trade-off between the proximal gradient step and the linear extrapolation step +in our algorithm. Furthermore, we extend our algorithm to any scenario +involving updating block variables with positive integers, allowing each cycle +to randomly shuffle the update order of the variable blocks. Additionally, +under mild assumptions, we prove that ABPL$^+$ can monotonically decrease the +function value without strictly restricting the extrapolation parameters and +step size, demonstrate the viability and effectiveness of updating these +blocks in a random order, and show more directly and intuitively +that the derivative set of the sequence generated by our algorithm +is a critical point set. Moreover, we demonstrate the global convergence as +well as the linear and sublinear convergence rates of our algorithm by +utilizing the Kurdyka-Lojasiewicz (K{\L}) condition. To enhance the +effectiveness and flexibility of our algorithm, we also extend the study to an +inexact version of our algorithm and construct an adaptive extrapolation +parameter strategy, which improves its overall performance. We apply our +algorithm to multiple non-negative matrix factorization with the $\ell_0$ norm, +nonnegative tensor decomposition with the $\ell_0$ norm, and perform extensive +numerical experiments to validate its effectiveness and efficiency. +
+
+
+
+
+ + ☆ An Open-Source ML-Based Full-Stack Optimization Framework for Machine + Learning Accelerators + + +
+ Parameterizable machine learning (ML) accelerators are the product of recent +breakthroughs in ML. To fully enable their design space exploration (DSE), we +propose a physical-design-driven, learning-based prediction framework for +hardware-accelerated deep neural network (DNN) and non-DNN ML algorithms. It +adopts a unified approach that combines backend power, performance, and area +(PPA) analysis with frontend performance simulation, thereby achieving a +realistic estimation of both backend PPA and system metrics such as runtime and +energy. In addition, our framework includes a fully automated DSE technique, +which optimizes backend and system metrics through an automated search of +architectural and backend parameters. Experimental studies show that our +approach consistently predicts backend PPA and system metrics with an average +7% or less prediction error for the ASIC implementation of two deep learning +accelerator platforms, VTA and VeriGOOD-ML, in both a commercial 12 nm process +and a research-oriented 45 nm process. + +
+
+ comment: This is an extended version of our work titled "Physically Accurate + Learning-based Performance Prediction of Hardware-accelerated ML Algorithms" + published in MLCAD 2022 +
+
+
+
+
+ + ☆ Less is More -- Towards parsimonious multi-task models using structured + sparsity + + +
+ Group sparsity in Machine Learning (ML) encourages simpler, more +interpretable models with fewer active parameter groups. This work aims to +incorporate structured group sparsity into the shared parameters of a +Multi-Task Learning (MTL) framework, to develop parsimonious models that can +effectively address multiple tasks with fewer parameters while maintaining +comparable or superior performance to a dense model. Sparsifying the model +during training helps decrease the model's memory footprint, computation +requirements, and prediction time during inference. We use channel-wise l1/l2 +group sparsity in the shared layers of the Convolutional Neural Network (CNN). +This approach not only facilitates the elimination of extraneous groups +(channels) but also imposes a penalty on the weights, thereby enhancing the +learning of all tasks. We compare the outcomes of single-task and multi-task +experiments under group sparsity on two publicly available MTL datasets, NYU-v2 +and CelebAMask-HQ. We also investigate how changing the sparsification degree +impacts both the performance of the model and the sparsity of groups. + +
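The channel-wise l1/l2 group sparsity mentioned above is essentially a group-lasso penalty: an l2 norm is taken over each output channel's weights and the l1 sum of those norms is added to the loss, pushing whole channels to zero. The snippet is a generic NumPy version of that penalty under assumed tensor shapes, not the authors' training code.

```python
import numpy as np

def channel_group_sparsity(conv_weight, lam=1e-4):
    """conv_weight: (out_channels, in_channels, kH, kW) array of a shared layer.
    Returns lam * sum over output channels of the l2 norm of that channel's weights."""
    per_channel = np.sqrt(
        (conv_weight ** 2).reshape(conv_weight.shape[0], -1).sum(axis=1)
    )
    return lam * per_channel.sum()

# total_loss = sum of per-task losses + channel_group_sparsity(shared_layer_weights)
```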
+
+ comment: Under review +
+
+
+
+
+ + ☆ Generalized Continual Category Discovery + + +
+ Most Continual Learning (CL) methods push the limits of supervised learning +settings, where an agent is expected to learn new labeled tasks and not forget +previous knowledge. However, these settings are not well aligned with real-life +scenarios, where a learning agent has access to a vast amount of unlabeled data +encompassing both novel (entirely unlabeled) classes and examples from known +classes. Drawing inspiration from Generalized Category Discovery (GCD), we +introduce a novel framework that relaxes this assumption. Precisely, in any +task, we allow for the existence of novel and known classes, and one must use +a continual version of unsupervised learning methods to discover them. We call +this setting Generalized Continual Category Discovery (GCCD). It unifies CL and +GCD, bridging the gap between synthetic benchmarks and real-life scenarios. +With a series of experiments, we show that existing methods fail to +accumulate knowledge from subsequent tasks in which unlabeled samples of novel +classes are present. In light of these limitations, we propose a method that +incorporates both supervised and unsupervised signals and mitigates the +forgetting through the use of centroid adaptation. Our method surpasses strong +CL methods adopted for GCD techniques and achieves superior representation +learning performance. +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Quantifying degeneracy in singular models via the learning coefficient + + +
+ Deep neural networks (DNN) are singular statistical models which exhibit +complex degeneracies. In this work, we illustrate how a quantity known as the +\emph{learning coefficient} introduced in singular learning theory quantifies +precisely the degree of degeneracy in deep neural networks. Importantly, we +will demonstrate that degeneracy in DNN cannot be accounted for by simply +counting the number of "flat" directions. We propose a computationally scalable +approximation of a localized version of the learning coefficient using +stochastic gradient Langevin dynamics. To validate our approach, we demonstrate +its accuracy in low-dimensional models with known theoretical values. +Importantly, the local learning coefficient can correctly recover the ordering +of degeneracy between various parameter regions of interest. An experiment on +MNIST shows the local learning coefficient can reveal the inductive bias of +stochastic optimizers for more or less degenerate critical points. +
+
+ comment: 22 pages, 10 figures +
+
+
+
+
+ + ☆ Cached Operator Reordering: A Unified View for Fast GNN Training + + +
+ Graph Neural Networks (GNNs) are a powerful tool for handling structured +graph data and addressing tasks such as node classification, graph +classification, and clustering. However, the sparse nature of GNN computation +poses new challenges for performance optimization compared to traditional deep +neural networks. We address these challenges by providing a unified view of GNN +computation, I/O, and memory. By analyzing the computational graphs of the +Graph Convolutional Network (GCN) and Graph Attention (GAT) layers -- two +widely used GNN layers -- we propose alternative computation strategies. We +present adaptive operator reordering with caching, which achieves a speedup of +up to 2.43x for GCN compared to the current state-of-the-art. Furthermore, an +exploration of different caching schemes for GAT yields a speedup of up to +1.94x. The proposed optimizations save memory, are easily implemented across +various hardware platforms, and have the potential to alleviate performance +bottlenecks in training large-scale GNN models. + +
+
+
+
+
+ + ☆ Stabilizing RNN Gradients through Pre-training + + +
+ Numerous theories of learning suggest preventing the gradient variance from +growing exponentially with depth or time, to stabilize and improve training. +Typically, these analyses are conducted on feed-forward fully-connected neural +networks or single-layer recurrent neural networks, given their mathematical +tractability. In contrast, this study demonstrates that pre-training the +network to local stability can be effective whenever the architectures are too +complex for an analytical initialization. Furthermore, we extend known +stability theories to encompass a broader family of deep recurrent networks, +requiring minimal assumptions on data and parameter distribution, a theory that +we refer to as the Local Stability Condition (LSC). Our investigation reveals +that the classical Glorot, He, and Orthogonal initialization schemes satisfy +the LSC when applied to feed-forward fully-connected neural networks. However, +analysing deep recurrent networks, we identify a new additive source of +exponential explosion that emerges from counting gradient paths in a +rectangular grid in depth and time. We propose a new approach to mitigate this +issue, which consists of giving a weight of one half to the time and depth +contributions to the gradient, instead of the classical weight of one. Our +empirical results confirm that pre-training both feed-forward and recurrent +networks to fulfill the LSC often results in improved final performance across +models. This study contributes to the field by providing a means to stabilize +networks of any complexity. Our approach can be implemented as an additional +step before pre-training on large augmented datasets, and as an alternative to +finding stable initializations analytically. +
+
+
+
+
+ + ☆ Identifying Reaction-Aware Driving Styles of Stochastic Model Predictive + Controlled Vehicles by Inverse Reinforcement Learning + + +
+ The driving style of an Autonomous Vehicle (AV) refers to how it behaves and +interacts with other AVs. In a multi-vehicle autonomous driving system, an AV +capable of identifying the driving styles of its nearby AVs can reliably +evaluate the risk of collisions and make more reasonable driving decisions. +However, there has not been a consistent definition of driving styles for an AV +in the literature, although it is considered that the driving style is encoded +in the AV's trajectories and can be identified using Maximum Entropy Inverse +Reinforcement Learning (ME-IRL) methods as a cost function. Nevertheless, an +important indicator of the driving style, i.e., how an AV reacts to its nearby +AVs, is not fully incorporated in the feature design of previous ME-IRL +methods. In this paper, we describe the driving style as a cost function of a +series of weighted features. We design additional novel features to capture the +AV's reaction-aware characteristics. Then, we identify the driving styles from +the demonstration trajectories generated by the Stochastic Model Predictive +Control (SMPC) using a modified ME-IRL method with our newly proposed features. +The proposed method is validated using MATLAB simulation and an off-the-shelf +experiment. + +
+
+
+
+
+ + ☆ InstructionGPT-4: A 200-Instruction Paradigm for Fine-Tuning MiniGPT-4 + + +
+ Multimodal large language models acquire their instruction-following +capabilities through a two-stage training process: pre-training on image-text +pairs and fine-tuning on supervised vision-language instruction data. Recent +studies have shown that large language models can achieve satisfactory results +even with a limited amount of high-quality instruction-following data. In this +paper, we introduce InstructionGPT-4, which is fine-tuned on a small dataset +comprising only 200 examples, amounting to approximately 6% of the +instruction-following data used in the alignment dataset for MiniGPT-4. We +first propose several metrics to assess the quality of multimodal instruction +data. Based on these metrics, we present a simple and effective data selector +to automatically identify and filter low-quality vision-language data. By +employing this method, InstructionGPT-4 outperforms the original MiniGPT-4 on +various evaluations (e.g., visual question answering, GPT-4 preference). +Overall, our findings demonstrate that less, but higher-quality, instruction +tuning data can efficiently enable multimodal large language models to generate +better output. +
+
+
+
+
+ + ☆ Pre-gated MoE: An Algorithm-System Co-Design for Fast and Scalable + Mixture-of-Expert Inference + + +
+ Large language models (LLMs) based on transformers have made significant +strides in recent years, the success of which is driven by scaling up their +model size. Despite their high algorithmic performance, the computational and +memory requirements of LLMs present unprecedented challenges. To tackle the +high compute requirements of LLMs, the Mixture-of-Experts (MoE) architecture +was introduced which is able to scale its model size without proportionally +scaling up its computational requirements. Unfortunately, MoE's high memory +demands and dynamic activation of sparse experts restrict its applicability to +real-world problems. Previous solutions that offload MoE's memory-hungry expert +parameters to CPU memory fall short because the latency to migrate activated +experts from CPU to GPU incurs high performance overhead. Our proposed +Pre-gated MoE system effectively tackles the compute and memory challenges of +conventional MoE architectures using our algorithm-system co-design. Pre-gated +MoE employs our novel pre-gating function which alleviates the dynamic nature +of sparse expert activation, allowing our proposed system to address the large +memory footprint of MoEs while also achieving high performance. We demonstrate +that Pre-gated MoE is able to improve performance, reduce GPU memory +consumption, while also maintaining the same level of model quality. These +features allow our Pre-gated MoE system to cost-effectively deploy large-scale +LLMs using just a single GPU with high performance. + +
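A conceptual sketch of the pre-gating idea as I read it (not the authors' implementation): the gate for block i+1 is computed from block i's activations, so the experts needed next can be prefetched from CPU to GPU while block i is still running. All class and parameter names here are assumptions.

```python
# Illustrative pre-gated MoE block (conceptual only).
import torch
import torch.nn as nn

class PreGatedMoEBlock(nn.Module):
    def __init__(self, d_model=64, n_experts=8, top_k=2):
        super().__init__()
        self.experts = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(n_experts)])
        self.next_gate = nn.Linear(d_model, n_experts)  # predicts experts for the NEXT block
        self.top_k = top_k

    def forward(self, x, active_experts):
        # Mix only the experts that were chosen one block earlier.
        out = torch.stack([self.experts[i](x) for i in active_experts], dim=0).mean(dim=0)
        # Pre-gate: decide which experts the next block will need.
        scores = self.next_gate(x).mean(dim=0)           # pooled over tokens
        next_active = scores.topk(self.top_k).indices.tolist()
        # In a real system, next_active would trigger an async CPU->GPU prefetch here.
        return out, next_active

x = torch.randn(10, 64)
block = PreGatedMoEBlock()
y, nxt = block(x, active_experts=[0, 1])
```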
+
+
+
+
+ + ☆ Ensembling Uncertainty Measures to Improve Safety of Black-Box + Classifiers ECAI23 + + +
+ Machine Learning (ML) algorithms that perform classification may predict the +wrong class, experiencing misclassifications. It is well-known that +misclassifications may have cascading effects on the encompassing system, +possibly resulting in critical failures. This paper proposes SPROUT, a Safety +wraPper thROugh ensembles of UncertainTy measures, which suspects +misclassifications by computing uncertainty measures on the inputs and outputs +of a black-box classifier. If a misclassification is detected, SPROUT blocks +the propagation of the output of the classifier to the encompassing system. The +resulting impact on safety is that SPROUT transforms erratic outputs +(misclassifications) into data omission failures, which can be easily managed +at the system level. SPROUT has a broad range of applications as it fits binary +and multi-class classification, comprising image and tabular datasets. We +experimentally show that SPROUT consistently identifies a large fraction of the +misclassifications of supervised classifiers, and it is able to detect all +misclassifications in specific cases. The SPROUT implementation contains +pre-trained wrappers; it is publicly available and ready to be deployed with +minimal effort. +
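The wrapper pattern described above can be illustrated with a short sketch: several uncertainty measures are computed on the classifier's output distribution, and the prediction is suppressed (turned into an omission) when the measures jointly suspect a misclassification. The specific measures and thresholds below are assumptions, not the SPROUT code.

```python
# Illustrative safety wrapper around a black-box probabilistic classifier.
import numpy as np

def max_prob(p):   return 1.0 - p.max()                      # low confidence
def entropy(p):    return float(-(p * np.log(p + 1e-12)).sum())
def margin(p):
    top2 = np.sort(p)[-2:]
    return 1.0 - (top2[1] - top2[0])                         # small top-2 gap

def safe_predict(predict_proba, x, thresholds=(0.5, 1.0, 0.7)):
    p = predict_proba(x)                                     # 1-D probability vector
    measures = (max_prob(p), entropy(p), margin(p))
    suspicious = sum(m > t for m, t in zip(measures, thresholds))
    if suspicious >= 2:        # simple majority vote over uncertainty measures
        return None            # data omission instead of a possibly wrong class
    return int(p.argmax())
```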
+
comment: To appear at ECAI 2023 in October 2023
+
+
+
+
+ + ☆ HarvestNet: A Dataset for Detecting Smallholder Farming Activity Using + Harvest Piles and Remote Sensing + + +
+ Small farms contribute to a large share of the productive land in developing +countries. In regions such as sub-Saharan Africa, where 80% of farms are small +(under 2 ha in size), the task of mapping smallholder cropland is an important +part of tracking sustainability measures such as crop productivity. However, +the visually diverse and nuanced appearance of small farms has limited the +effectiveness of traditional approaches to cropland mapping. Here we introduce +a new approach based on the detection of harvest piles characteristic of many +smallholder systems throughout the world. We present HarvestNet, a dataset for +mapping the presence of farms in the Ethiopian regions of Tigray and Amhara +during 2020-2023, collected using expert knowledge and satellite images, +totaling 7k hand-labeled images and 2k ground collected labels. We also +benchmark a set of baselines including SOTA models in remote sensing, with our +best models achieving around 80% classification performance on hand-labelled data +and 90% and 98% accuracy on ground truth data for Tigray and Amhara, respectively. We +also perform a visual comparison with a widely used pre-existing coverage map +and show that our model detects an extra 56,621 hectares of cropland in Tigray. +We conclude that remote sensing of harvest piles can contribute to more timely +and accurate cropland assessments in food insecure regions. +
+
+ comment: 18 pages, 22 figures +
+
+
+
+
+ + ☆ Manipulating Embeddings of Stable Diffusion Prompts + + +
+ Generative text-to-image models such as Stable Diffusion allow users to +generate images based on a textual description, the prompt. Changing the prompt +is still the primary means for the user to change a generated image as desired. +However, changing the image by reformulating the prompt remains a difficult +process of trial and error, which has led to the emergence of prompt +engineering as a new field of research. We propose and analyze methods to +change the embedding of a prompt directly instead of the prompt text. It allows +for more fine-grained and targeted control that takes into account user +intentions. Our approach treats the generative text-to-image model as a +continuous function and passes gradients between the image space and the prompt +embedding space. By addressing different user interaction problems, we can +apply this idea in three scenarios: (1) Optimization of a metric defined in +image space that could measure, for example, image style. (2) Assistance of +users in creative tasks by enabling them to navigate the image space along a +selection of directions of "near" prompt embeddings. (3) Changing the embedding +of the prompt to include information that the user has seen in a particular +seed but finds difficult to describe in the prompt. Our experiments demonstrate +the feasibility of the described methods. + +
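A hedged sketch of the general idea of gradient-based prompt-embedding editing: treat the generator as a differentiable map from embedding space to image space and optimize the embedding against an image-space metric. Both `generate` and `style_score` below are placeholders I am assuming exist, not a real Stable Diffusion API.

```python
# Conceptual prompt-embedding optimization (placeholders, not a real pipeline).
import torch

def optimize_embedding(embedding, generate, style_score, steps=50, lr=0.01):
    emb = embedding.clone().detach().requires_grad_(True)
    opt = torch.optim.Adam([emb], lr=lr)
    for _ in range(steps):
        image = generate(emb)          # embedding space -> image space (differentiable)
        loss = -style_score(image)     # maximize the chosen image-space metric
        opt.zero_grad()
        loss.backward()                # gradients flow back into the prompt embedding
        opt.step()
    return emb.detach()
```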
+
+
+
+
+ + ☆ Sample Complexity of Robust Learning against Evasion Attacks + + +
+ It is becoming increasingly important to understand the vulnerability of +machine learning models to adversarial attacks. One of the fundamental problems +in adversarial machine learning is to quantify how much training data is needed +in the presence of evasion attacks, where data is corrupted at test time. In +this thesis, we work with the exact-in-the-ball notion of robustness and study +the feasibility of adversarially robust learning from the perspective of +learning theory, considering sample complexity. + We first explore the setting where the learner has access to random examples +only, and show that distributional assumptions are essential. We then focus on +learning problems with distributions on the input data that satisfy a Lipschitz +condition and show that robustly learning monotone conjunctions has sample +complexity at least exponential in the adversary's budget (the maximum number +of bits it can perturb on each input). However, if the adversary is restricted +to perturbing $O(\log n)$ bits, then one can robustly learn conjunctions and +decision lists w.r.t. log-Lipschitz distributions. + We then study learning models where the learner is given more power. We first +consider local membership queries, where the learner can query the label of +points near the training sample. We show that, under the uniform distribution, +the exponential dependence on the adversary's budget to robustly learn +conjunctions remains inevitable. We then introduce a local equivalence query +oracle, which returns whether the hypothesis and target concept agree in a +given region around a point in the training sample, and a counterexample if it +exists. We show that if the query radius is equal to the adversary's budget, we +can develop robust empirical risk minimization algorithms in the +distribution-free setting. We give general query complexity upper and lower +bounds, as well as for concrete concept classes. + +
+
+ comment: DPhil (PhD) Thesis - University of Oxford +
+
+
+
+
+ + ☆ Layer-wise Feedback Propagation + + +
+ In this paper, we present Layer-wise Feedback Propagation (LFP), a novel +training approach for neural-network-like predictors that utilizes +explainability, specifically Layer-wise Relevance Propagation(LRP), to assign +rewards to individual connections based on their respective contributions to +solving a given task. This differs from traditional gradient descent, which +updates parameters towards anestimated loss minimum. LFP distributes a reward +signal throughout the model without the need for gradient computations. It then +strengthens structures that receive positive feedback while reducingthe +influence of structures that receive negative feedback. We establish the +convergence of LFP theoretically and empirically, and demonstrate its +effectiveness in achieving comparable performance to gradient descent on +various models and datasets. Notably, LFP overcomes certain limitations +associated with gradient-based methods, such as reliance on meaningful +derivatives. We further investigate how the different LRP-rules can be extended +to LFP, what their effects are on training, as well as potential applications, +such as training models with no meaningful derivatives, e.g., step-function +activated Spiking Neural Networks (SNNs), or for transfer learning, to +efficiently utilize existing knowledge. + +
+
+
+
+
+ + ☆ A multiobjective continuation method to compute the regularization path + of deep neural networks + + +
+ Sparsity is a highly desired feature in deep neural networks (DNNs) since it +ensures numerical efficiency, improves the interpretability of models (due to +the smaller number of relevant features), and improves robustness. In machine learning +approaches based on linear models, it is well known that there exists a +connecting path between the sparsest solution in terms of the $\ell^1$ norm +(i.e., zero weights) and the non-regularized solution, which is called the +regularization path. Very recently, there was a first attempt to extend the +concept of regularization paths to DNNs by means of treating the empirical loss +and sparsity ($\ell^1$ norm) as two conflicting criteria and solving the +resulting multiobjective optimization problem. However, due to the +non-smoothness of the $\ell^1$ norm and the high number of parameters, this +approach is not very efficient from a computational perspective. To overcome +this limitation, we present an algorithm that allows for the approximation of +the entire Pareto front for the above-mentioned objectives in a very efficient +manner. We present numerical examples using both deterministic and stochastic +gradients. We furthermore demonstrate that knowledge of the regularization path +allows for a well-generalizing network parametrization. +
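To make the notion of a regularization path for a DNN concrete, here is a crude continuation-style sweep (my own simplification, not the paper's algorithm): the weighting between the empirical loss and the $\ell^1$ norm is increased step by step, and each run is warm-started from the previous solution, tracing an approximate Pareto front.

```python
# Naive regularization-path sweep via warm-started scalarization (illustrative).
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(20, 32), nn.ReLU(), nn.Linear(32, 1))
X, y = torch.randn(256, 20), torch.randn(256, 1)

path = []
for lam in torch.linspace(0.0, 0.1, steps=11):       # continuation in the L1 weight
    opt = torch.optim.SGD(model.parameters(), lr=1e-2)
    for _ in range(200):
        loss = nn.functional.mse_loss(model(X), y)
        l1 = sum(p.abs().sum() for p in model.parameters())
        opt.zero_grad()
        (loss + lam * l1).backward()
        opt.step()
    path.append((lam.item(), loss.item(), l1.item())) # one point on the trade-off front
```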
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ IncreLoRA: Incremental Parameter Allocation Method for + Parameter-Efficient Fine-tuning + + +
+ With the increasing size of pre-trained language models (PLMs), fine-tuning +all the parameters in the model is not efficient, especially when there are a +large number of downstream tasks, which incur significant training and storage +costs. Many parameter-efficient fine-tuning (PEFT) approaches have been +proposed, among which, Low-Rank Adaptation (LoRA) is a representative approach +that injects trainable rank decomposition matrices into every target module. +Yet LoRA ignores the importance of parameters in different modules. To address +this problem, many works have been proposed to prune the parameters of LoRA. +However, under limited training conditions, the upper bound of the rank of the +pruned parameter matrix is still affected by the preset values. We, therefore, +propose IncreLoRA, an incremental parameter allocation method that adaptively +adds trainable parameters during training based on the importance scores of +each module. This approach is different from the pruning method as it is not +limited by the initial number of training parameters, and each parameter matrix +has a higher rank upper bound for the same training overhead. We conduct +extensive experiments on GLUE to demonstrate the effectiveness of IncreLoRA. +The results show that our method achieves higher parameter efficiency, especially +under low-resource settings, where it significantly outperforms +the baselines. Our code is publicly available. +
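The mechanics of "growing" a low-rank adapter can be sketched with a small toy module: rank-1 factors are appended over time, and a crude importance proxy decides which modules deserve the next allocation. This is an assumption-laden illustration in the spirit of the abstract, not the IncreLoRA code.

```python
# Toy growable LoRA-style linear layer (illustrative only).
import torch
import torch.nn as nn

class GrowableLoRALinear(nn.Module):
    def __init__(self, base: nn.Linear):
        super().__init__()
        self.base = base.requires_grad_(False)   # frozen pre-trained weight
        self.A = nn.ParameterList()              # rank-1 factors, grown over time
        self.B = nn.ParameterList()

    def grow(self):
        in_f, out_f = self.base.in_features, self.base.out_features
        self.A.append(nn.Parameter(torch.randn(1, in_f) * 0.01))
        self.B.append(nn.Parameter(torch.zeros(out_f, 1)))

    def forward(self, x):
        y = self.base(x)
        for a, b in zip(self.A, self.B):
            y = y + x @ a.t() @ b.t()            # rank-1 update per grown component
        return y

    def importance(self):
        # Crude importance proxy: magnitude of the accumulated LoRA update.
        return sum((b @ a).abs().sum().item() for a, b in zip(self.A, self.B)) or 0.0

layer = GrowableLoRALinear(nn.Linear(16, 16))
layer.grow()                                     # allocate one more rank unit
```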
+
+
+
+
+ + ☆ CACTUS: a Comprehensive Abstraction and Classification Tool for + Uncovering Structures + + +
+ The availability of large data sets is providing an impetus for driving +current artificial intelligence developments. There are, however, challenges in +developing solutions with small data sets, due to the need for practical and cost-effective +deployment and the opacity of deep learning models. The Comprehensive +Abstraction and Classification Tool for Uncovering Structures called CACTUS is +presented for improved secure analytics by effectively employing explainable +artificial intelligence. It provides additional support for categorical +attributes, preserving their original meaning, optimising memory usage, and +speeding up the computation through parallelisation. It shows the user the +frequency of the attributes in each class and ranks them by their +discriminative power. Its performance is assessed by application to the +Wisconsin diagnostic breast cancer and Thyroid0387 data sets. +
+
+
+
+
+ + ☆ Prompt-Based Length Controlled Generation with Reinforcement Learning + + +
+ Recently, large language models (LLMs) like ChatGPT and GPT-4 have attracted +great attention given their surprising performance improvements. Length-controlled +generation for LLMs has emerged as an important topic, since it enables +users to fully leverage the capability of LLMs in more real-world scenarios +like generating a proper answer or essay of a desired length. In addition, +autoregressive generation in LLMs is extremely time-consuming, while the +ability to control the generated length can arbitrarily reduce the +inference cost by limiting the length, and thus satisfy different needs. +Therefore, we propose a prompt-based length control method to achieve +this length-controlled generation, which can also be widely applied to +GPT-style LLMs. In particular, we adopt reinforcement learning with the reward +signal given by either a trainable or a rule-based reward model, which steers +the generation of LLMs toward a pre-defined target length. +Experiments show that our method significantly improves the accuracy of +prompt-based length control for the summarization task on popular datasets like +CNNDM and NYT. We believe this length-controllable ability opens up more +potential applications in the era of LLMs. +
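A rule-based reward for length control is simple enough to sketch directly; the exact reward shape used in the paper may differ, and the tolerance value below is an assumption.

```python
# Hedged sketch of a rule-based length reward for RL fine-tuning.
def length_reward(generated_text: str, target_len: int, tolerance: int = 5) -> float:
    n = len(generated_text.split())
    gap = abs(n - target_len)
    return 1.0 if gap <= tolerance else max(0.0, 1.0 - gap / target_len)

prompt = "Summarize the article in about 50 words: ..."
# In RL fine-tuning (e.g., PPO), length_reward(sample, 50) would be added to the
# task reward computed for each sampled summary.
print(length_reward("word " * 48, target_len=50))   # within tolerance -> 1.0
```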
+
+
+
+
+ + ☆ A Scale-Invariant Task Balancing Approach for Multi-Task Learning + + +
+ Multi-task learning (MTL), a learning paradigm to learn multiple related +tasks simultaneously, has achieved great success in various fields. However, +task-balancing remains a significant challenge in MTL, with the disparity in +loss/gradient scales often leading to performance compromises. In this paper, +we propose a Scale-Invariant Multi-Task Learning (SI-MTL) method to alleviate +the task-balancing problem from both loss and gradient perspectives. +Specifically, SI-MTL contains a logarithm transformation which is performed on +all task losses to ensure scale invariance at the loss level, and a gradient +balancing method, SI-G, which normalizes all task gradients to the same +magnitude as the maximum gradient norm. Extensive experiments conducted on +several benchmark datasets consistently demonstrate the effectiveness of SI-G +and the state-of-the-art performance of SI-MTL. +
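The two ingredients described above translate fairly directly into code: log-transform each task loss, and rescale every per-task gradient to the magnitude of the largest per-task gradient norm before summing. The sketch below is my reading of the abstract, not the authors' implementation.

```python
# Illustrative SI-MTL-style update step (log-losses + gradient norm equalization).
import torch

def si_mtl_step(model, task_losses, optimizer):
    # task_losses: list of scalar loss tensors computed from the current batch.
    params = [p for p in model.parameters() if p.requires_grad]
    grads = []
    for loss in task_losses:
        g = torch.autograd.grad(torch.log(loss + 1e-8), params, retain_graph=True)
        grads.append(torch.cat([gi.flatten() for gi in g]))
    max_norm = max(g.norm() for g in grads)
    total = sum(g * (max_norm / (g.norm() + 1e-12)) for g in grads)  # SI-G-style rescaling
    # Write the combined gradient back into the parameters and step.
    optimizer.zero_grad()
    offset = 0
    for p in params:
        n = p.numel()
        p.grad = total[offset:offset + n].view_as(p).clone()
        offset += n
    optimizer.step()
```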
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Bias-Aware Minimisation: Understanding and Mitigating Estimator Bias in + Private SGD + + +
+ Differentially private SGD (DP-SGD) holds the promise of enabling the safe +and responsible application of machine learning to sensitive datasets. However, +DP-SGD only provides a biased, noisy estimate of a mini-batch gradient. This +renders optimisation steps less effective and limits model utility as a result. +With this work, we show a connection between per-sample gradient norms and the +estimation bias of the private gradient oracle used in DP-SGD. Here, we propose +Bias-Aware Minimisation (BAM) that allows for the provable reduction of private +gradient estimator bias. We show how to efficiently compute quantities needed +for BAM to scale to large neural networks and highlight similarities to closely +related methods such as Sharpness-Aware Minimisation. Finally, we provide +empirical evidence that BAM not only reduces bias but also substantially +improves privacy-utility trade-offs on the CIFAR-10, CIFAR-100, and ImageNet-32 +datasets. + +
+
+ comment: Accepted to the 2023 Theory and Practice of Differential Privacy + (TPDP) Workshop +
+
+
+
+
+ + ☆ MKL-$L_{0/1}$-SVM + + +
+ This paper presents a Multiple Kernel Learning (abbreviated as MKL) framework +for the Support Vector Machine (SVM) with the $(0, 1)$ loss function. Some +first-order optimality conditions are given and then exploited to develop a +fast ADMM solver to deal with the nonconvex and nonsmooth optimization problem. +Extensive numerical experiments on synthetic and real datasets show that the +performance of our MKL-$L_{0/1}$-SVM is comparable with the one of the leading +approaches called SimpleMKL developed by Rakotomamonjy, Bach, Canu, and +Grandvalet [Journal of Machine Learning Research, vol. 9, pp. 2491-2521, 2008]. + +
+
+ comment: 25 pages in the JMLR template, 4 figures, and 2 tables. arXiv admin + note: substantial text overlap with arXiv:2303.04445 +
+
+
+
+
+ + ☆ Quantum-Noise-driven Generative Diffusion Models + + +
+ Generative models realized with machine learning techniques are powerful +tools to infer complex and unknown data distributions from a finite number of +training samples in order to produce new synthetic data. Diffusion models are +an emerging framework that has recently surpassed the performance of +generative adversarial networks in creating synthetic text and high-quality +images. Here, we propose and discuss the quantum generalization of diffusion +models, i.e., three quantum-noise-driven generative diffusion models that could +be experimentally tested on real quantum systems. The idea is to harness unique +quantum features, in particular the non-trivial interplay among coherence, +entanglement and noise that currently available noisy quantum processors +unavoidably suffer from, in order to overcome the main computational burdens of +classical diffusion models during inference. Hence, we suggest exploiting +quantum noise not as an issue to be detected and corrected, but instead as a +remarkably beneficial key ingredient to generate much more complex probability +distributions that would be difficult or even impossible to express +classically, and from which a quantum processor might sample more efficiently +than a classical one. Therefore, our results are expected to pave the way for +new quantum-inspired or quantum-based generative diffusion algorithms +that address classical tasks such as data generation/prediction more powerfully, with +widespread real-world applications ranging from climate forecasting to +neuroscience, from traffic flow analysis to financial forecasting. +
+
+ comment: 13 pages, 2 figures +
+
+
+
+
+ + ☆ Neural oscillators for magnetic hysteresis modeling + + +
+ Hysteresis is a ubiquitous phenomenon in science and engineering; its +modeling and identification are crucial for understanding and optimizing the +behavior of various systems. We develop an ordinary differential equation-based +recurrent neural network (RNN) approach to model and quantify the hysteresis, +which manifests itself in sequentiality and history-dependence. Our neural +oscillator, HystRNN, draws inspiration from coupled-oscillatory RNN and +phenomenological hysteresis models to update the hidden states. The performance +of HystRNN is evaluated to predict generalized scenarios, involving first-order +reversal curves and minor loops. The findings show the ability of HystRNN to +generalize its behavior to previously untrained regions, an essential feature +that hysteresis models must have. This research highlights the advantage of +neural oscillators over the traditional RNN-based methods in capturing complex +hysteresis patterns in magnetic materials, where traditional rate-dependent +methods are inadequate to capture intrinsic nonlinearity. + +
+
+
+
+
+ + ☆ On Uniformly Optimal Algorithms for Best Arm Identification in Two-Armed + Bandits with Fixed Budget + + +
+ We study the problem of best-arm identification with fixed budget in +stochastic two-arm bandits with Bernoulli rewards. We prove that surprisingly, +there is no algorithm that (i) performs as well as the algorithm sampling each +arm equally (this algorithm is referred to as the {\it uniform sampling} +algorithm) on all instances, and that (ii) strictly outperforms this algorithm +on at least one instance. In short, there is no algorithm better than the +uniform sampling algorithm. Towards this result, we introduce the natural class +of {\it consistent} and {\it stable} algorithms, and show that any algorithm +that performs as well as the uniform sampling algorithm on all instances +belongs to this class. The proof is completed by deriving a lower bound on the +error rate satisfied by any consistent and stable algorithm, and by showing +that the uniform sampling algorithm matches this lower bound. Our results +provide a solution to the two open problems presented in \cite{qin2022open}. + +
+
+
+
+
+ + ☆ Relational Concept Based Models + + +
+ The design of interpretable deep learning models working in relational +domains poses an open challenge: interpretable deep learning methods, such as +Concept-Based Models (CBMs), are not designed to solve relational problems, +while relational models are not as interpretable as CBMs. To address this +problem, we propose Relational Concept-Based Models, a family of relational +deep learning methods providing interpretable task predictions. Our +experiments, ranging from image classification to link prediction in knowledge +graphs, show that relational CBMs (i) match generalization performance of +existing relational black-boxes (as opposed to non-relational CBMs), (ii) +support the generation of quantified concept-based explanations, (iii) +effectively respond to test-time interventions, and (iv) withstand demanding +settings including out-of-distribution scenarios, limited training data +regimes, and scarce concept supervisions. + +
+
+
+
+
+ + ☆ Will More Expressive Graph Neural Networks do Better on Generative + Tasks? + + +
+ Graph generation poses a significant challenge as it involves predicting a +complete graph with multiple nodes and edges based on simply a given label. +This task also carries fundamental importance to numerous real-world +applications, including de-novo drug and molecular design. In recent years, +several successful methods have emerged in the field of graph generation. +However, these approaches suffer from two significant shortcomings: (1) the +underlying Graph Neural Network (GNN) architectures used in these methods are +often underexplored; and (2) these methods are often evaluated on only a +limited number of metrics. To fill this gap, we investigate the expressiveness +of GNNs under the context of the molecular graph generation task, by replacing +the underlying GNNs of graph generative models with more expressive GNNs. +Specifically, we analyse the performance of six GNNs in two different +generative frameworks (GCPN and GraphAF), on six different molecular generative +objectives on the ZINC-250k dataset. Through our extensive experiments, we +demonstrate that advanced GNNs can indeed improve the performance of GCPN and +GraphAF on molecular generation tasks, but GNN expressiveness is not a +necessary condition for a good GNN-based generative model. Moreover, we show +that GCPN and GraphAF with advanced GNNs can achieve state-of-the-art results +across 17 other non-GNN-based graph generative approaches, such as variational +autoencoders and Bayesian optimisation models, on the proposed molecular +generative objectives (DRD2, Median1, Median2), which are important metrics for +de-novo molecular design. + +
+
+
+
+
+ + ☆ Approximating Score-based Explanation Techniques Using Conformal + Regression + + +
+ Score-based explainable machine-learning techniques are often used to +understand the logic behind black-box models. However, such explanation +techniques are often computationally expensive, which limits their application +in time-critical contexts. Therefore, we propose and investigate the use of +computationally less costly regression models for approximating the output of +score-based explanation techniques, such as SHAP. Moreover, validity guarantees +for the approximated values are provided by the employed inductive conformal +prediction framework. We propose several non-conformity measures designed to +take the difficulty of approximating the explanations into account while +keeping the computational cost low. We present results from a large-scale +empirical investigation, in which the approximate explanations generated by our +proposed models are evaluated with respect to efficiency (interval size). The +results indicate that the proposed method can significantly improve execution +time compared to the fast version of SHAP, TreeSHAP. The results also suggest +that the proposed method can produce tight intervals, while providing validity +guarantees. Moreover, the proposed approach allows for comparing explanations +of different approximation methods and selecting a method based on how +informative (tight) the predicted intervals are. +
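The general recipe can be illustrated with a small sketch (my simplification, using a plain absolute-residual nonconformity score rather than the paper's measures): fit a cheap regressor to precomputed explanation scores, then use split conformal prediction to attach an interval with a coverage guarantee to each approximated explanation.

```python
# Split-conformal approximation of precomputed explanation scores (illustrative).
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def conformal_explainer(X_train, shap_train, X_cal, shap_cal, alpha=0.1):
    reg = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, shap_train)
    residuals = np.abs(shap_cal - reg.predict(X_cal))          # nonconformity scores
    q_level = min(np.ceil((1 - alpha) * (len(residuals) + 1)) / len(residuals), 1.0)
    q = np.quantile(residuals, q_level)
    def predict_with_interval(X):
        mid = reg.predict(X)
        return mid, mid - q, mid + q                           # point estimate + conformal band
    return predict_with_interval
```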
+
+ comment: 20 pages, 14 figures, The 12th Symposium on Conformal and + Probabilistic Prediction with Applications (COPA 2023) +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ☆ Anisotropic Hybrid Networks for liver tumor segmentation with + uncertainty quantification MICCAI + + +
+ The burden of liver tumors is substantial, ranking as the fourth leading cause +of cancer mortality. In the case of hepatocellular carcinoma (HCC), the delineation +of liver and tumor on contrast-enhanced magnetic resonance imaging (CE-MRI) is +performed to guide the treatment strategy. As this task is time-consuming, +requires high expertise and can be subject to inter-observer variability, there +is a strong need for automatic tools. However, challenges arise from the lack +of available training data, as well as the high variability in terms of image +resolution and MRI sequence. In this work we propose to compare two different +pipelines based on anisotropic models to obtain the segmentation of the liver +and tumors. The first pipeline corresponds to a baseline multi-class model that +performs the simultaneous segmentation of the liver and tumor classes. In the +second approach, we train two distinct binary models, one segmenting the liver +only and the other the tumors. Our results show that both pipelines exhibit +different strengths and weaknesses. Moreover, we propose an uncertainty +quantification strategy allowing the identification of potential false positive +tumor lesions. Both solutions were submitted to the MICCAI 2023 Atlas challenge +regarding liver and tumor segmentation. +
+
+ comment: Accepted for presentation at MICCAI Workshop on 2nd + Resource-Efficient Medical Image Analysis (REMIA) +
+
+
+
+
+ + ☆ Maintaining Plasticity via Regenerative Regularization + + +
+ In continual learning, plasticity refers to the ability of an agent to +quickly adapt to new information. Neural networks are known to lose plasticity +when processing non-stationary data streams. In this paper, we propose L2 Init, +a very simple approach for maintaining plasticity by incorporating in the loss +function L2 regularization toward initial parameters. This is very similar to +standard L2 regularization (L2), the only difference being that L2 regularizes +toward the origin. L2 Init is simple to implement and requires selecting only a +single hyper-parameter. The motivation for this method is the same as that of +methods that reset neurons or parameter values. Intuitively, when recent losses +are insensitive to particular parameters, these parameters drift toward their +initial values. This prepares parameters to adapt quickly to new tasks. On +simple problems representative of different types of nonstationarity in +continual learning, we demonstrate that L2 Init consistently mitigates +plasticity loss. We additionally find that our regularization term reduces +parameter magnitudes and maintains a high effective feature rank. + +
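The regularizer described above is simple enough to state directly; this sketch regularizes toward a snapshot of the initial parameters (the coefficient name and value are illustrative).

```python
# L2-toward-initialization penalty added to the task loss (illustrative).
import torch

def l2_init_penalty(model, init_params, lambda_init=1e-3):
    return lambda_init * sum(
        ((p - p0) ** 2).sum() for p, p0 in zip(model.parameters(), init_params)
    )

model = torch.nn.Linear(10, 2)
init_params = [p.detach().clone() for p in model.parameters()]   # snapshot at init
x, y = torch.randn(16, 10), torch.randint(0, 2, (16,))
loss = torch.nn.functional.cross_entropy(model(x), y) + l2_init_penalty(model, init_params)
loss.backward()
```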
+
+
+
+
+ + ☆ When MiniBatch SGD Meets SplitFed Learning:Convergence Analysis and + Performance Evaluation + + +
+ Federated learning (FL) enables collaborative model training across +distributed clients (e.g., edge devices) without sharing raw data. Yet, FL can +be computationally expensive as the clients need to train the entire model +multiple times. SplitFed learning (SFL) is a recent distributed approach that +alleviates computation workload at the client device by splitting the model at +a cut layer into two parts, where clients only need to train part of the model. +However, SFL still suffers from the \textit{client drift} problem when clients' +data are highly non-IID. To address this issue, we propose MiniBatch-SFL. This +algorithm incorporates MiniBatch SGD into SFL, where the clients train the +client-side model in an FL fashion while the server trains the server-side +model similar to MiniBatch SGD. We analyze the convergence of MiniBatch-SFL and +show that the bound of the expected loss can be obtained by analyzing the +expected server-side and client-side model updates, respectively. The +server-side updates do not depend on the non-IID degree of the clients' +datasets and can potentially mitigate client drift. However, the client-side +model relies on the non-IID degree and can be optimized by properly choosing +the cut layer. Perhaps counter-intuitively, our empirical results show that a +later position of the cut layer leads to a smaller average gradient divergence +and better algorithm performance. Moreover, numerical results show that +MiniBatch-SFL achieves higher accuracy than conventional SFL and FL. The +accuracy improvement can be up to 24.1\% and 17.1\% with highly non-IID data, +respectively. +
+
+
+
+
+ + ☆ Multi-scale Transformer Pyramid Networks for Multivariate Time Series + Forecasting + + +
+ Multivariate Time Series (MTS) forecasting involves modeling temporal +dependencies within historical records. Transformers have demonstrated +remarkable performance in MTS forecasting due to their capability to capture +long-term dependencies. However, prior work has been confined to modeling +temporal dependencies at either a fixed scale or multiple scales that +exponentially increase (most with base 2). This limitation hinders their +effectiveness in capturing diverse seasonalities, such as hourly and daily +patterns. In this paper, we introduce a dimension invariant embedding technique +that captures short-term temporal dependencies and projects MTS data into a +higher-dimensional space, while preserving the dimensions of time steps and +variables in MTS data. Furthermore, we present a novel Multi-scale Transformer +Pyramid Network (MTPNet), specifically designed to effectively capture temporal +dependencies at multiple unconstrained scales. The predictions are inferred +from multi-scale latent representations obtained from transformers at various +scales. Extensive experiments on nine benchmark datasets demonstrate that the +proposed MTPNet outperforms recent state-of-the-art methods. + +
+
+
+
+
+ + ☆ RamseyRL: A Framework for Intelligent Ramsey Number Counterexample + Searching AAAI2024 + + +
+ The Ramsey number is the minimum number of nodes, $n = R(s, t)$, such that +all undirected simple graphs of order $n$ contain a clique of order $s$ or an +independent set of order $t$. This paper explores the application of a best-first +search algorithm and reinforcement learning (RL) techniques to find +counterexamples to specific Ramsey numbers. We incrementally improve over prior +search methods such as random search by introducing a graph vectorization and +deep neural network (DNN)-based heuristic, which gauges the likelihood of a +graph being a counterexample. The paper also proposes algorithmic optimizations +to confine the search to a polynomial runtime. This paper does not aim to present new +counterexamples but rather introduces and evaluates a framework supporting +Ramsey counterexample exploration using other heuristics. Code and methods are +made available through a PyPI package and GitHub repository. +
+
+ comment: 8 pages, 4 figures, submitted to AAAI2024 +
+
+
+
+
+ + ☆ Audio Generation with Multiple Conditional Diffusion Model AAAI 2024 + + +
+ Text-based audio generation models have limitations as they cannot encompass +all the information in audio, leading to restricted controllability when +relying solely on text. To address this issue, we propose a novel model that +enhances the controllability of existing pre-trained text-to-audio models by +incorporating additional conditions including content (timestamp) and style +(pitch contour and energy contour) as supplements to the text. This approach +achieves fine-grained control over the temporal order, pitch, and energy of +generated audio. To preserve the diversity of generation, we employ a trainable +control condition encoder that is enhanced by a large language model and a +trainable Fusion-Net to encode and fuse the additional conditions while keeping +the weights of the pre-trained text-to-audio model frozen. Due to the lack of +suitable datasets and evaluation metrics, we consolidate existing datasets into +a new dataset comprising the audio and corresponding conditions and use a +series of evaluation metrics to evaluate the controllability performance. +Experimental results demonstrate that our model successfully achieves +fine-grained control to accomplish controllable audio generation. Audio samples +and our dataset are publicly available at +https://conditionaudiogen.github.io/conditionaudiogen/ + +
+
+ comment: Submitted to AAAI 2024 +
+
+
+
+
+ + ☆ Retail Demand Forecasting: A Comparative Study for Multivariate Time + Series + + +
+ Accurate demand forecasting in the retail industry is a critical determinant +of financial performance and supply chain efficiency. As global markets become +increasingly interconnected, businesses are turning towards advanced prediction +models to gain a competitive edge. However, existing literature mostly focuses +on historical sales data and ignores the vital influence of macroeconomic +conditions on consumer spending behavior. In this study, we bridge this gap by +enriching time series data of customer demand with macroeconomic variables, +such as the Consumer Price Index (CPI), Index of Consumer Sentiment (ICS), and +unemployment rates. Leveraging this comprehensive dataset, we develop and +compare various regression and machine learning models to predict retail demand +accurately. + +
+
+
+
+
+ + ☆ System Identification for Continuous-time Linear Dynamical Systems + + +
+ The problem of system identification for the Kalman filter, relying on the +expectation-maximization (EM) procedure to learn the underlying parameters of a +dynamical system, has largely been studied assuming that observations are +sampled at equally-spaced time points. However, in many applications this is a +restrictive and unrealistic assumption. This paper addresses system +identification for the continuous-discrete filter, with the aim of generalizing +learning for the Kalman filter by relying on a solution to a continuous-time +It\^o stochastic differential equation (SDE) for the latent state and +covariance dynamics. We introduce a novel two-filter, analytical form for the +posterior with a Bayesian derivation, which yields analytical updates which do +not require the forward-pass to be pre-computed. Using this analytical and +efficient computation of the posterior, we provide an EM procedure which +estimates the parameters of the SDE, naturally incorporating irregularly +sampled measurements. Generalizing the learning of latent linear dynamical +systems (LDS) to continuous-time may extend the use of the hybrid Kalman filter +to data which is not regularly sampled or has intermittent missing values, and +can extend the power of non-linear system identification methods such as +switching LDS (SLDS), which rely on EM for the linear discrete-time Kalman +filter as a sub-unit for learning locally linearized behavior of a non-linear +system. We apply the method by learning the parameters of a latent, +multivariate Fokker-Planck SDE representing a toggle-switch genetic circuit +using biologically realistic parameters, and compare the efficacy of learning +relative to the discrete-time Kalman filter as the step-size irregularity and +spectral-radius of the dynamics-matrix increases. + +
+
+ comment: 32 pages, 3 figures +
+
+
+
+
+ + ☆ Dynamic landslide susceptibility mapping over recent three decades to + uncover variations in landslide causes in subtropical urban mountainous areas + + +
+ Landslide susceptibility assessment (LSA) is of paramount importance in +mitigating landslide risks. Recently, there has been a surge in the utilization +of data-driven methods for predicting landslide susceptibility due to the +growing availability of aerial and satellite data. Nonetheless, the rapid +oscillations within the landslide-inducing environment (LIE), primarily due to +significant changes in external triggers such as rainfall, pose difficulties +for contemporary data-driven LSA methodologies to accommodate LIEs over diverse +timespans. This study presents dynamic landslide susceptibility mapping that +simply employs multiple predictive models for annual LSA. In practice, this +will inevitably encounter small sample problems due to the limited number of +landslide samples in certain years. Another concern is that the +majority of existing LSA approaches train black-box models to fit distinct +datasets, yet these models often fail to generalize or to provide comprehensive +explanations concerning the interactions between input features and +predictions. Accordingly, we propose to meta-learn representations with fast +adaptation ability using a few samples and gradient updates, and to apply SHAP for +each model interpretation and landslide feature permutation. Additionally, we +applied MT-InSAR for LSA result enhancement and validation. The chosen study +area is Lantau Island, Hong Kong, where we conducted a comprehensive dynamic +LSA spanning from 1992 to 2019. The model interpretation results demonstrate +that the primary factors responsible for triggering landslides in Lantau Island +are terrain slope and extreme rainfall. The results also indicate that the +variation in landslide causes can be primarily attributed to extreme rainfall +events, which result from global climate change, and the implementation of the +Landslip Prevention and Mitigation Programme (LPMitP) by the Hong Kong +government. +
+
+
+
+
+ + ☆ Solving Elliptic Optimal Control Problems using Physics Informed Neural + Networks + + +
+ In this work, we present and analyze a numerical solver for optimal control +problems (without / with box constraint) for linear and semilinear second-order +elliptic problems. The approach is based on a coupled system derived from the +first-order optimality system of the optimal control problem, and applies +physics informed neural networks (PINNs) to solve the coupled system. We +present an error analysis of the numerical scheme, and provide $L^2(\Omega)$ +error bounds on the state, control and adjoint state in terms of deep neural +network parameters (e.g., depth, width, and parameter bounds) and the number of +sampling points in the domain and on the boundary. The main tools in the +analysis include offset Rademacher complexity and boundedness and Lipschitz +continuity of neural network functions. We present several numerical examples +to illustrate the approach and compare it with three existing approaches. + +
+
+ comment: 28 pages, 5 figures +
+
+
+
+
+ + ☆ Diverse Policies Converge in Reward-free Markov Decision Processe + + +
+ Reinforcement learning has achieved great success in many decision-making +tasks, and traditional reinforcement learning algorithms are mainly designed +for obtaining a single optimal solution. However, recent works show the +importance of developing diverse policies, which makes it an emerging research +topic. Despite the variety of diversity reinforcement learning algorithms that +have emerged, none of them theoretically answer the question of how the +algorithm converges and how efficient the algorithm is. In this paper, we +provide a unified diversity reinforcement learning framework and investigate +the convergence of training diverse policies. Under such a framework, we also +propose a provably efficient diversity reinforcement learning algorithm. +Finally, we verify the effectiveness of our method through numerical +experiments. + +
+
+
+
+
+ + ☆ Audio Difference Captioning Utilizing Similarity-Discrepancy + Disentanglement + + +
+ We proposed Audio Difference Captioning (ADC) as a new extension task of +audio captioning for describing the semantic differences between input pairs of +similar but slightly different audio clips. The ADC solves the problem that +conventional audio captioning sometimes generates similar captions for similar +audio clips, failing to describe the difference in content. We also propose a +cross-attention-concentrated transformer encoder to extract differences by +comparing a pair of audio clips and a similarity-discrepancy disentanglement to +emphasize the difference in the latent space. To evaluate the proposed methods, +we built an AudioDiffCaps dataset consisting of pairs of similar but slightly +different audio clips with human-annotated descriptions of their differences. +The experiment with the AudioDiffCaps dataset showed that the proposed methods +solve the ADC task effectively and improve the attention weights to extract the +difference by visualizing them in the transformer encoder. + +
+
+ comment: Accepted to DCASE2023 Workshop +
+
+
+
+
+ + ☆ Addressing Selection Bias in Computerized Adaptive Testing: A User-Wise + Aggregate Influence Function Approach CIKM 2023 + + +
+ Computerized Adaptive Testing (CAT) is a widely used, efficient test mode +that adapts to the examinee's proficiency level in the test domain. CAT +requires pre-trained item profiles, since it iteratively assesses the student +in real time based on the registered items' profiles, and selects the next item to +administer using candidate items' profiles. However, obtaining such item +profiles is a costly process that involves gathering large, dense +item-response data, then training a diagnostic model on the collected data. In +this paper, we explore the possibility of leveraging response data collected in +the CAT service. We first show that this poses a unique challenge due to the +inherent selection bias introduced by CAT, i.e., more proficient students will +receive harder questions. Indeed, when naively training the diagnostic model +using CAT response data, we observe that item profiles deviate significantly +from the ground-truth. To tackle the selection bias issue, we propose the +user-wise aggregate influence function method. Our intuition is to filter out +users whose response data is heavily biased in an aggregate manner, as judged +by how much perturbation the added data will introduce during parameter +estimation. This way, we may enhance the performance of CAT while introducing +minimal bias to the item profiles. We provide extensive experiments to +demonstrate the superiority of our proposed method based on three public +datasets and one dataset that contains real-world CAT response data. +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ Utilizing Admissible Bounds for Heuristic Learning + + +
+ While learning a heuristic function for forward search algorithms with modern +machine learning techniques has been gaining interest in recent years, there +has been little theoretical understanding of \emph{what} they should learn, +\emph{how} to train them, and \emph{why} we do so. This lack of understanding +leads to various literature performing an ad-hoc selection of datasets +(suboptimal vs optimal costs or admissible vs inadmissible heuristics) and +optimization metrics (e.g., squared vs absolute errors). Moreover, due to the +lack of admissibility of the resulting trained heuristics, little focus has +been put on the role of admissibility \emph{during} learning. This paper +articulates the role of admissible heuristics in supervised heuristic learning +using them as parameters of Truncated Gaussian distributions, which tightens +the hypothesis space compared to ordinary Gaussian distributions. We argue that +this mathematical model faithfully follows the principle of maximum entropy and +empirically show that, as a result, it yields more accurate heuristics and +converges faster during training. + +
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Rethinking Data Perturbation and Model Stabilization for Semi-supervised + Medical Image Segmentation + + +
+ Studies on semi-supervised medical image segmentation (SSMIS) have seen fast +progress recently. Due to the limited labelled data, SSMIS methods mainly focus +on effectively leveraging unlabeled data to enhance the segmentation +performance. However, despite their promising performance, current +state-of-the-art methods often prioritize integrating complex techniques and +loss terms rather than addressing the core challenges of semi-supervised +scenarios directly. We argue that the key to SSMIS lies in generating +substantial and appropriate prediction disagreement on unlabeled data. To this +end, we emphasize the crucial role of data perturbation and model stabilization +in semi-supervised segmentation, and propose a simple yet effective approach to +boost SSMIS performance significantly, dubbed DPMS. Specifically, we first +revisit SSMIS from three distinct perspectives: the data, the model, and the +loss, and conduct a comprehensive study of corresponding strategies to examine +their effectiveness. Based on these examinations, we then propose DPMS, which +adopts a plain teacher-student framework with a standard supervised loss and +unsupervised consistency loss. To produce appropriate prediction disagreements, +DPMS perturbs the unlabeled data via strong augmentations to enlarge prediction +disagreements considerably. On the other hand, using an EMA teacher when strong +augmentation is applied does not necessarily improve performance. DPMS further +utilizes forwarding-twice and momentum-updating strategies for normalization +statistics to stabilize the training on unlabeled data effectively. Despite its +simplicity, DPMS can obtain new state-of-the-art performance on the public 2D +ACDC and 3D LA datasets across various semi-supervised settings, e.g., obtaining +a remarkable 22.62% improvement against previous SOTA on ACDC with 5% labels. +
+
+ comment: Code and logs are available at https://github.com/ZhenZHAO/DPMS +
+
+
+
+
+ + ☆ Shape-conditioned 3D Molecule Generation via Equivariant Diffusion + Models + + +
+ Ligand-based drug design aims to identify novel drug candidates of similar +shapes with known active molecules. In this paper, we formulated an in silico +shape-conditioned molecule generation problem to generate 3D molecule +structures conditioned on the shape of a given molecule. To address this +problem, we developed a translation- and rotation-equivariant shape-guided +generative model ShapeMol. ShapeMol consists of an equivariant shape encoder +that maps molecular surface shapes into latent embeddings, and an equivariant +diffusion model that generates 3D molecules based on these embeddings. +Experimental results show that ShapeMol can generate novel, diverse, drug-like +molecules that retain 3D molecular shapes similar to the given shape condition. +These results demonstrate the potential of ShapeMol in designing drug +candidates of desired 3D shapes binding to protein target pockets. + +
+
+
+
+
+ + ☆ Adversarial Training Using Feedback Loops + + +
+ Deep neural networks (DNN) have found wide applicability in numerous fields +due to their ability to accurately learn very complex input-output relations. +Despite their accuracy and extensive use, DNNs are highly susceptible to +adversarial attacks due to limited generalizability. For future progress in the +field, it is essential to build DNNs that are robust to any kind of +perturbations to the data points. In the past, many techniques have been +proposed to robustify DNNs using first-order derivative information of the +network. + This paper proposes a new robustification approach based on control theory. A +neural network architecture that incorporates feedback control, named Feedback +Neural Networks, is proposed. The controller is itself a neural network, which +is trained using regular and adversarial data such as to stabilize the system +outputs. The novel adversarial training approach based on the feedback control +architecture is called Feedback Looped Adversarial Training (FLAT). Numerical +results on standard test problems empirically show that our FLAT method is more +effective than the state-of-the-art to guard against adversarial attacks. + +
+
+
+
+
+ + ☆ SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal + Targets ICCV 2023 + + +
+ Scene understanding using multi-modal data is necessary in many applications, +e.g., autonomous navigation. To achieve this in a variety of situations, +existing models must be able to adapt to shifting data distributions without +arduous data annotation. Current approaches assume that the source data is +available during adaptation and that the source consists of paired multi-modal +data. Both these assumptions may be problematic for many applications. Source +data may not be available due to privacy, security, or economic concerns. +Assuming the existence of paired multi-modal data for training also entails +significant data collection costs and fails to take advantage of widely +available freely distributed pre-trained uni-modal models. In this work, we +relax both of these assumptions by addressing the problem of adapting a set of +models trained independently on uni-modal data to a target domain consisting of +unlabeled multi-modal data, without having access to the original source +dataset. Our proposed approach solves this problem through a switching +framework which automatically chooses between two complementary methods of +cross-modal pseudo-label fusion -- agreement filtering and entropy weighting -- +based on the estimated domain gap. We demonstrate our work on the semantic +segmentation problem. Experiments across seven challenging adaptation scenarios +verify the efficacy of our approach, achieving results comparable to, and in +some cases outperforming, methods which assume access to source data. Our +method achieves an improvement in mIoU of up to 12% over competing baselines. +Our code is publicly available at https://github.com/csimo005/SUMMIT. + +
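A high-level sketch of the switching idea as I interpret it (not the released code): pseudo-labels from two uni-modal models are fused either by agreement filtering or by entropy weighting, with the choice driven by an estimated domain gap. The gap estimate and threshold are placeholders.

```python
# Illustrative cross-modal pseudo-label fusion with a gap-based switch.
import torch

def fuse_pseudo_labels(logits_a, logits_b, domain_gap, gap_threshold=0.5):
    p_a, p_b = logits_a.softmax(-1), logits_b.softmax(-1)
    if domain_gap < gap_threshold:
        # Agreement filtering: keep only samples where both models agree.
        labels = p_a.argmax(-1)
        mask = labels == p_b.argmax(-1)
        return labels, mask
    # Entropy weighting: trust the more confident (lower-entropy) model per sample.
    ent_a = -(p_a * p_a.clamp_min(1e-12).log()).sum(-1)
    ent_b = -(p_b * p_b.clamp_min(1e-12).log()).sum(-1)
    w_a = torch.exp(-ent_a) / (torch.exp(-ent_a) + torch.exp(-ent_b))
    fused = w_a.unsqueeze(-1) * p_a + (1 - w_a).unsqueeze(-1) * p_b
    return fused.argmax(-1), torch.ones_like(ent_a, dtype=torch.bool)
```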
+
+ comment: 12 pages, 5 figures, 9 tables, ICCV 2023 +
+
+
+
+
+ + ☆ Cabrita: closing the gap for foreign languages + + +
+ The strategy of training the model from scratch in a specific language or +domain serves two essential purposes: i) enhancing performance in the +particular linguistic or domain context, and ii) ensuring effective +tokenization. The main limitation inherent to this approach lies in the +associated cost, which can reach six to seven-digit dollar values, depending on +the model size and the number of parameters involved. + The main solution to overcome the cost challenge is to rely on available +pre-trained models, which, despite recent advancements such as the LLaMA and +LLaMA-2 models, still demonstrate inefficiency for certain specific domain +problems or prove ineffective in scenarios involving conversational memory +resources, given the large number of tokens required to represent text. + To overcome this issue, we present a methodology named Cabrita, which, as our +research demonstrates, successfully addresses the performance and efficient +tokenization problem, all at an affordable cost. We believe that this +methodology can be applied to any transformer-like architecture model. To +validate the study, we conducted continuous pre-training exclusively using +Portuguese text on a 3-billion-parameter model known as OpenLLaMA, resulting in +a model named openCabrita 3B. The openCabrita 3B also features a new tokenizer +that results in a significant reduction in the number of tokens required to +represent the text. In our assessment, for few-shot learning tasks, we achieved +similar results with this 3B model compared to a traditional continuous +pre-training approach as well as to 7B English pre-trained models. +
+
+ comment: 9 pages, 1 figure +
+
+
+
+
+ + ☆ Integrating Large Language Models into the Debugging C Compiler for + generating contextual error explanations + + +
+ This paper introduces a method for Large Language Models (LLM) to produce +enhanced compiler error explanations, in simple language, within our Debugging +C Compiler (DCC). It is well documented that compiler error messages have been +known to present a barrier for novices learning how to program. Although our +initial use of DCC in introductory programming (CS1) has been instrumental in +teaching C to novice programmers by providing safeguards to commonly occurring +errors and translating the usually cryptic compiler error messages at both +compile- and run-time, we proposed that incorporating LLM-generated +explanations would further enhance the learning experience for novice +programmers. Through an expert evaluation, we observed that LLM-generated +explanations for compiler errors were conceptually accurate in 90% of +compile-time errors, and 75% of run-time errors. Additionally, the new DCC-help +tool has been increasingly adopted by students, with an average of 1047 unique +runs per week, demonstrating a promising initial assessment of using LLMs to +complement compiler output to enhance programming education for beginners. We +release our tool as open-source to the community. + +
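The general pattern of such a tool can be sketched briefly: run the compiler, capture the diagnostic, and ask an LLM for a plain-language explanation. `ask_llm` below is a placeholder for whatever chat-completion client is available; this is not the DCC implementation, and the prompt wording is an assumption.

```python
# Illustrative compiler-error explanation helper (placeholder LLM client).
import subprocess
from typing import Optional

def explain_compile_error(source_file: str, ask_llm) -> Optional[str]:
    result = subprocess.run(["gcc", "-Wall", "-c", source_file],
                            capture_output=True, text=True)
    if result.returncode == 0:
        return None                       # nothing to explain
    prompt = (
        "You are helping a novice C programmer. Explain this compiler error in "
        "simple language and suggest a likely fix, without giving a full solution:\n\n"
        + result.stderr
    )
    return ask_llm(prompt)
```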
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ KinSPEAK: Improving speech recognition for Kinyarwanda via + semi-supervised learning methods + + +
+ Despite recent availability of large transcribed Kinyarwanda speech data, +achieving robust speech recognition for Kinyarwanda is still challenging. In +this work, we show that using self-supervised pre-training, following a simple +curriculum schedule during fine-tuning and using semi-supervised learning to +leverage large unlabelled speech data significantly improve speech recognition +performance for Kinyarwanda. Our approach focuses on using public domain data +only. A new studio-quality speech dataset is collected from a public website, +then used to train a clean baseline model. The clean baseline model is then +used to rank examples from a more diverse and noisy public dataset, defining a +simple curriculum training schedule. Finally, we apply semi-supervised learning +to label and learn from large unlabelled data in four successive generations. +Our final model achieves 3.2% word error rate (WER) on the new dataset and +15.9% WER on Mozilla Common Voice benchmark, which is state-of-the-art to the +best of our knowledge. Our experiments also indicate that using syllabic rather +than character-based tokenization results in better speech recognition +performance for Kinyarwanda. + +
+
+ comment: 9 pages, 2 figures, 5 tables +
+
+
+
+
+ + ☆ Finding the Perfect Fit: Applying Regression Models to ClimateBench v1.0 + + +
+ Climate projection using data-driven machine learning models acting as
+emulators is one of the prevailing areas of research for enabling policy makers
+to make informed decisions. Using machine learning emulators as surrogates for
+computationally heavy GCM simulators reduces both time and carbon footprint. In
+this direction, ClimateBench [1] is a recently curated benchmarking dataset for
+evaluating the performance of machine learning emulators designed for climate
+data. Recent studies have reported that, despite being considered fundamental,
+regression models offer several advantages pertaining to climate emulation. In
+particular, by leveraging the kernel trick, regression models can capture
+complex relationships and improve their predictive capabilities. This study
+focuses on evaluating non-linear regression models using the aforementioned
+dataset. Specifically, we compare the emulation capabilities of three
+non-linear regression models. Among them, the Gaussian Process Regressor
+demonstrates the best-in-class performance against standard evaluation metrics
+used for climate field emulation studies. However, Gaussian Process Regression
+is computationally resource-hungry in terms of space and time complexity.
+Alternatively, Support Vector and Kernel Ridge models also deliver competitive
+results, but there are certain trade-offs to be addressed. Additionally, we are
+actively investigating the performance of composite kernels and techniques such
+as variational inference to further enhance the performance of the regression
+models and effectively model complex non-linear patterns, including phenomena
+like precipitation.
+
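+ Sketch for illustration only: the three regressors mentioned above can be
+compared with scikit-learn; the data here is synthetic, not ClimateBench, and
+the hyperparameters are arbitrary assumptions.
+
+import numpy as np
+from sklearn.gaussian_process import GaussianProcessRegressor
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.svm import SVR
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(500, 4))                      # stand-in for forcing inputs
+y = np.sin(X[:, 0]) + 0.5 * X[:, 1] ** 2 + 0.1 * rng.normal(size=500)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
+
+models = {
+    "GPR": GaussianProcessRegressor(),   # O(n^3) fit: memory/time hungry
+    "SVR": SVR(kernel="rbf", C=10.0),
+    "KRR": KernelRidge(kernel="rbf", alpha=0.1),
+}
+for name, model in models.items():
+    model.fit(X_tr, y_tr)
+    print(name, mean_squared_error(y_te, model.predict(X_te)))
+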
+
+
+
+
+ + ☆ A deep reinforcement learning approach for real-time demand-responsive + railway rescheduling to mitigate station overcrowding using mobile data + + +
+ Real-time railway rescheduling is a timely and flexible technique to
+automatically alter the operation schedule in response to time-varying
+conditions. Current research lacks data-driven approaches that capture
+real-time passenger mobility during railway disruptions, relying mostly on
+OD-based data and model-based methods for estimating train demand. Meanwhile,
+the schedule-updating principles for a long-term disruption overlook the uneven
+distribution of demand over time. To fill this gap, this paper proposes a
+demand-responsive approach by inferring real-world passenger mobility from
+mobile data (MD) to facilitate real-time rescheduling. Unlike network-level
+approaches, this paper focuses on a heavy-demand station upstream of the
+disrupted area. The objective is to reschedule all trains on multiple routes
+passing through this target station, which have been affected by a severe
+emergency event such as a natural disaster. Particular attention should be
+given to avoiding the accumulation of passengers at this station, to prevent
+additional accidents arising from overcrowding. This research addresses the
+challenges associated with this scenario, including the dynamics of passenger
+arrivals and departures, station overcrowding, rolling stock shortage,
+open-ended disruption duration, integrated rescheduling on multiple routes, and
+delays due to detours. A deep reinforcement learning (DRL) framework is
+proposed to determine the optimal rescheduled timetable, route stops, and
+rolling stock allocation, while considering real-time demand satisfaction,
+station overcrowding, train capacity utilization, and headway safety.
+
+
+ comment: 36 pages,16 figures +
+
+
+
+
+ + ☆ SEA: Shareable and Explainable Attribution for Query-based Black-box + Attacks + + +
+ Machine Learning (ML) systems are vulnerable to adversarial examples,
+particularly those from query-based black-box attacks. Despite various efforts
+to detect and prevent such attacks, there is a need for a more comprehensive
+approach to logging, analyzing, and sharing evidence of attacks. While classic
+security benefits from well-established forensics and intelligence sharing,
+Machine Learning is yet to find a way to profile its attackers and share
+information about them. In response, this paper introduces SEA, a novel ML
+security system to characterize black-box attacks on ML systems for forensic
+purposes and to facilitate human-explainable intelligence sharing. SEA
+leverages the Hidden Markov Models framework to attribute the observed query
+sequence to known attacks. It thus understands the attack's progression rather
+than just focusing on the final adversarial examples. Our evaluations reveal
+that SEA is effective at attack attribution, even on an attack's second
+occurrence, and is robust to adaptive strategies designed to evade forensic
+analysis. Interestingly, SEA's explanations of the attack behavior even allow
+us to fingerprint specific minor implementation bugs in attack libraries. For
+example, we discover that the SignOPT and Square attack implementations in ART
+v1.14 send over 50% of their queries as specific zero-difference queries. We
+thoroughly evaluate SEA in a variety of settings and demonstrate that it can
+recognize the same attack's second occurrence with 90+% Top-1 and 95+% Top-3
+accuracy.
+
+
+
+
+
+ + ☆ ${\rm E}(3)$-Equivariant Actor-Critic Methods for Cooperative + Multi-Agent Reinforcement Learning + + +
+ Identification and analysis of symmetrical patterns in the natural world have +led to significant discoveries across various scientific fields, such as the +formulation of gravitational laws in physics and advancements in the study of +chemical structures. In this paper, we focus on exploiting Euclidean symmetries +inherent in certain cooperative multi-agent reinforcement learning (MARL) +problems and prevalent in many applications. We begin by formally +characterizing a subclass of Markov games with a general notion of symmetries +that admits the existence of symmetric optimal values and policies. Motivated +by these properties, we design neural network architectures with symmetric +constraints embedded as an inductive bias for multi-agent actor-critic methods. +This inductive bias results in superior performance in various cooperative MARL +benchmarks and impressive generalization capabilities such as zero-shot +learning and transfer learning in unseen scenarios with repeated symmetric +patterns. The code is available at: https://github.com/dchen48/E3AC. + +
+
+
+
+
+ + ☆ A Survey for Federated Learning Evaluations: Goals and Measures + + +
+ Evaluation is a systematic approach to assessing how well a system achieves +its intended purpose. Federated learning (FL) is a novel paradigm for +privacy-preserving machine learning that allows multiple parties to +collaboratively train models without sharing sensitive data. However, +evaluating FL is challenging due to its interdisciplinary nature and diverse +goals, such as utility, efficiency, and security. In this survey, we first +review the major evaluation goals adopted in the existing studies and then +explore the evaluation metrics used for each goal. We also introduce FedEval, +an open-source platform that provides a standardized and comprehensive +evaluation framework for FL algorithms in terms of their utility, efficiency, +and security. Finally, we discuss several challenges and future research +directions for FL evaluation. + +
+
+
+
+
+ + ☆ A Benchmark Study on Calibration + + +
+ Deep neural networks are increasingly utilized in various machine learning +tasks. However, as these models grow in complexity, they often face calibration +issues, despite enhanced prediction accuracy. Many studies have endeavored to +improve calibration performance through data preprocessing, the use of specific +loss functions, and training frameworks. Yet, investigations into calibration +properties have been somewhat overlooked. Our study leverages the Neural +Architecture Search (NAS) search space, offering an exhaustive model +architecture space for thorough calibration properties exploration. We +specifically create a model calibration dataset. This dataset evaluates 90 +bin-based and 12 additional calibration measurements across 117,702 unique +neural networks within the widely employed NATS-Bench search space. Our +analysis aims to answer several longstanding questions in the field, using our +proposed dataset: (i) Can model calibration be generalized across different +tasks? (ii) Can robustness be used as a calibration measurement? (iii) How +reliable are calibration metrics? (iv) Does a post-hoc calibration method +affect all models uniformly? (v) How does calibration interact with accuracy? +(vi) What is the impact of bin size on calibration measurement? (vii) Which +architectural designs are beneficial for calibration? Additionally, our study +bridges an existing gap by exploring calibration within NAS. By providing this +dataset, we enable further research into NAS calibration. As far as we are +aware, our research represents the first large-scale investigation into +calibration properties and the premier study of calibration issues within NAS. + +
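+ As background for the bin-based measurements mentioned above, a common
+example is the Expected Calibration Error (ECE); the NumPy sketch below is
+illustrative and is not the dataset's exact measurement suite.
+
+import numpy as np
+
+def expected_calibration_error(confidences, predictions, labels, n_bins=15):
+    """Bin-based ECE: weighted mean |accuracy - confidence| over confidence bins."""
+    confidences = np.asarray(confidences)
+    correct = (np.asarray(predictions) == np.asarray(labels)).astype(float)
+    bins = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(bins[:-1], bins[1:]):
+        mask = (confidences > lo) & (confidences <= hi)
+        if mask.any():
+            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
+    return ece
+
+# Toy usage with made-up numbers:
+print(expected_calibration_error([0.9, 0.8, 0.6, 0.55], [1, 0, 1, 1], [1, 0, 0, 1]))
+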
+
+ comment: 39 pages, 35 figures +
+
+
+
+
+ + ♻ ☆ Tryage: Real-time, intelligent Routing of User Prompts to Large Language + Models + + +
+ The introduction of the transformer architecture and the self-attention
+mechanism has led to an explosive production of language models trained on
+specific downstream tasks and data domains. With over 200,000 models in the
+Hugging Face ecosystem, users grapple with selecting and optimizing models to
+suit multifaceted workflows and data domains while addressing computational,
+security, and recency concerns. There is an urgent need for machine learning
+frameworks that can eliminate the burden of model selection and customization
+and unleash the incredible power of the vast emerging model library for end
+users. Here, we propose a context-aware routing system, Tryage, that leverages
+a language model router for optimal selection of expert models from a model
+library based on analysis of individual input prompts. Inspired by the thalamic
+router in the brain, Tryage employs a perceptive router to predict downstream
+model performance on prompts and then makes a routing decision using an
+objective function that integrates performance predictions with user goals and
+constraints that are incorporated through flags (e.g., model size, model
+recency). Tryage allows users to explore a Pareto front and automatically
+trade off between task accuracy and secondary goals including minimization of
+model size, recency, security, verbosity, and readability. Across heterogeneous
+data sets that include code, text, clinical data, and patents, the Tryage
+framework surpasses Gorilla and GPT-3.5 Turbo in dynamic model selection,
+identifying the optimal model with an accuracy of 50.9%, compared to 23.6% by
+GPT-3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how
+routing models can be applied to program and control the behavior of
+multi-model LLM systems to maximize efficient use of the expanding and evolving
+language model ecosystem.
+
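+ A schematic of a routing objective in the spirit described above (not the
+Tryage implementation, whose router is a learned language model): predicted
+per-model accuracy is combined with user-weighted penalties such as model size
+and recency; all names and weights below are illustrative.
+
+from dataclasses import dataclass
+
+@dataclass
+class Candidate:
+    name: str
+    predicted_accuracy: float   # output of a prompt-level performance predictor
+    size_gb: float
+    months_since_release: int
+
+def route(candidates, w_size=0.01, w_recency=0.005):
+    """Pick the model maximizing predicted accuracy minus user-weighted penalties."""
+    def objective(c):
+        return (c.predicted_accuracy
+                - w_size * c.size_gb
+                - w_recency * c.months_since_release)
+    return max(candidates, key=objective)
+
+pool = [
+    Candidate("small-code-model", 0.62, 3.0, 2),
+    Candidate("large-general-model", 0.71, 60.0, 14),
+]
+print(route(pool).name)   # the smaller model wins once penalties are applied
+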
+
+
+
+
+ + ♻ ☆ Towards Interactive Reinforcement Learning with Intrinsic Feedback + + +
+ Reinforcement learning (RL) and brain-computer interfaces (BCI) have +experienced significant growth over the past decade. With rising interest in +human-in-the-loop (HITL), incorporating human input with RL algorithms has +given rise to the sub-field of interactive RL. Adjacently, the field of BCI has +long been interested in extracting informative brain signals from neural +activity for use in human-computer interactions. A key link between these +fields lies in the interpretation of neural activity as feedback such that +interactive RL approaches can be employed. We denote this new and emerging +medium of feedback as intrinsic feedback. Despite intrinsic feedback's ability +to be conveyed automatically and even unconsciously, proper exploration +surrounding this key link has largely gone unaddressed by both communities. +Thus, to help facilitate a deeper understanding and a more effective +utilization, we provide a tutorial-style review covering the motivations, +approaches, and open problems of intrinsic feedback and its foundational +concepts. + +
+
+ comment: Name change and vast rewrites of the paper +
+
+
+
+
+ + ♻ ☆ The Common Intuition to Transfer Learning Can Win or Lose: Case Studies + for Linear Regression + + +
+ We study a fundamental transfer learning process from source to target linear +regression tasks, including overparameterized settings where there are more +learned parameters than data samples. The target task learning is addressed by +using its training data together with the parameters previously computed for +the source task. We define a transfer learning approach to the target task as a +linear regression optimization with a regularization on the distance between +the to-be-learned target parameters and the already-learned source parameters. +We analytically characterize the generalization performance of our transfer +learning approach and demonstrate its ability to resolve the peak in +generalization errors in double descent phenomena of the minimum L2-norm +solution to linear regression. Moreover, we show that for sufficiently related +tasks, the optimally tuned transfer learning approach can outperform the +optimally tuned ridge regression method, even when the true parameter vector +conforms to an isotropic Gaussian prior distribution. Namely, we demonstrate +that transfer learning can beat the minimum mean square error (MMSE) solution +of the independent target task. Our results emphasize the ability of transfer +learning to extend the solution space to the target task and, by that, to have +an improved MMSE solution. We formulate the linear MMSE solution to our +transfer learning setting and point out its key differences from the common +design philosophy to transfer learning. + +
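+ A worked sketch (not from the paper) of the regularized target-task problem
+described above: minimizing $\|Xw - y\|^2 + \lambda \|w - w_{src}\|^2$ has the
+ridge-like closed form $w = (X^\top X + \lambda I)^{-1}(X^\top y + \lambda
+w_{src})$; the data and $\lambda$ below are illustrative.
+
+import numpy as np
+
+def transfer_ridge(X, y, w_source, lam):
+    """Minimize ||Xw - y||^2 + lam * ||w - w_source||^2 in closed form."""
+    d = X.shape[1]
+    A = X.T @ X + lam * np.eye(d)
+    b = X.T @ y + lam * w_source
+    return np.linalg.solve(A, b)
+
+rng = np.random.default_rng(0)
+w_true = rng.normal(size=20)
+w_src = w_true + 0.1 * rng.normal(size=20)   # a related source task
+X = rng.normal(size=(10, 20))                # overparameterized: n < d
+y = X @ w_true
+w_hat = transfer_ridge(X, y, w_src, lam=1.0)
+print(np.linalg.norm(w_hat - w_true))
+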
+
+
+
+
+ + ♻ ☆ Unsupervised Selective Labeling for More Effective Semi-Supervised + Learning ECCV 2022 + + +
+ Given an unlabeled dataset and an annotation budget, we study how to
+selectively label a fixed number of instances so that semi-supervised learning
+(SSL) on such a partially labeled dataset is most effective. We focus on
+selecting the right data to label, in addition to SSL's usual propagation of
+labels from labeled data to the remaining unlabeled data. This instance
+selection task is challenging, as without any labeled data we do not know what
+the objective of learning should be. Intuitively, no matter what the downstream
+task is, instances to be labeled must be representative and diverse: the former
+would facilitate label propagation to unlabeled data, whereas the latter would
+ensure coverage of the entire dataset. We capture this idea by selecting
+cluster prototypes, either in a pretrained feature space, or along with feature
+optimization, both without labels. Our unsupervised selective labeling
+consistently improves SSL methods over state-of-the-art active learning given
+labeled data, by 8 to 25 times in label efficiency. For example, it boosts
+FixMatch by 10% (14%) in accuracy on CIFAR-10 (ImageNet-1K) with 0.08% (0.2%)
+labeled data, demonstrating that a small amount of computation spent on
+selecting what data to label brings significant gains, especially under a low
+annotation budget. Our work sets a new standard for practical and efficient
+SSL.
+
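+ A simplified sketch of the cluster-prototype idea in a pretrained feature
+space (not the authors' full method, which can also optimize the features
+jointly): cluster the unlabeled features and label the instance nearest to each
+centroid.
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def select_prototypes(features, budget, seed=0):
+    """Pick `budget` representative-and-diverse instances to label."""
+    km = KMeans(n_clusters=budget, n_init=10, random_state=seed).fit(features)
+    chosen = []
+    for c in km.cluster_centers_:
+        chosen.append(int(np.argmin(np.linalg.norm(features - c, axis=1))))
+    return sorted(set(chosen))
+
+rng = np.random.default_rng(0)
+feats = rng.normal(size=(1000, 64))   # stand-in for pretrained embeddings
+print(select_prototypes(feats, budget=40)[:10])
+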
+
+ comment: Accepted by ECCV 2022; Fixed a few typos +
+
+
+
+
+ + ♻ ☆ Towards Top-Down Automated Development in Limited Scopes: A + Neuro-Symbolic Framework from Expressibles to Executables + + +
+ Deep code generation is a topic of deep learning for software engineering +(DL4SE), which adopts neural models to generate code for the intended +functions. Since end-to-end neural methods lack domain knowledge and software +hierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To +systematically explore the potential improvements of code generation, we let it +participate in the whole top-down development from \emph{expressibles} to +\emph{executables}, which is possible in limited scopes. In the process, it +benefits from massive samples, features, and knowledge. As the foundation, we +suggest building a taxonomy on code data, namely code taxonomy, leveraging the +categorization of code information. Moreover, we introduce a three-layer +semantic pyramid (SP) to associate text data and code data. It identifies the +information of different abstraction levels, and thus introduces the domain +knowledge on development and reveals the hierarchy of software. Furthermore, we +propose a semantic pyramid framework (SPF) as the approach, focusing on +software of high modularity and low complexity. SPF divides the code generation +process into stages and reserves spots for potential interactions. In addition, +we conceived preliminary applications in software development to confirm the +neuro-symbolic framework. + +
+
+ comment: 5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the + camera-ready version +
+
+
+
+
+ + ♻ ☆ Emergent segmentation from participation dynamics and multi-learner + retraining + + +
+ The choice to participate in a data-driven service, often made on the basis +of quality of that service, influences the ability of the service to learn and +improve. We study the participation and retraining dynamics that arise when +both the learners and sub-populations of users are \emph{risk-reducing}, which +cover a broad class of updates including gradient descent, multiplicative +weights, etc. Suppose, for example, that individuals choose to spend their time +amongst social media platforms proportionally to how well each platform works +for them. Each platform also gathers data about its active users, which it uses +to update parameters with a gradient step. For this example and for our general +class of dynamics, we show that the only asymptotically stable equilibria are +segmented, with sub-populations allocated to a single learner. Under mild +assumptions, the utilitarian social optimum is a stable equilibrium. In +contrast to previous work, which shows that repeated risk minimization can +result in representation disparity and high overall loss for a single learner +\citep{hashimoto2018fairness,miller2021outside}, we find that repeated myopic +updates with multiple learners lead to better outcomes. We illustrate the +phenomena via a simulated example initialized from real data. + +
+
+
+
+
+ + ♻ ☆ ProtoBandit: Efficient Prototype Selection via Multi-Armed Bandits + + +
+ In this work, we propose a multi-armed bandit-based framework for identifying
+a compact set of informative data instances (i.e., the prototypes) from a
+source dataset $S$ that best represents a given target set $T$. Prototypical
+examples of a given dataset offer interpretable insights into the underlying
+data distribution and assist in example-based reasoning, thereby influencing
+every sphere of human decision-making. Current state-of-the-art prototype
+selection approaches require $O(|S||T|)$ similarity comparisons between source
+and target data points, which becomes prohibitively expensive for large-scale
+settings. We propose to mitigate this limitation by employing stochastic greedy
+search in the space of prototypical examples and multi-armed bandits for
+reducing the number of similarity comparisons. Our randomized algorithm,
+ProtoBandit, identifies a set of $k$ prototypes incurring $O(k^3|S|)$
+similarity comparisons, which is independent of the size of the target set. An
+interesting outcome of our analysis is for the $k$-medoids clustering problem
+($T = S$ setting), in which we show that our algorithm ProtoBandit approximates
+the BUILD step solution of the partitioning around medoids (PAM) method in
+$O(k^3|S|)$ complexity. Empirically, we observe that ProtoBandit reduces the
+number of similarity computation calls by several orders of magnitude
+($100-1000$ times) while obtaining solutions similar in quality to those from
+state-of-the-art approaches.
+
+
+ comment: Erratum corrected +
+
+
+
+
+ + ♻ ☆ A Survey on Dataset Distillation: Approaches, Applications and Future + Directions + + +
+ Dataset distillation is attracting more attention in machine learning as +training sets continue to grow and the cost of training state-of-the-art models +becomes increasingly high. By synthesizing datasets with high information +density, dataset distillation offers a range of potential applications, +including support for continual learning, neural architecture search, and +privacy protection. Despite recent advances, we lack a holistic understanding +of the approaches and applications. Our survey aims to bridge this gap by first +proposing a taxonomy of dataset distillation, characterizing existing +approaches, and then systematically reviewing the data modalities, and related +applications. In addition, we summarize the challenges and discuss future +directions for this field of research. + +
+
+
+
+
+ + ♻ ☆ Learning Interpretable Dynamics from Images of a Freely Rotating 3D + Rigid Body + + +
+ In many real-world settings, image observations of freely rotating 3D rigid +bodies, such as satellites, may be available when low-dimensional measurements +are not. However, the high-dimensionality of image data precludes the use of +classical estimation techniques to learn the dynamics and a lack of +interpretability reduces the usefulness of standard deep learning methods. In +this work, we present a physics-informed neural network model to estimate and +predict 3D rotational dynamics from image sequences. We achieve this using a +multi-stage prediction pipeline that maps individual images to a latent +representation homeomorphic to $\mathbf{SO}(3)$, computes angular velocities +from latent pairs, and predicts future latent states using the Hamiltonian +equations of motion with a learned representation of the Hamiltonian. We +demonstrate the efficacy of our approach on a new rotating rigid-body dataset +with sequences of rotating cubes and rectangular prisms with uniform and +non-uniform density. + +
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ AudioFormer: Audio Transformer learns audio feature representations from + discrete acoustic codes + + +
+ We propose a method named AudioFormer, which learns audio feature
+representations through the acquisition of discrete acoustic codes and
+subsequently fine-tunes them for audio classification tasks. Initially, we
+introduce a novel perspective by considering the audio classification task as a
+form of natural language understanding (NLU). Leveraging an existing neural
+audio codec model, we generate discrete acoustic codes and utilize them to
+train a masked language model (MLM), thereby obtaining audio feature
+representations. Furthermore, we pioneer the integration of a Multi-Positive
+sample Contrastive (MPC) learning approach. This method enables the learning of
+joint representations among multiple discrete acoustic codes within the same
+audio input. In our experiments, we treat discrete acoustic codes as textual
+data and train a masked language model using a cloze-like methodology,
+ultimately deriving high-quality audio representations. Notably, the MPC
+learning technique effectively captures collaborative representations among
+distinct positive samples. Our research outcomes demonstrate that AudioFormer
+attains significantly improved performance compared to prevailing monomodal
+audio classification models across multiple datasets, and even outperforms
+audio-visual multimodal classification models on select datasets.
+Specifically, our approach achieves remarkable results on datasets including
+AudioSet (2M, 20K) and FSD50K, with performance scores of 53.9, 45.1, and 65.6,
+respectively. We have openly shared both the code and models:
+https://github.com/LZH-0225/AudioFormer.git.
+
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Domain Specific Question Answering Over Knowledge Graphs Using Logical + Programming and Large Language Models + + +
+ Answering questions over domain-specific graphs requires a tailored approach
+due to the limited number of relations and the specific nature of the domain.
+Our approach integrates classic logical programming languages into large
+language models (LLMs), enabling the utilization of logical reasoning
+capabilities to tackle the KGQA task. By representing the questions as Prolog
+queries, which are readable and close to natural language in representation, we
+facilitate the generation of programmatically derived answers. To validate the
+effectiveness of our approach, we evaluate it using a well-known benchmark
+dataset, MetaQA. Our experimental results demonstrate that our method achieves
+accurate identification of correct answer entities for all test questions, even
+when trained on a small fraction of annotated data. Overall, our work presents
+a promising approach to addressing question answering over domain-specific
+graphs, offering an explainable and robust solution by incorporating logical
+programming languages.
+
+
+
+
+
+ + ♻ ☆ On the link between generative semi-supervised learning and generative + open-set recognition + + +
+ This study investigates the relationship between semi-supervised learning
+(SSL, i.e., training on partially labelled datasets) and open-set recognition
+(OSR, i.e., classification with simultaneous novelty detection) under the
+context of generative adversarial networks (GANs). Although no previous study
+has formally linked SSL and OSR, their respective methods share striking
+similarities. Specifically, SSL-GANs and OSR-GANs require their generators to
+produce 'bad-looking' samples which are used to regularise their classifier
+networks. We hypothesise that the definitions of bad-looking samples in SSL and
+OSR represent the same concept and realise the same goal. More formally,
+bad-looking samples lie in the complementary space, which is the area between
+and around the boundaries of the labelled categories within the classifier's
+embedding space. By regularising a classifier with samples in the complementary
+space, classifiers achieve improved generalisation for SSL and also generalise
+the open space for OSR. To test this hypothesis, we compare a foundational
+SSL-GAN with the state-of-the-art OSR-GAN under the same SSL-OSR experimental
+conditions. Our results find that SSL-GANs achieve near-identical results to
+OSR-GANs, proving the SSL-OSR link. Subsequently, to further this new research
+path, we compare several SSL-GANs in various SSL-OSR setups, providing the
+first benchmark results for this setting. A combined framework of SSL-OSR
+certainly improves the practicality and cost-efficiency of classifier training,
+and so further theoretical and application studies are also discussed.
+
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks with unknown measurement noise + + +
+ Physics-informed neural networks (PINNs) constitute a flexible approach to +both finding solutions and identifying parameters of partial differential +equations. Most works on the topic assume noiseless data, or data contaminated +by weak Gaussian noise. We show that the standard PINN framework breaks down in +case of non-Gaussian noise. We give a way of resolving this fundamental issue +and we propose to jointly train an energy-based model (EBM) to learn the +correct noise distribution. We illustrate the improved performance of our +approach using multiple examples. + +
+
+
+
+
+ + ♻ ☆ Randomized Coordinate Subgradient Method for Nonsmooth Composite + Optimization + + +
+ Coordinate-type subgradient methods for addressing nonsmooth optimization
+problems are relatively underexplored due to the set-valued nature of the
+subdifferential. In this work, our study focuses on nonsmooth composite
+optimization problems, encompassing a wide class of convex and weakly convex
+(nonconvex nonsmooth) problems. By utilizing the chain rule of the composite
+structure properly, we introduce the Randomized Coordinate Subgradient method
+(RCS) for tackling this problem class. To the best of our knowledge, this is
+the first coordinate subgradient method for solving general nonsmooth composite
+optimization problems. In theory, we consider the linearly bounded subgradients
+assumption for the objective function, which is more general than the
+traditional Lipschitz continuity assumption, to account for practical
+scenarios. We then conduct convergence analysis for RCS in both convex and
+weakly convex cases based on this generalized Lipschitz-type assumption.
+Specifically, we establish the $\widetilde{\mathcal{O}}(1/\sqrt{k})$
+convergence rate in expectation and the $\tilde{o}(1/\sqrt{k})$ almost sure
+asymptotic convergence rate in terms of the suboptimality gap when $f$ is
+convex. For the case when $f$ is weakly convex and its subdifferential
+satisfies the global metric subregularity property, we derive the
+$\mathcal{O}(\varepsilon^{-4})$ iteration complexity in expectation. We also
+establish an asymptotic convergence result. To justify the global metric
+subregularity property utilized in the analysis, we establish this error bound
+condition for the concrete (real-valued) robust phase retrieval problem. We
+also provide a convergence lemma and the relationship between the global metric
+subregularity properties of a weakly convex function and its Moreau envelope.
+Finally, we conduct several experiments to demonstrate the possible superiority
+of RCS over the subgradient method.
+
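+ A toy instance (not the paper's general algorithm or step-size theory) of a
+randomized coordinate subgradient step for the nonsmooth composite objective
+$f(x) = \|Ax - b\|_1$, where the chain rule gives the $i$-th subgradient entry
+as $A_{:,i}^\top \mathrm{sign}(Ax - b)$; the step-size schedule is an untuned
+assumption.
+
+import numpy as np
+
+def rcs_l1_regression(A, b, n_iters=20000, step0=0.1, seed=0):
+    """Randomized coordinate subgradient steps for f(x) = ||Ax - b||_1."""
+    rng = np.random.default_rng(seed)
+    m, d = A.shape
+    x = np.zeros(d)
+    for k in range(1, n_iters + 1):
+        i = rng.integers(d)                      # sample one coordinate
+        g_i = A[:, i] @ np.sign(A @ x - b)       # chain rule: i-th subgradient entry
+        x[i] -= (step0 / np.sqrt(k)) * g_i       # diminishing step size
+    return x
+
+rng = np.random.default_rng(1)
+A = rng.normal(size=(200, 50))
+x_star = rng.normal(size=50)
+b = A @ x_star
+x_hat = rcs_l1_regression(A, b)
+print("f(0)     =", np.abs(b).sum())
+print("f(x_hat) =", np.abs(A @ x_hat - b).sum())
+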
+
+
+
+
+ + ♻ ☆ Knowledge-Aware Federated Active Learning with Non-IID Data ICCV23 + + +
+ Federated learning enables multiple decentralized clients to learn +collaboratively without sharing the local training data. However, the expensive +annotation cost to acquire data labels on local clients remains an obstacle in +utilizing local data. In this paper, we propose a federated active learning +paradigm to efficiently learn a global model with limited annotation budget +while protecting data privacy in a decentralized learning way. The main +challenge faced by federated active learning is the mismatch between the active +sampling goal of the global model on the server and that of the asynchronous +local clients. This becomes even more significant when data is distributed +non-IID across local clients. To address the aforementioned challenge, we +propose Knowledge-Aware Federated Active Learning (KAFAL), which consists of +Knowledge-Specialized Active Sampling (KSAS) and Knowledge-Compensatory +Federated Update (KCFU). KSAS is a novel active sampling method tailored for +the federated active learning problem. It deals with the mismatch challenge by +sampling actively based on the discrepancies between local and global models. +KSAS intensifies specialized knowledge in local clients, ensuring the sampled +data to be informative for both the local clients and the global model. KCFU, +in the meantime, deals with the client heterogeneity caused by limited data and +non-IID data distributions. It compensates for each client's ability in weak +classes by the assistance of the global model. Extensive experiments and +analyses are conducted to show the superiority of KSAS over the +state-of-the-art active learning methods and the efficiency of KCFU under the +federated active learning framework. + +
+
+ comment: 14 pages, 12 figures, ICCV23 +
+
+
+
+
+ + ♻ ☆ Exact Manifold Gaussian Variational Bayes + + +
+ We propose an optimization algorithm for Variational Inference (VI) in +complex models. Our approach relies on natural gradient updates where the +variational space is a Riemann manifold. We develop an efficient algorithm for +Gaussian Variational Inference that implicitly satisfies the positive definite +constraint on the variational covariance matrix. Our Exact manifold Gaussian +Variational Bayes (EMGVB) provides exact but simple update rules and is +straightforward to implement. Due to its black-box nature, EMGVB stands as a +ready-to-use solution for VI in complex models. Over five datasets, we +empirically validate our feasible approach on different statistical, +econometric, and deep learning models, discussing its performance with respect +to baseline methods. + +
+
+
+
+
+ + ♻ ☆ Estimating Driver Personality Traits from On-Road Driving Data + + +
+ This paper focuses on the estimation of a driver's psychological +characteristics using driving data for driving assistance systems. Driving +assistance systems that support drivers by adapting individual psychological +characteristics can provide appropriate feedback and prevent traffic accidents. +As a first step toward implementing such adaptive assistance systems, this +research aims to develop a model to estimate drivers' psychological +characteristics, such as cognitive function, psychological driving style, and +workload sensitivity, from on-road driving behavioral data using machine +learning and deep learning techniques. We also investigated the relationship +between driving behavior and various cognitive functions, including the Trail +Making Test (TMT) and Useful Field of View (UFOV) test, through regression +modeling. The proposed method focuses on road type information and captures +various durations of time-series data observed from driving behaviors. First, +we segment the driving time-series data into two road types, namely, arterial +roads and intersections, to consider driving situations. Second, we further +segment data into many sequences of various durations. Third, statistics are +calculated from each sequence. Finally, these statistics are used as input +features of machine learning models to estimate psychological characteristics. +The experimental results show that our model can estimate a driver's cognitive +function, namely, the TMT~(B) and UFOV test scores, with Pearson correlation +coefficients $r$ of 0.579 and 0.708, respectively. Some characteristics, such +as psychological driving style and workload sensitivity, are estimated with +high accuracy, but whether various duration segmentation improves accuracy +depends on the characteristics, and it is not effective for all +characteristics. + +
+
+
+
+
+ + ♻ ☆ Comparison of Machine Learning Methods for Assigning Software Issues to + Team Members + + +
+ Software issues contain units of work to fix, improve, or create new threads
+during development and facilitate communication among team members. Assigning
+an issue to the most relevant team member and determining the category of an
+issue are tedious and challenging tasks. Wrong classifications cause delays and
+rework in the project and trouble among the team members. This paper proposes a
+set of carefully curated linguistic features for shallow machine learning
+methods and compares the performance of shallow and ensemble methods with deep
+language models. Unlike the state-of-the-art, we assign issues to four roles
+(designer, developer, tester, and leader) rather than to specific individuals
+or teams to contribute to the generality of our solution. We also consider the
+level of experience of the developers to reflect the industrial practices in
+our solution formulation. We collect and annotate five industrial data sets
+from one of the top three global television producers to evaluate our proposal
+and compare it with deep language models. Our data sets contain 5324 issues in
+total. We show that an ensemble classifier of shallow techniques achieves an
+accuracy of 0.92 for issue assignment, which is statistically comparable to the
+state-of-the-art deep language models. The contributions include the public
+sharing of five annotated industrial issue data sets, the development of a
+clear and comprehensive feature set, the introduction of a novel label set, and
+the validation of the efficacy of an ensemble classifier of shallow machine
+learning techniques.
+
+
+
+
+
+ + ♻ ☆ Deletion and Insertion Tests in Regression Models + + +
+ A basic task in explainable AI (XAI) is to identify the most important +features behind a prediction made by a black box function $f$. The insertion +and deletion tests of Petsiuk et al. (2018) can be used to judge the quality of +algorithms that rank pixels from most to least important for a classification. +Motivated by regression problems we establish a formula for their area under +the curve (AUC) criteria in terms of certain main effects and interactions in +an anchored decomposition of $f$. We find an expression for the expected value +of the AUC under a random ordering of inputs to $f$ and propose an alternative +area above a straight line for the regression setting. We use this criterion to +compare feature importances computed by integrated gradients (IG) to those +computed by Kernel SHAP (KS) as well as LIME, DeepLIFT, vanilla gradient and +input$\times$gradient methods. KS has the best overall performance in two +datasets we consider but it is very expensive to compute. We find that IG is +nearly as good as KS while being much faster. Our comparison problems include +some binary inputs that pose a challenge to IG because it must use values +between the possible variable levels and so we consider ways to handle binary +variables in IG. We show that sorting variables by their Shapley value does not +necessarily give the optimal ordering for an insertion-deletion test. It will +however do that for monotone functions of additive models, such as logistic +regression. + +
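+ A small sketch of the deletion-test idea adapted to regression (the paper's
+exact AUC and area-above-a-line criteria are not reproduced here): features are
+replaced by a baseline value in ranked order, and the traced-out predictions
+are integrated.
+
+import numpy as np
+
+def deletion_curve(f, x, baseline, ranking):
+    """Predictions as features are deleted (set to baseline) in ranked order."""
+    x = np.array(x, dtype=float)
+    preds = [f(x)]
+    for j in ranking:                      # most to least important
+        x[j] = baseline[j]
+        preds.append(f(x))
+    return np.array(preds)
+
+def deletion_auc(preds):
+    return np.trapz(preds, dx=1.0 / (len(preds) - 1))
+
+# Toy black box and ranking:
+f = lambda x: 3.0 * x[0] + x[1] ** 2 - 0.5 * x[2]
+x0 = np.array([2.0, 1.0, 4.0])
+baseline = np.zeros(3)
+curve = deletion_curve(f, x0, baseline, ranking=[0, 1, 2])
+print(curve, deletion_auc(curve))
+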
+
+
+
+
+ + ♻ ☆ Dirac signal processing of higher-order topological signals + + +
+ Higher-order networks can sustain topological signals which are variables +associated not only to the nodes, but also to the links, to the triangles and +in general to the higher dimensional simplices of simplicial complexes. These +topological signals can describe a large variety of real systems including +currents in the ocean, synaptic currents between neurons and biological +transportation networks. In real scenarios topological signal data might be +noisy and an important task is to process these signals by improving their +signal to noise ratio. So far topological signals are typically processed +independently of each other. For instance, node signals are processed +independently of link signals, and algorithms that can enforce a consistent +processing of topological signals across different dimensions are largely +lacking. Here we propose Dirac signal processing, an adaptive, unsupervised +signal processing algorithm that learns to jointly filter topological signals +supported on nodes, links and triangles of simplicial complexes in a consistent +way. The proposed Dirac signal processing algorithm is formulated in terms of +the discrete Dirac operator which can be interpreted as "square root" of a +higher-order Hodge Laplacian. We discuss in detail the properties of the Dirac +operator including its spectrum and the chirality of its eigenvectors and we +adopt this operator to formulate Dirac signal processing that can filter noisy +signals defined on nodes, links and triangles of simplicial complexes. We test +our algorithms on noisy synthetic data and noisy data of drifters in the ocean +and find that the algorithm can learn to efficiently reconstruct the true +signals outperforming algorithms based exclusively on the Hodge Laplacian. + +
+
+ comment: (26 pages, 12 figures) +
+
+
+
+
+ + ♻ ☆ MARLlib: A Scalable and Efficient Multi-agent Reinforcement Learning + Library + + +
+ A significant challenge facing researchers in the area of multi-agent +reinforcement learning (MARL) pertains to the identification of a library that +can offer fast and compatible development for multi-agent tasks and algorithm +combinations, while obviating the need to consider compatibility issues. In +this paper, we present MARLlib, a library designed to address the +aforementioned challenge by leveraging three key mechanisms: 1) a standardized +multi-agent environment wrapper, 2) an agent-level algorithm implementation, +and 3) a flexible policy mapping strategy. By utilizing these mechanisms, +MARLlib can effectively disentangle the intertwined nature of the multi-agent +task and the learning process of the algorithm, with the ability to +automatically alter the training strategy based on the current task's +attributes. The MARLlib library's source code is publicly accessible on GitHub: +\url{https://github.com/Replicable-MARL/MARLlib}. + +
+
+
+
+
+ + ♻ ☆ Riemannian Hamiltonian methods for min-max optimization on manifolds + + +
+ In this paper, we study min-max optimization problems on Riemannian +manifolds. We introduce a Riemannian Hamiltonian function, minimization of +which serves as a proxy for solving the original min-max problems. Under the +Riemannian Polyak--{\L}ojasiewicz condition on the Hamiltonian function, its +minimizer corresponds to the desired min-max saddle point. We also provide +cases where this condition is satisfied. For geodesic-bilinear optimization in +particular, solving the proxy problem leads to the correct search direction +towards global optimality, which becomes challenging with the min-max +formulation. To minimize the Hamiltonian function, we propose Riemannian +Hamiltonian methods (RHM) and present their convergence analyses. We extend RHM +to include consensus regularization and to the stochastic setting. We +illustrate the efficacy of the proposed RHM in applications such as subspace +robust Wasserstein distance, robust training of neural networks, and generative +adversarial networks. + +
+
+
+
+
+ + ♻ ☆ Minimalist Traffic Prediction: Linear Layer Is All You Need + + +
+ Traffic prediction is essential for the progression of Intelligent
+Transportation Systems (ITS) and the vision of smart cities. While
+Spatial-Temporal Graph Neural Networks (STGNNs) have shown promise in this
+domain by leveraging Graph Neural Networks (GNNs) integrated with either RNNs
+or Transformers, they present challenges such as computational complexity,
+gradient issues, and resource-intensiveness. This paper addresses these
+challenges, advocating for three main solutions: a node-embedding approach,
+time series decomposition, and periodicity learning. We introduce STLinear, a
+minimalist model architecture designed for optimized efficiency and
+performance. Unlike traditional STGNNs, STLinear operates fully locally,
+avoiding inter-node data exchanges, and relies exclusively on linear layers,
+drastically cutting computational demands. Our empirical studies on real-world
+datasets confirm STLinear's prowess, matching or exceeding the accuracy of
+leading STGNNs, but with significantly reduced complexity and computation
+overhead (more than 95% reduction in MACs per epoch compared to the
+state-of-the-art STGNN baseline published in 2023). In summary, STLinear
+emerges as a potent, efficient alternative to conventional STGNNs, with
+profound implications for the future of ITS and smart city initiatives.
+
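+ An illustrative "linear-only" forecaster in the spirit described above
+(decompose each series into a moving-average trend plus a remainder, then fit
+one linear map per component); this is a sketch with arbitrary window sizes,
+not the STLinear architecture.
+
+import numpy as np
+
+def moving_average(x, k=25):
+    pad = np.pad(x, (k // 2, k - 1 - k // 2), mode="edge")
+    return np.convolve(pad, np.ones(k) / k, mode="valid")
+
+def fit_linear_forecaster(series, lookback=96, horizon=24, k=25):
+    """Fit separate least-squares maps for the trend and remainder components."""
+    trend = moving_average(series, k)
+    remainder = series - trend
+    Xt, Xr, Y = [], [], []
+    for t in range(len(series) - lookback - horizon + 1):
+        Xt.append(trend[t:t + lookback])
+        Xr.append(remainder[t:t + lookback])
+        Y.append(series[t + lookback:t + lookback + horizon])
+    Wt, *_ = np.linalg.lstsq(np.array(Xt), np.array(Y), rcond=None)
+    Wr, *_ = np.linalg.lstsq(np.array(Xr), np.array(Y), rcond=None)
+    return Wt, Wr, k
+
+def forecast(series, Wt, Wr, k, lookback=96):
+    trend = moving_average(series, k)[-lookback:]
+    remainder = series[-lookback:] - trend
+    return trend @ Wt + remainder @ Wr
+
+t = np.arange(2000, dtype=float)
+series = np.sin(2 * np.pi * t / 24) + 0.0005 * t   # toy periodic series + drift
+Wt, Wr, k = fit_linear_forecaster(series[:-24])
+print(np.abs(forecast(series[:-24], Wt, Wr, k) - series[-24:]).mean())
+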
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ UTRNet: High-Resolution Urdu Text Recognition In Printed Documents ICDAR 2023 + + +
+ In this paper, we propose a novel approach to address the challenges of
+printed Urdu text recognition using high-resolution, multi-scale semantic
+feature extraction. Our proposed UTRNet architecture, a hybrid CNN-RNN model,
+demonstrates state-of-the-art performance on benchmark datasets. To address the
+limitations of previous works, which struggle to generalize to the intricacies
+of the Urdu script and the lack of sufficient annotated real-world data, we
+have introduced UTRSet-Real, a large-scale annotated real-world dataset
+comprising over 11,000 lines, and UTRSet-Synth, a synthetic dataset with 20,000
+lines closely resembling real-world data. We have also made corrections to the
+ground truth of the existing IIITH dataset, making it a more reliable resource
+for future research. We also provide UrduDoc, a benchmark dataset for Urdu text
+line detection in scanned documents. Additionally, we have developed an online
+tool for end-to-end Urdu OCR from printed documents by integrating UTRNet with
+a text detection model. Our work not only addresses the current limitations of
+Urdu OCR but also paves the way for future research in this area and
+facilitates the continued advancement of Urdu OCR technology. The project page
+with source code, datasets, annotations, trained models, and online tool is
+available at abdur75648.github.io/UTRNet.
+
+
+ comment: Accepted at The 17th International Conference on Document Analysis + and Recognition (ICDAR 2023) +
+
+
+
+
+ + ♻ ☆ Forward-Backward Reasoning in Large Language Models for Verification + + +
+ Chain-of-Thought (CoT) prompting has shown promising performance in various
+reasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency}
+proposes to sample a diverse set of reasoning chains which may lead to
+different answers while the answer that receives the most votes is selected. In
+this paper, we propose a novel method to use backward reasoning in verifying
+candidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM
+to predict the masked token when a candidate answer is provided by \textit{a
+simple template}, i.e., "\textit{\textbf{If we know the answer of the above
+question is \{a candidate answer\}, what is the value of unknown variable ${\bf
+x}$?}}" Intuitively, the LLM is expected to predict the masked token
+successfully if the provided candidate answer is correct. We further propose
+FOBAR to combine forward and backward reasoning for estimating the probability
+of candidate answers. We conduct extensive experiments on six data sets and
+three LLMs. Experimental results demonstrate that FOBAR achieves
+state-of-the-art performance on various reasoning benchmarks.
+
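+ A sketch of the backward-verification template quoted above; ask_llm is a
+placeholder for the chosen LLM API, and the sampling budget and scoring rule
+are illustrative assumptions rather than the paper's exact procedure.
+
+# Backward verification: mask a number in the question with x, append the
+# template with a candidate answer, and check whether the model recovers x.
+BACKWARD_TEMPLATE = (
+    "{masked_question} If we know the answer of the above question is "
+    "{candidate}, what is the value of unknown variable x?"
+)
+
+def ask_llm(prompt: str) -> str:
+    raise NotImplementedError("placeholder for the chosen LLM API")
+
+def backward_score(masked_question, true_masked_value, candidate, n_samples=8):
+    """Fraction of sampled completions that recover the masked value."""
+    hits = 0
+    for _ in range(n_samples):
+        reply = ask_llm(BACKWARD_TEMPLATE.format(
+            masked_question=masked_question, candidate=candidate))
+        hits += str(true_masked_value) in reply
+    return hits / n_samples
+
+# Example (not executed here): mask "16" as x in a word problem, then score each
+# candidate answer by how often the model reconstructs x = 16.
+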
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Learning to Generalize towards Unseen Domains via a Content-Aware Style + Invariant Model for Disease Detection from Chest X-rays + + +
+ Performance degradation due to source domain mismatch is a longstanding
+challenge in deep learning-based medical image analysis, particularly for chest
+X-rays (CXRs). Several methods (e.g., adversarial training, multi-domain
+mixups) have been proposed to extract domain-invariant high-level features to
+address this domain shift. However, these methods do not explicitly regularize
+the content and style characteristics of the extracted domain-invariant
+features. Recent studies have demonstrated that CNN models exhibit a strong
+bias toward styles (e.g., uninformative textures) rather than content (e.g.,
+shape), in stark contrast to the human-vision system. Radiologists tend to
+learn visual cues from CXRs and thus perform well across multiple domains.
+Therefore, in medical imaging for pathology diagnosis from CXR images, models
+should extract domain-invariant features that are style-invariant and
+content-biased. Motivated by this, we employ novel style randomization modules
+(SRMs) at both the image and feature levels that work together hierarchically
+to create rich style-perturbed features on the fly while keeping the content
+intact. In addition, we leverage consistency regularization between the global
+semantic features and between the predicted probability distributions,
+respectively, of the style-perturbed and unperturbed versions of the same CXR
+image, to tweak the model's sensitivity toward content markers for accurate
+predictions. Extensive experiments with three large-scale thoracic disease
+datasets, i.e., CheXpert, MIMIC-CXR, and BRAX, demonstrate that our proposed
+framework is more robust in the presence of domain shift and achieves
+state-of-the-art performance.
+
+
+
+
+
+ + ♻ ☆ Phase-shifted Adversarial Training UAI 2023 + + +
+ Adversarial training has been considered an imperative component for safely
+deploying neural network-based applications to the real world. To achieve
+stronger robustness, existing methods primarily focus on how to generate strong
+attacks by increasing the number of update steps, regularizing the models with
+the smoothed loss function, and injecting randomness into the attack. Instead,
+we analyze the behavior of adversarial training through the lens of response
+frequency. We empirically discover that adversarial training causes neural
+networks to converge slowly on high-frequency information, resulting in highly
+oscillatory predictions near each data point. To learn high-frequency contents
+efficiently and effectively, we first prove that a universal phenomenon of the
+frequency principle, i.e., \textit{lower frequencies are learned first}, still
+holds in adversarial training. Based on that, we propose phase-shifted
+adversarial training (PhaseAT) in which the model learns high-frequency
+components by shifting these frequencies to the low-frequency range where fast
+convergence occurs. For evaluations, we conduct experiments on CIFAR-10 and
+ImageNet with an adaptive attack carefully designed for reliable evaluation.
+Comprehensive results show that PhaseAT significantly improves the convergence
+for high-frequency information. This results in improved adversarial robustness
+by enabling the model to have smoothed predictions near each data point.
+
+
+ comment: Conference on Uncertainty in Artificial Intelligence, 2023 (UAI 2023) +
+
+
+
+
+ + ♻ ☆ A Rigorous Uncertainty-Aware Quantification Framework Is Essential for + Reproducible and Replicable Machine Learning Workflows + + +
+ The ability to replicate predictions by machine learning (ML) or artificial
+intelligence (AI) models and results in scientific workflows that incorporate
+such ML/AI predictions is driven by numerous factors. An uncertainty-aware
+metric that can quantitatively assess the reproducibility of quantities of
+interest (QoI) would contribute to the trustworthiness of results obtained from
+scientific workflows involving ML/AI models. In this article, we discuss how
+uncertainty quantification (UQ) in a Bayesian paradigm can provide a general
+and rigorous framework for quantifying reproducibility for complex scientific
+workflows. Such a framework has the potential to fill a critical gap that
+currently exists in ML/AI for scientific workflows, as it will enable
+researchers to determine the impact of ML/AI model prediction variability on
+the predictive outcomes of ML/AI-powered workflows. We expect that the
+envisioned framework will contribute to the design of more reproducible and
+trustworthy workflows for diverse scientific applications, and ultimately,
+accelerate scientific discoveries.
+
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information
+from multiple views of a shared context. Recent works intuitively apply
+contrastive learning to different views in a pairwise manner, which still has
+limitations: view-specific noise is not filtered when learning view-shared
+representations; fake negative pairs, in which the negative terms actually
+belong to the same class as the positive, are treated the same as real negative
+pairs; and evenly measuring the similarities between terms might interfere with
+optimization. Importantly, few works study the theoretical framework of
+generalized self-supervised multi-view learning, especially for more than two
+views. To this end, we rethink the existing multi-view learning paradigm from
+the perspective of information theory and then propose a novel
+information-theoretical framework for generalized multi-view learning. Guided
+by it, we build a multi-view coding method with a three-tier progressive
+architecture, namely Information theory-guided hierarchical Progressive
+Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the
+distribution between views to reduce view-specific noise. In the set-tier, IPMC
+constructs self-adjusted contrasting pools, which are adaptively modified by a
+view filter. Lastly, in the instance-tier, we adopt a designed unified loss to
+learn representations and reduce the gradient interference. Theoretically and
+empirically, we demonstrate the superiority of IPMC over state-of-the-art
+methods.
+
+
+ comment: This paper is accepted by the journal Neural Networks (Elsevier) in
+ 2023. arXiv admin note: substantial text overlap with arXiv:2109.02344
+
+
+
+
+ + ♻ ☆ Designing an attack-defense game: how to increase robustness of + financial transaction models via a competition + + +
+ Given the escalating risks of malicious attacks in the finance sector and the
+consequential severe damage, a thorough understanding of adversarial strategies
+and robust defense mechanisms for machine learning models is critical. The
+threat becomes even more severe as banks increasingly adopt more accurate, but
+potentially fragile, neural networks. We aim to investigate the current state
+and dynamics of adversarial attacks and defenses for neural network models that
+use sequential financial data as the input.
+ To achieve this goal, we have designed a competition that allows realistic
+and detailed investigation of problems in modern financial transaction data.
+The participants compete directly against each other, so possible attacks and
+defenses are examined in close-to-real-life conditions. Our main contributions
+are an analysis of the competition dynamics that answers the questions of how
+important it is to conceal a model from malicious users, how long it takes to
+break it, and what techniques one should use to make it more robust, as well as
+the introduction of additional ways to attack models or increase their
+robustness.
+ Our analysis continues with a meta-study of the approaches used and their
+power, supported by numerical experiments and accompanying ablation studies. We
+show that the developed attacks and defenses outperform existing alternatives
+from the literature while being practical in terms of execution, proving the
+validity of the competition as a tool for uncovering vulnerabilities of machine
+learning models and mitigating them in various domains.
+
+
+
+
+
+ + ♻ ☆ Task Relation-aware Continual User Representation Learning KDD 2023 + + +
+ User modeling, which learns to represent users into a low-dimensional +representation space based on their past behaviors, got a surge of interest +from the industry for providing personalized services to users. Previous +efforts in user modeling mainly focus on learning a task-specific user +representation that is designed for a single task. However, since learning +task-specific user representations for every task is infeasible, recent studies +introduce the concept of universal user representation, which is a more +generalized representation of a user that is relevant to a variety of tasks. +Despite their effectiveness, existing approaches for learning universal user +representations are impractical in real-world applications due to the data +requirement, catastrophic forgetting and the limited learning capability for +continually added tasks. In this paper, we propose a novel continual user +representation learning method, called TERACON, whose learning capability is +not limited as the number of learned tasks increases while capturing the +relationship between the tasks. The main idea is to introduce an embedding for +each task, i.e., task embedding, which is utilized to generate task-specific +soft masks that not only allow the entire model parameters to be updated until +the end of training sequence, but also facilitate the relationship between the +tasks to be captured. Moreover, we introduce a novel knowledge retention module +with pseudo-labeling strategy that successfully alleviates the long-standing +problem of continual learning, i.e., catastrophic forgetting. Extensive +experiments on public and proprietary real-world datasets demonstrate the +superiority and practicality of TERACON. Our code is available at +https://github.com/Sein-Kim/TERACON. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ Regret-Based Optimization for Robust Reinforcement Learning + + +
+ Deep Reinforcement Learning (DRL) policies have been shown to be vulnerable +to small adversarial noise in observations. Such adversarial noise can have +disastrous consequences in safety-critical environments. For instance, a +self-driving car receiving adversarially perturbed sensory observations about +nearby signs (e.g., a stop sign physically altered to be perceived as a speed +limit sign) or objects (e.g., cars altered to be recognized as trees) can be +fatal. Existing approaches for making RL algorithms robust to an +observation-perturbing adversary have focused on reactive approaches that +iteratively improve against adversarial examples generated at each iteration. +While such approaches have been shown to provide improvements over regular RL +methods, they are reactive and can fare significantly worse if certain +categories of adversarial examples are not generated during training. To that +end, we pursue a more proactive approach that relies on directly optimizing a +well-studied robustness measure, regret instead of expected value. We provide a +principled approach that minimizes maximum regret over a "neighborhood" of +observations to the received "observation". Our regret criterion can be used to +modify existing value- and policy-based Deep RL methods. We demonstrate that +our approaches provide a significant improvement in performance across a wide +variety of benchmarks against leading approaches for robust Deep RL. + +
+
+
+
+
+ + ♻ ☆ BallGAN: 3D-aware Image Synthesis with a Spherical Background ICCV 2023 + + +
+ 3D-aware GANs aim to synthesize realistic 3D scenes such that they can be +rendered in arbitrary perspectives to produce images. Although previous methods +produce realistic images, they suffer from unstable training or degenerate +solutions where the 3D geometry is unnatural. We hypothesize that the 3D +geometry is underdetermined due to the insufficient constraint, i.e., being +classified as real image to the discriminator is not enough. To solve this +problem, we propose to approximate the background as a spherical surface and +represent a scene as a union of the foreground placed in the sphere and the +thin spherical background. It reduces the degree of freedom in the background +field. Accordingly, we modify the volume rendering equation and incorporate +dedicated constraints to design a novel 3D-aware GAN framework named BallGAN. +BallGAN has multiple advantages as follows. 1) It produces more reasonable 3D +geometry; the images of a scene across different viewpoints have better +photometric consistency and fidelity than the state-of-the-art methods. 2) The +training becomes much more stable. 3) The foreground can be separately rendered +on top of different arbitrary backgrounds. + +
+
+ comment: ICCV 2023, Project Page: https://minjung-s.github.io/ballgan +
+
+
+
+
+ + ♻ ☆ Self-consistency for open-ended generations + + +
+ Large Language Models (LLMs) can exhibit considerable variation in the +quality of their sampled outputs. Reranking and selecting the best generation +from the sampled set is a popular way of obtaining strong gains in generation +quality. In this paper, we present a novel approach for reranking LLM +generations. Unlike other techniques that might involve additional inferences +or training a specialized reranker, our approach relies on easy to compute +pairwise statistics between the generations that have minimal compute overhead. +We show that our approach can be formalized as an extension of self-consistency +and analyze its performance in that framework, theoretically as well as via +simulations. We show strong improvements for selecting the best $k$ generations +for code generation tasks as well as robust improvements for best generation +for the tasks of autoformalization, and summarization. While our approach only +assumes black-box access to LLMs, we show that additional access to token +probabilities can improve performance even further. + +
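+<p>
+ One plausible instantiation of the pairwise-statistics reranking described above
+(an editor's sketch, with unigram overlap standing in for whatever statistic the
+paper actually uses): sample several generations, score each by its average
+similarity to the rest, and return the most central one.
+</p>
+<pre>
+from collections import Counter
+
+def unigram_f1(a: str, b: str) -> float:
+    """A cheap pairwise statistic: unigram F1 overlap between two generations."""
+    ca, cb = Counter(a.split()), Counter(b.split())
+    overlap = sum((ca & cb).values())
+    if overlap == 0:
+        return 0.0
+    precision = overlap / max(sum(cb.values()), 1)
+    recall = overlap / max(sum(ca.values()), 1)
+    return 2 * precision * recall / (precision + recall)
+
+def select_by_consistency(generations):
+    """Return the sampled generation most similar, on average, to the others."""
+    if len(generations) == 1:
+        return generations[0]
+    def avg_sim(i):
+        sims = [unigram_f1(generations[i], g) for j, g in enumerate(generations) if j != i]
+        return sum(sims) / len(sims)
+    return generations[max(range(len(generations)), key=avg_sim)]
+
+samples = ["def add(a, b): return a + b",
+           "def add(x, y): return x + y",
+           "def add(a, b): return a - b"]
+print(select_by_consistency(samples))
+</pre>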
+
+
+
+
+ + ♻ ☆ Deep Residual Error and Bag-of-Tricks Learning for Gravitational Wave + Surrogate Modeling + + +
+ Deep learning methods have been employed in gravitational-wave astronomy to +accelerate the construction of surrogate waveforms for the inspiral of +spin-aligned black hole binaries, among other applications. We face the +challenge of modeling the residual error of an artificial neural network that +models the coefficients of the surrogate waveform expansion (especially those +of the phase of the waveform) which we demonstrate has sufficient structure to +be learnable by a second network. Adding this second network, we were able to +reduce the maximum mismatch for waveforms in a validation set by 13.4 times. We +also explored several other ideas for improving the accuracy of the surrogate +model, such as the exploitation of similarities between waveforms, the +augmentation of the training set, the dissection of the input space, using +dedicated networks per output coefficient and output augmentation. In several +cases, small improvements can be observed, but the most significant improvement +still comes from the addition of a second network that models the residual +error. Since the residual error for more general surrogate waveform models +(when e.g., eccentricity is included) may also have a specific structure, one +can expect our method to be applicable to cases where the gain in accuracy +could lead to significant gains in computational time. + +
+
+
+
+
+ + ♻ ☆ Foundation Model-oriented Robustness: Robust Image Model Evaluation with + Pretrained Models + + +
+ Machine learning has demonstrated remarkable performance over finite +datasets, yet whether the scores over the fixed benchmarks can sufficiently +indicate the model's performance in the real world is still in discussion. In +reality, an ideal robust model will probably behave similarly to the oracle +(e.g., the human users), thus a good evaluation protocol is probably to +evaluate the models' behaviors in comparison to the oracle. In this paper, we +introduce a new robustness measurement that directly measures the image +classification model's performance compared with a surrogate oracle (i.e., a +foundation model). Besides, we design a simple method that can accomplish the +evaluation beyond the scope of the benchmarks. Our method extends the image +datasets with new samples that are sufficiently perturbed to be distinct from +the ones in the original sets, but are still bounded within the same +image-label structure the original test image represents, constrained by a +foundation model pretrained with a large amount of samples. As a result, our +new method will offer us a new way to evaluate the models' robustness +performance, free of limitations of fixed benchmarks or constrained +perturbations, although scoped by the power of the oracle. In addition to the +evaluation results, we also leverage our generated data to understand the +behaviors of the model and our new evaluation strategies. + +
+
+
+
+
+ + ♻ ☆ Temporal Saliency Detection Towards Explainable Transformer-based + Timeseries Forecasting + + +
+ Despite the notable advancements in numerous Transformer-based models, the +task of long multi-horizon time series forecasting remains a persistent +challenge, especially towards explainability. Focusing on commonly used +saliency maps in explaining DNN in general, our quest is to build +attention-based architecture that can automatically encode saliency-related +temporal patterns by establishing connections with appropriate attention heads. +Hence, this paper introduces Temporal Saliency Detection (TSD), an effective +approach that builds upon the attention mechanism and applies it to +multi-horizon time series prediction. While our proposed architecture adheres +to the general encoder-decoder structure, it undergoes a significant renovation +in the encoder component, wherein we incorporate a series of information +contracting and expanding blocks inspired by the U-Net style architecture. The +TSD approach facilitates the multiresolution analysis of saliency patterns by +condensing multi-heads, thereby progressively enhancing the forecasting of +complex time series data. Empirical evaluations illustrate the superiority of +our proposed approach compared to other models across multiple standard +benchmark datasets in diverse far-horizon forecasting settings. The initial TSD +achieves substantial relative improvements of 31% and 46% over several models +in the context of multivariate and univariate prediction. We believe the +comprehensive investigations presented in this study will offer valuable +insights and benefits to future research endeavors. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Federated Learning in Big Model Era: Domain-Specific Multimodal Large + Models + + +
+ Multimodal data, which can comprehensively perceive and recognize the
+physical world, has become an essential path towards general artificial
+intelligence. However, multimodal large models trained on public datasets often
+underperform in specific industrial domains. This paper proposes a multimodal
+federated learning framework that enables multiple enterprises to utilize
+private domain data to collaboratively train large models for vertical domains,
+achieving intelligent services across scenarios. The authors discuss in depth
+the strategic transformation of federated learning in terms of intelligence
+foundation and objectives in the era of big models, as well as the new
+challenges faced in heterogeneous data, model aggregation, performance and cost
+trade-offs, data privacy, and incentive mechanisms. The paper elaborates on a
+case study of leading enterprises contributing multimodal data and expert
+knowledge to city safety operation management, including distributed deployment
+and efficient coordination of the federated learning platform, technical
+innovations on data quality improvement based on large model capabilities and
+efficient joint fine-tuning approaches. Preliminary experiments show that
+enterprises can enhance and accumulate intelligent capabilities through
+multimodal model federated learning, thereby jointly creating a smart city
+model that provides high-quality intelligent services covering energy
+infrastructure safety, residential community security, and urban operation
+management. The established federated learning cooperation ecosystem is
+expected to further aggregate industry, academia, and research resources,
+realize large models in multiple vertical domains, and promote the large-scale
+industrial application of artificial intelligence and cutting-edge research on
+multimodal federated learning.
+
+</p>
+
+
+
+
+ + ♻ ☆ Traffic Forecasting on New Roads Unseen in the Training Data Using + Spatial Contrastive Pre-Training ECML + + +
+ New roads are being constructed all the time. However, the capabilities of +previous deep forecasting models to generalize to new roads not seen in the +training data (unseen roads) are rarely explored. In this paper, we introduce a +novel setup called a spatio-temporal (ST) split to evaluate the models' +capabilities to generalize to unseen roads. In this setup, the models are +trained on data from a sample of roads, but tested on roads not seen in the +training data. Moreover, we also present a novel framework called Spatial +Contrastive Pre-Training (SCPT) where we introduce a spatial encoder module to +extract latent features from unseen roads during inference time. This spatial +encoder is pre-trained using contrastive learning. During inference, the +spatial encoder only requires two days of traffic data on the new roads and +does not require any re-training. We also show that the output from the spatial +encoder can be used effectively to infer latent node embeddings on unseen roads +during inference time. The SCPT framework also incorporates a new layer, named +the spatially gated addition (SGA) layer, to effectively combine the latent +features from the output of the spatial encoder to existing backbones. +Additionally, since there is limited data on the unseen roads, we argue that it +is better to decouple traffic signals to trivial-to-capture periodic signals +and difficult-to-capture Markovian signals, and for the spatial encoder to only +learn the Markovian signals. Finally, we empirically evaluated SCPT using the +ST split setup on four real-world datasets. The results showed that adding SCPT +to a backbone consistently improves forecasting performance on unseen roads. +More importantly, the improvements are greater when forecasting further into +the future. The codes are available on GitHub: +https://github.com/cruiseresearchgroup/forecasting-on-new-roads . + +
+
+ comment: 25 pages including reference, an additional 3 pages of appendix, 8 + figures. ECML PKDD 2023 Journal track special issue: Data Mining and + Knowledge Discovery (DAMI) +
+
+
+
+
+ + ♻ ☆ A Structured Span Selector NAACL 2022 + + +
+ Many natural language processing tasks, e.g., coreference resolution and +semantic role labeling, require selecting text spans and making decisions about +them. A typical approach to such tasks is to score all possible spans and +greedily select spans for task-specific downstream processing. This approach, +however, does not incorporate any inductive bias about what sort of spans ought +to be selected, e.g., that selected spans tend to be syntactic constituents. In +this paper, we propose a novel grammar-based structured span selection model +which learns to make use of the partial span-level annotation provided for such +problems. Compared to previous approaches, our approach gets rid of the +heuristic greedy span selection scheme, allowing us to model the downstream +task on an optimal set of spans. We evaluate our model on two popular span +prediction tasks: coreference resolution and semantic role labeling. We show +empirical improvements on both. + +
+
+ comment: NAACL 2022 camera-ready +
+
+
+
+
+ + ♻ ☆ Pruning Deep Neural Networks from a Sparsity Perspective ICLR 2023 + + +
+ In recent years, deep network pruning has attracted significant attention in +order to enable the rapid deployment of AI into small devices with computation +and memory constraints. Pruning is often achieved by dropping redundant +weights, neurons, or layers of a deep network while attempting to retain a +comparable test performance. Many deep pruning algorithms have been proposed +with impressive empirical success. However, existing approaches lack a +quantifiable measure to estimate the compressibility of a sub-network during +each pruning iteration and thus may under-prune or over-prune the model. In +this work, we propose PQ Index (PQI) to measure the potential compressibility +of deep neural networks and use this to develop a Sparsity-informed Adaptive +Pruning (SAP) algorithm. Our extensive experiments corroborate the hypothesis +that for a generic pruning procedure, PQI decreases first when a large model is +being effectively regularized and then increases when its compressibility +reaches a limit that appears to correspond to the beginning of underfitting. +Subsequently, PQI decreases again when the model collapse and significant +deterioration in the performance of the model start to occur. Additionally, our +experiments demonstrate that the proposed adaptive pruning algorithm with +proper choice of hyper-parameters is superior to the iterative pruning +algorithms such as the lottery ticket-based pruning methods, in terms of both +compression efficiency and robustness. + +
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ AdaTerm: Adaptive T-Distribution Estimated Robust Moments for + Noise-Robust Stochastic Gradient Optimization + + +
+ With the increasing practicality of deep learning applications, practitioners +are inevitably faced with datasets corrupted by noise from various sources such +as measurement errors, mislabeling, and estimated surrogate inputs/outputs that +can adversely impact the optimization results. It is a common practice to +improve the optimization algorithm's robustness to noise, since this algorithm +is ultimately in charge of updating the network parameters. Previous studies +revealed that the first-order moment used in Adam-like stochastic gradient +descent optimizers can be modified based on the Student's t-distribution. While +this modification led to noise-resistant updates, the other associated +statistics remained unchanged, resulting in inconsistencies in the assumed +models. In this paper, we propose AdaTerm, a novel approach that incorporates +the Student's t-distribution to derive not only the first-order moment but also +all the associated statistics. This provides a unified treatment of the +optimization process, offering a comprehensive framework under the statistical +model of the t-distribution for the first time. The proposed approach offers +several advantages over previously proposed approaches, including reduced +hyperparameters and improved robustness and adaptability. This noise-adaptive +behavior contributes to AdaTerm's exceptional learning performance, as +demonstrated through various optimization problems with different and/or +unknown noise ratios. Furthermore, we introduce a new technique for deriving a +theoretical regret bound without relying on AMSGrad, providing a valuable +contribution to the field + +
+
+ comment: 27 pages; Final version accepted by Elsevier Neurocomputing Journal + (2023-08; https://doi.org/10.1016/j.neucom.2023.126692) +
+
+
+
+
+ + ♻ ☆ An ML approach to resolution of singularities ICML + + +
+ The solution set of a system of polynomial equations typically contains +ill-behaved, singular points. Resolution is a fundamental process in geometry +in which we replace singular points with smooth points, while keeping the rest +of the solution set unchanged. Resolutions are not unique: the usual way to +describe them involves repeatedly performing a fundamental operation known as +"blowing-up", and the complexity of the resolution highly depends on certain +choices. The process can be translated into various versions of a 2-player +game, the so-called Hironaka game, and a winning strategy for the first player +provides a solution to the resolution problem. In this paper we introduce a new +approach to the Hironaka game that uses reinforcement learning agents to find +optimal resolutions of singularities. In certain domains, the trained model +outperforms state-of-the-art selection heuristics in total number of polynomial +additions performed, which provides a proof-of-concept that recent developments +in machine learning have the potential to improve performance of algorithms in +symbolic computation. + +
+
+ comment: To appear in Proceedings of the 40th International Conference on + Machine Learning TAG Workshop (ICML-TAG 2023) +
+
+
+
+
+ + ♻ ☆ On the Trustworthiness Landscape of State-of-the-art Generative Models: + A Comprehensive Survey + + +
+ Diffusion models and large language models have emerged as leading-edge
+generative models and have sparked a revolutionary impact on various aspects of
+human life. However, the practical implementation of these models has also
+exposed inherent risks, highlighting their dual nature and raising concerns
+regarding their trustworthiness. Despite the abundance of literature on this
+subject, a comprehensive survey specifically delving into the intersection of
+large-scale generative models and their trustworthiness remains largely absent.
+To bridge this gap, this paper investigates both the long-standing and emerging
+threats associated with these models across four fundamental dimensions:
+privacy, security, fairness, and responsibility. In this way, we construct an
+extensive map outlining the trustworthiness of these models, while also
+providing practical recommendations and identifying future directions. These
+efforts are crucial for promoting the trustworthy deployment of these models,
+ultimately benefiting society as a whole.
+
+</p>
+
+ comment: Draft Version +
+
+
+
+
+ + ♻ ☆ Stability of Aggregation Graph Neural Networks + + +
+ In this paper we study the stability properties of aggregation graph neural +networks (Agg-GNNs) considering perturbations of the underlying graph. An +Agg-GNN is a hybrid architecture where information is defined on the nodes of a +graph, but it is processed block-wise by Euclidean CNNs on the nodes after +several diffusions on the graph shift operator. We derive stability bounds for +the mapping operator associated to a generic Agg-GNN, and we specify conditions +under which such operators can be stable to deformations. We prove that the +stability bounds are defined by the properties of the filters in the first +layer of the CNN that acts on each node. Additionally, we show that there is a +close relationship between the number of aggregations, the filter's +selectivity, and the size of the stability constants. We also conclude that in +Agg-GNNs the selectivity of the mapping operators is tied to the properties of +the filters only in the first layer of the CNN stage. This shows a substantial +difference with respect to the stability properties of selection GNNs, where +the selectivity of the filters in all layers is constrained by their stability. +We provide numerical evidence corroborating the results derived, testing the +behavior of Agg-GNNs in real life application scenarios considering +perturbations of different magnitude. + +
+
+
+
+
+ + ♻ ☆ On the Choice of Perception Loss Function for Learned Video Compression + + +
+ We study causal, low-latency, sequential video compression when the output is +subjected to both a mean squared-error (MSE) distortion loss as well as a +perception loss to target realism. Motivated by prior approaches, we consider +two different perception loss functions (PLFs). The first, PLF-JD, considers +the joint distribution (JD) of all the video frames up to the current one, +while the second metric, PLF-FMD, considers the framewise marginal +distributions (FMD) between the source and reconstruction. Using information +theoretic analysis and deep-learning based experiments, we demonstrate that the +choice of PLF can have a significant effect on the reconstruction, especially +at low-bit rates. In particular, while the reconstruction based on PLF-JD can +better preserve the temporal correlation across frames, it also imposes a +significant penalty in distortion compared to PLF-FMD and further makes it more +difficult to recover from errors made in the earlier output frames. Although +the choice of PLF decisively affects reconstruction quality, we also +demonstrate that it may not be essential to commit to a particular PLF during +encoding and the choice of PLF can be delegated to the decoder. In particular, +encoded representations generated by training a system to minimize the MSE +(without requiring either PLF) can be {\em near universal} and can generate +close to optimal reconstructions for either choice of PLF at the decoder. We +validate our results using (one-shot) information-theoretic analysis, detailed +study of the rate-distortion-perception tradeoff of the Gauss-Markov source +model as well as deep-learning based experiments on moving MNIST and KTH +datasets. + +
+
+
+
+
+ + ♻ ☆ Self-supervised learning based general laboratory progress pretrained + model for cardiovascular event detection + + +
+ The inherent nature of patient data poses several challenges. Prevalent cases +amass substantial longitudinal data owing to their patient volume and +consistent follow-ups, however, longitudinal laboratory data are renowned for +their irregularity, temporality, absenteeism, and sparsity; In contrast, +recruitment for rare or specific cases is often constrained due to their +limited patient size and episodic observations. This study employed +self-supervised learning (SSL) to pretrain a generalized laboratory progress +(GLP) model that captures the overall progression of six common laboratory +markers in prevalent cardiovascular cases, with the intention of transferring +this knowledge to aid in the detection of specific cardiovascular event. GLP +implemented a two-stage training approach, leveraging the information embedded +within interpolated data and amplify the performance of SSL. After GLP +pretraining, it is transferred for TVR detection. The proposed two-stage +training improved the performance of pure SSL, and the transferability of GLP +exhibited distinctiveness. After GLP processing, the classification exhibited a +notable enhancement, with averaged accuracy rising from 0.63 to 0.90. All +evaluated metrics demonstrated substantial superiority (p < 0.01) compared to +prior GLP processing. Our study effectively engages in translational +engineering by transferring patient progression of cardiovascular laboratory +parameters from one patient group to another, transcending the limitations of +data availability. The transferability of disease progression optimized the +strategies of examinations and treatments, and improves patient prognosis while +using commonly available laboratory parameters. The potential for expanding +this approach to encompass other diseases holds great promise. + +
+
+ comment: published in IEEE Journal of Translational Engineering in Health & + Medicine +
+
+
+
+
+ + ♻ ☆ Graphon Pooling for Reducing Dimensionality of Signals and Convolutional + Operators on Graphs + + +
+ In this paper we propose a pooling approach for convolutional information +processing on graphs relying on the theory of graphons and limits of dense +graph sequences. We present three methods that exploit the induced graphon +representation of graphs and graph signals on partitions of [0, 1]2 in the +graphon space. As a result we derive low dimensional representations of the +convolutional operators, while a dimensionality reduction of the signals is +achieved by simple local interpolation of functions in L2([0, 1]). We prove +that those low dimensional representations constitute a convergent sequence of +graphs and graph signals, respectively. The methods proposed and the +theoretical guarantees that we provide show that the reduced graphs and signals +inherit spectral-structural properties of the original quantities. We evaluate +our approach with a set of numerical experiments performed on graph neural +networks (GNNs) that rely on graphon pooling. We observe that graphon pooling +performs significantly better than other approaches proposed in the literature +when dimensionality reduction ratios between layers are large. We also observe +that when graphon pooling is used we have, in general, less overfitting and +lower computational cost. + +
+
+
+
+
+ + ♻ ☆ Group Equality in Adaptive Submodular Maximization + + +
+ In this paper, we study the classic submodular maximization problem subject +to a group equality constraint under both non-adaptive and adaptive settings. +It has been shown that the utility function of many machine learning +applications, including data summarization, influence maximization in social +networks, and personalized recommendation, satisfies the property of +submodularity. Hence, maximizing a submodular function subject to various +constraints can be found at the heart of many of those applications. On a high +level, submodular maximization aims to select a group of most representative +items (e.g., data points). However, the design of most existing algorithms does +not incorporate the fairness constraint, leading to under- or +over-representation of some particular groups. This motivates us to study the +submodular maximization problem with group equality, where we aim to select a +group of items to maximize a (possibly non-monotone) submodular utility +function subject to a group equality constraint. To this end, we develop the +first constant-factor approximation algorithm for this problem. The design of +our algorithm is robust enough to be extended to solving the submodular +maximization problem under a more complicated adaptive setting. Moreover, we +further extend our study to incorporating a global cardinality constraint and +other fairness notations. + +
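+<p>
+ To make the group-equality constraint concrete, here is a hedged greedy sketch
+(editor's illustration, not the paper's constant-factor algorithm): items are
+added by marginal gain as long as the sizes of the selected groups stay within a
+given tolerance of one another. The coverage utility and the tolerance value are
+assumptions for the example.
+</p>
+<pre>
+def greedy_group_equal(items, groups, utility, k, tolerance=1):
+    """Greedily pick up to k items, keeping selected group sizes nearly equal."""
+    selected = []
+    counts = {g: 0 for g in set(groups)}
+    for _ in range(k):
+        best, best_gain = None, float("-inf")
+        for i, item in enumerate(items):
+            if item in selected:
+                continue
+            trial = dict(counts)
+            trial[groups[i]] += 1
+            # reject the candidate if it would unbalance the groups too much
+            if max(trial.values()) - min(trial.values()) > tolerance:
+                continue
+            gain = utility(selected + [item]) - utility(selected)
+            if gain > best_gain:
+                best, best_gain = i, gain
+        if best is None or best_gain <= 0:
+            break
+        selected.append(items[best])
+        counts[groups[best]] += 1
+    return selected
+
+def coverage(subset):
+    """Toy submodular utility: number of distinct tags covered."""
+    return len(set().union(*[set(x) for x in subset])) if subset else 0
+
+items = [("a", "b"), ("b", "c"), ("c", "d"), ("a", "d"), ("e",)]
+groups = ["g1", "g1", "g2", "g2", "g2"]
+print(greedy_group_equal(items, groups, coverage, k=4))
+</pre>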
+
+ comment: This paper has been accepted by INFORMS Journal on Computing +
+
+
+
+
+ + ♻ ☆ Measuring Equality in Machine Learning Security Defenses: A Case Study + in Speech Recognition + + +
+ Over the past decade, the machine learning security community has developed a +myriad of defenses for evasion attacks. An understudied question in that +community is: for whom do these defenses defend? This work considers common +approaches to defending learned systems and how security defenses result in +performance inequities across different sub-populations. We outline appropriate +parity metrics for analysis and begin to answer this question through empirical +results of the fairness implications of machine learning security methods. We +find that many methods that have been proposed can cause direct harm, like +false rejection and unequal benefits from robustness training. The framework we +propose for measuring defense equality can be applied to robustly trained +models, preprocessing-based defenses, and rejection methods. We identify a set +of datasets with a user-centered application and a reasonable computational +cost suitable for case studies in measuring the equality of defenses. In our +case study of speech command recognition, we show how such adversarial training +and augmentation have non-equal but complex protections for social subgroups +across gender, accent, and age in relation to user coverage. We present a +comparison of equality between two rejection-based defenses: randomized +smoothing and neural rejection, finding randomized smoothing more equitable due +to the sampling mechanism for minority groups. This represents the first work +examining the disparity in the adversarial robustness in the speech domain and +the fairness evaluation of rejection-based defenses. + +
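+<p>
+ Since the comparison above centers on randomized smoothing, a minimal sketch of
+that defense's prediction rule (majority vote over Gaussian-noised copies of the
+input) is shown below. This is the generic mechanism only, with an assumed noise
+level; the per-group fairness measurement comes from observing how the vote
+behaves across speaker subgroups, which is the paper's contribution, not this
+snippet's.
+</p>
+<pre>
+import torch
+
+def smoothed_predict(classifier, x, sigma=0.25, n=100):
+    """Randomized-smoothing prediction: majority class over n noisy copies of x,
+    together with the empirical vote share (a crude confidence)."""
+    with torch.no_grad():
+        noisy = x.unsqueeze(0) + sigma * torch.randn(n, *x.shape)
+        votes = classifier(noisy).argmax(dim=-1)       # (n,) predicted classes
+    counts = torch.bincount(votes)
+    top = counts.argmax()
+    return int(top), counts[top].item() / n
+
+# Usage sketch: compare (class, vote share) across demographic subgroups to see
+# whether the smoothed model abstains or errs more often for some of them.
+</pre>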
+
+ comment: Accepted to AISec'23 +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Aparecium: Revealing Secrets from Physical Photographs + + +
+ Watermarking is a crucial tool for safeguarding copyrights and can serve as a +more aesthetically pleasing alternative to QR codes. In recent years, +watermarking methods based on deep learning have proved superior robustness +against complex physical distortions than traditional watermarking methods. +However, they have certain limitations that render them less effective in +practice. For instance, current solutions necessitate physical photographs to +be rectangular for accurate localization, cannot handle physical bending or +folding, and require the hidden area to be completely captured at a close +distance and small angle. To overcome these challenges, we propose a novel deep +watermarking framework dubbed \textit{Aparecium}. Specifically, we preprocess +secrets (i.e., watermarks) into a pattern and then embed it into the cover +image, which is symmetrical to the final decoding-then-extracting process. To +capture the watermarked region from complex physical scenarios, a locator is +also introduced. Besides, we adopt a three-stage training strategy for training +convergence. Extensive experiments demonstrate that \textit{Aparecium} is not +only robust against different digital distortions, but also can resist various +physical distortions, such as screen-shooting and printing-shooting, even in +severe cases including different shapes, curvature, folding, incompleteness, +long distances, and big angles while maintaining high visual quality. +Furthermore, some ablation studies are also conducted to verify our design. + +
+
+
+
+
+ + ☆ CgT-GAN: CLIP-guided Text GAN for Image Captioning ACM MM 2023 + + +
+ The large-scale visual-language pre-trained model, Contrastive Language-Image +Pre-training (CLIP), has significantly improved image captioning for scenarios +without human-annotated image-caption pairs. Recent advanced CLIP-based image +captioning without human annotations follows a text-only training paradigm, +i.e., reconstructing text from shared embedding space. Nevertheless, these +approaches are limited by the training/inference gap or huge storage +requirements for text embeddings. Given that it is trivial to obtain images in +the real world, we propose CLIP-guided text GAN (CgT-GAN), which incorporates +images into the training process to enable the model to "see" real visual +modality. Particularly, we use adversarial training to teach CgT-GAN to mimic +the phrases of an external text corpus and CLIP-based reward to provide +semantic guidance. The caption generator is jointly rewarded based on the +caption naturalness to human language calculated from the GAN's discriminator +and the semantic guidance reward computed by the CLIP-based reward module. In +addition to the cosine similarity as the semantic guidance reward (i.e., +CLIP-cos), we further introduce a novel semantic guidance reward called +CLIP-agg, which aligns the generated caption with a weighted text embedding by +attentively aggregating the entire corpus. Experimental results on three +subtasks (ZS-IC, In-UIC and Cross-UIC) show that CgT-GAN outperforms +state-of-the-art methods significantly across all metrics. Code is available at +https://github.com/Lihr747/CgtGAN. + +
+
+ comment: Accepted at ACM MM 2023 +
+
+
+
+
+ + ☆ EVE: Efficient Vision-Language Pre-training with Masked Prediction and + Modality-Aware MoE + + +
+ Building scalable vision-language models to learn from diverse, multimodal +data remains an open challenge. In this paper, we introduce an Efficient +Vision-languagE foundation model, namely EVE, which is one unified multimodal +Transformer pre-trained solely by one unified pre-training task. Specifically, +EVE encodes both vision and language within a shared Transformer network +integrated with modality-aware sparse Mixture-of-Experts (MoE) modules, which +capture modality-specific information by selectively switching to different +experts. To unify pre-training tasks of vision and language, EVE performs +masked signal modeling on image-text pairs to reconstruct masked signals, i.e., +image pixels and text tokens, given visible signals. This simple yet effective +pre-training objective accelerates training by 3.5x compared to the model +pre-trained with Image-Text Contrastive and Image-Text Matching losses. Owing +to the combination of the unified architecture and pre-training task, EVE is +easy to scale up, enabling better downstream performance with fewer resources +and faster training speed. Despite its simplicity, EVE achieves +state-of-the-art performance on various vision-language downstream tasks, +including visual question answering, visual reasoning, and image-text +retrieval. + +
+
+
+
+
+ + ♻ ☆ Multimodal Garment Designer: Human-Centric Latent Diffusion Models for + Fashion Image Editing ICCV 2023 + + +
+ Fashion illustration is used by designers to communicate their vision and to +bring the design idea from conceptualization to realization, showing how +clothes interact with the human body. In this context, computer vision can thus +be used to improve the fashion design process. Differently from previous works +that mainly focused on the virtual try-on of garments, we propose the task of +multimodal-conditioned fashion image editing, guiding the generation of +human-centric fashion images by following multimodal prompts, such as text, +human body poses, and garment sketches. We tackle this problem by proposing a +new architecture based on latent diffusion models, an approach that has not +been used before in the fashion domain. Given the lack of existing datasets +suitable for the task, we also extend two existing fashion datasets, namely +Dress Code and VITON-HD, with multimodal annotations collected in a +semi-automatic manner. Experimental results on these new datasets demonstrate +the effectiveness of our proposal, both in terms of realism and coherence with +the given multimodal inputs. Source code and collected multimodal annotations +are publicly available at: +https://github.com/aimagelab/multimodal-garment-designer. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ VoxBlink: X-Large Speaker Verification Dataset on Camera ICASSP2024 + + +
+ In this paper, we contribute a novel and extensive dataset for speaker +verification, which contains noisy 38k identities/1.45M utterances (VoxBlink) +and relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for +training. Firstly, we accumulate a 60K+ users' list with their avatars and +download their short videos on YouTube. We then established an automatic and +scalable pipeline to extract relevant speech and video segments from these +videos. To our knowledge, the VoxBlink dataset is one of the largest speaker +recognition datasets available. Secondly, we conduct a series of experiments +based on different backbones trained on a mix of the VoxCeleb2 and the +VoxBlink-Clean. Our findings highlight a notable performance improvement, +ranging from 13% to 30%, across different backbone architectures upon +integrating our dataset for training. The dataset will be made publicly +available shortly. + +
+
+ comment: submit to ICASSP2024 +
+
+
+
+
+ + ♻ ☆ A Tale of Two Graphs: Freezing and Denoising Graph Structures for + Multimodal Recommendation + + +
+ Multimodal recommender systems utilizing multimodal features (e.g., images +and textual descriptions) typically show better recommendation accuracy than +general recommendation models based solely on user-item interactions. +Generally, prior work fuses multimodal features into item ID embeddings to +enrich item representations, thus failing to capture the latent semantic +item-item structures. In this context, LATTICE proposes to learn the latent +structure between items explicitly and achieves state-of-the-art performance +for multimodal recommendations. However, we argue the latent graph structure +learning of LATTICE is both inefficient and unnecessary. Experimentally, we +demonstrate that freezing its item-item structure before training can also +achieve competitive performance. Based on this finding, we propose a simple yet +effective model, dubbed as FREEDOM, that FREEzes the item-item graph and +DenOises the user-item interaction graph simultaneously for Multimodal +recommendation. Theoretically, we examine the design of FREEDOM through a graph +spectral perspective and demonstrate that it possesses a tighter upper bound on +the graph spectrum. In denoising the user-item interaction graph, we devise a +degree-sensitive edge pruning method, which rejects possibly noisy edges with a +high probability when sampling the graph. We evaluate the proposed model on +three real-world datasets and show that FREEDOM can significantly outperform +current strongest baselines. Compared with LATTICE, FREEDOM achieves an average +improvement of 19.07% in recommendation accuracy while reducing its memory cost +up to 6$\times$ on large graphs. The source code is available at: +https://github.com/enoche/FREEDOM. + +
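+<p>
+ A hedged sketch of degree-sensitive edge pruning as described above (the exact
+probability form used by FREEDOM may differ; the inverse-square-root rule below is
+an assumption): edges incident to high-degree nodes are rejected with higher
+probability when sampling the user-item graph.
+</p>
+<pre>
+import numpy as np
+
+def degree_sensitive_prune(edges, num_nodes, keep_scale=1.0, seed=0):
+    """Sample a sparsified edge list; keep probability shrinks with endpoint degrees.
+
+    edges: list of (u, v) node-index pairs of an undirected interaction graph.
+    """
+    rng = np.random.default_rng(seed)
+    deg = np.zeros(num_nodes)
+    for u, v in edges:
+        deg[u] += 1
+        deg[v] += 1
+    kept = []
+    for u, v in edges:
+        p_keep = min(1.0, keep_scale / np.sqrt(deg[u] * deg[v]))
+        if rng.random() < p_keep:
+            kept.append((u, v))
+    return kept
+
+edges = [(0, 3), (0, 4), (1, 3), (2, 3), (2, 4)]
+print(degree_sensitive_prune(edges, num_nodes=5, keep_scale=1.5))
+</pre>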
+
+ comment: Accepted to ACM Multimedia (MM) 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 52 + +
+
+
+ + ☆ StoryBench: A Multifaceted Benchmark for Continuous Story Visualization + + +
+ Generating video stories from text prompts is a complex task. In addition to +having high visual quality, videos need to realistically adhere to a sequence +of text prompts whilst being consistent throughout the frames. Creating a +benchmark for video generation requires data annotated over time, which +contrasts with the single caption used often in video datasets. To fill this +gap, we collect comprehensive human annotations on three existing datasets, and +introduce StoryBench: a new, challenging multi-task benchmark to reliably +evaluate forthcoming text-to-video models. Our benchmark includes three video +generation tasks of increasing difficulty: action execution, where the next +action must be generated starting from a conditioning video; story +continuation, where a sequence of actions must be executed starting from a +conditioning video; and story generation, where a video must be generated from +only text prompts. We evaluate small yet strong text-to-video baselines, and +show the benefits of training on story-like data algorithmically generated from +existing video captions. Finally, we establish guidelines for human evaluation +of video stories, and reaffirm the need of better automatic metrics for video +generation. StoryBench aims at encouraging future research efforts in this +exciting new area. + +
+
+
+
+
+ + ☆ Tryage: Real-time, intelligent Routing of User Prompts to Large Language + Model + + +
+ The introduction of the transformer architecture and the self-attention
+mechanism has led to an explosive production of language models trained on
+specific downstream tasks and data domains. With over 200,000 models in the
+Hugging Face ecosystem, users grapple with selecting and optimizing models to
+suit multifaceted workflows and data domains while addressing computational,
+security, and recency concerns. There is an urgent need for machine learning
+frameworks that can eliminate the burden of model selection and customization
+and unleash the incredible power of the vast emerging model library for end
+users. Here, we propose a context-aware routing system, Tryage, that leverages
+a language model router for optimal selection of expert models from a model
+library based on analysis of individual input prompts. Inspired by the thalamic
+router in the brain, Tryage employs a perceptive router to predict downstream
+model performance on prompts and then makes a routing decision using an
+objective function that integrates performance predictions with user goals and
+constraints that are incorporated through flags (e.g., model size, model
+recency). Tryage allows users to explore a Pareto front and automatically
+trade off between task accuracy and secondary goals including minimization of
+model size, recency, security, verbosity, and readability. Across heterogeneous
+data sets that include code, text, clinical data, and patents, the Tryage
+framework surpasses Gorilla and GPT-3.5 Turbo in dynamic model selection,
+identifying the optimal model with an accuracy of 50.9%, compared to 23.6% by
+GPT-3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how
+routing models can be applied to program and control the behavior of
+multi-model LLM systems to maximize efficient use of the expanding and evolving
+language model ecosystem.
+
+</p>
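+<p>
+ The routing objective below is an illustrative reading of the description above
+(editor's sketch; the attribute names, weights, and predictor interface are
+assumptions, not Tryage's actual API): combine a predicted per-model score with
+user-weighted penalties such as model size or age, and route to the best model.
+</p>
+<pre>
+def route(prompt, models, predict_accuracy, weights):
+    """Pick the model maximizing predicted accuracy minus weighted penalties.
+
+    models: list of dicts, e.g. {"name": ..., "size_gb": ..., "age_days": ...}
+    predict_accuracy(prompt, model): router's predicted downstream accuracy.
+    weights: per-attribute penalty weights encoding user constraints.
+    """
+    def objective(m):
+        penalty = sum(w * m.get(attr, 0.0) for attr, w in weights.items())
+        return predict_accuracy(prompt, m) - penalty
+    return max(models, key=objective)
+
+def fake_predictor(prompt, m):
+    # stub: pretend the small code model is better on code-like prompts
+    return 0.9 if "code" in m["name"] and "def " in prompt else 0.6
+
+models = [{"name": "small-code-model", "size_gb": 3, "age_days": 30},
+          {"name": "large-general-model", "size_gb": 70, "age_days": 400}]
+print(route("def fib(n):", models, fake_predictor,
+            weights={"size_gb": 0.002, "age_days": 0.0})["name"])
+</pre>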
+
+
+
+
+ + ☆ SeamlessM4T-Massively Multilingual & Multimodal Machine Translation + + +
+ What does it take to create the Babel Fish, a tool that can help individuals
+translate speech between any two languages? While recent breakthroughs in
+text-based models have pushed machine translation coverage beyond 200
+languages, unified speech-to-speech translation models have yet to achieve
+similar strides. More specifically, conventional speech-to-speech translation
+systems rely on cascaded systems that perform translation progressively,
+putting high-performing unified systems out of reach. To address these gaps, we
+introduce SeamlessM4T, a single model that supports speech-to-speech
+translation, speech-to-text translation, text-to-speech translation,
+text-to-text translation, and automatic speech recognition for up to 100
+languages. To build this, we used 1 million hours of open speech audio data to
+learn self-supervised speech representations with w2v-BERT 2.0. Subsequently,
+we created a multimodal corpus of automatically aligned speech translations.
+Filtered and combined with human-labeled and pseudo-labeled data, we developed
+the first multilingual system capable of translating from and into English for
+both speech and text. On FLEURS, SeamlessM4T sets a new standard for
+translations into multiple target languages, achieving an improvement of 20%
+BLEU over the previous SOTA in direct speech-to-text translation. Compared to
+strong cascaded models, SeamlessM4T improves the quality of into-English
+translation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in
+speech-to-speech. Tested for robustness, our system performs better against
+background noises and speaker variations in speech-to-text tasks compared to
+the current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and
+added toxicity to assess translation safety. Finally, all contributions in this
+work are open-sourced at
+https://github.com/facebookresearch/seamless_communication.
+
+</p>
+
+
+
+
+ + ☆ Using ChatGPT as a CAT tool in Easy Language translation + + +
+ This study sets out to investigate the feasibility of using ChatGPT to +translate citizen-oriented administrative texts into German Easy Language, a +simplified, controlled language variety that is adapted to the needs of people +with reading impairments. We use ChatGPT to translate selected texts from +websites of German public authorities using two strategies, i.e. linguistic and +holistic. We analyse the quality of the generated texts based on different +criteria, such as correctness, readability, and syntactic complexity. The +results indicated that the generated texts are easier than the standard texts, +but that they still do not fully meet the established Easy Language standards. +Additionally, the content is not always rendered correctly. + +
+
+
+
+
+ + ☆ BELB: a Biomedical Entity Linking Benchmark + + +
+ Biomedical entity linking (BEL) is the task of grounding entity mentions to a
+knowledge base. It plays a vital role in information extraction pipelines for
+the life sciences literature. We review recent work in the field and find that,
+as the task is absent from existing benchmarks for biomedical text mining,
+different studies adopt different experimental setups, making comparisons based
+on published numbers problematic. Furthermore, neural systems are tested
+primarily on instances linked to the broad-coverage knowledge base UMLS,
+leaving their performance on more specialized ones, e.g. genes or variants,
+understudied. We therefore developed BELB, a Biomedical Entity Linking
+Benchmark, providing access in a unified format to 11 corpora linked to 7
+knowledge bases and spanning six entity types: gene, disease, chemical,
+species, cell line and variant. BELB greatly reduces preprocessing overhead in
+testing BEL systems on multiple corpora, offering a standardized testbed for
+reproducible experiments. Using BELB we perform an extensive evaluation of six
+rule-based entity-specific systems and three recent neural approaches
+leveraging pre-trained language models. Our results reveal a mixed picture,
+showing that neural approaches fail to perform consistently across entity
+types, highlighting the need for further studies towards entity-agnostic models.
+
+</p>
+
+
+
+
+ + ☆ Empowering Refugee Claimants and their Lawyers: Using Machine Learning + to Examine Decision-Making in Refugee Law + + +
+ Our project aims at helping and supporting stakeholders in refugee status +adjudications, such as lawyers, judges, governing bodies, and claimants, in +order to make better decisions through data-driven intelligence and increase +the understanding and transparency of the refugee application process for all +involved parties. This PhD project has two primary objectives: (1) to retrieve +past cases, and (2) to analyze legal decision-making processes on a dataset of +Canadian cases. In this paper, we present the current state of our work, which +includes a completed experiment on part (1) and ongoing efforts related to part +(2). We believe that NLP-based solutions are well-suited to address these +challenges, and we investigate the feasibility of automating all steps +involved. In addition, we introduce a novel benchmark for future NLP research +in refugee law. Our methodology aims to be inclusive to all end-users and +stakeholders, with expected benefits including reduced time-to-decision, fairer +and more transparent outcomes, and improved decision quality. + +
+
+ comment: 19th International Conference on Artificial Intelligence and Law - + ICAIL 2023, Doctoral Consortium. arXiv admin note: substantial text overlap + with arXiv:2305.15533 +
+
+
+
+
+ + ☆ Unsupervised Prototype Adapter for Vision-Language Models + + +
+ Recently, large-scale pre-trained vision-language models (e.g. CLIP and +ALIGN) have demonstrated remarkable effectiveness in acquiring transferable +visual representations. To leverage the valuable knowledge encoded within these +models for downstream tasks, several fine-tuning approaches, including prompt +tuning methods and adapter-based methods, have been developed to adapt +vision-language models effectively with supervision. However, these methods +rely on the availability of annotated samples, which can be labor-intensive and +time-consuming to acquire, thus limiting scalability. To address this issue, in +this work, we design an unsupervised fine-tuning approach for vision-language +models called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for +the unannotated target datasets, we leverage the text-image aligning capability +of CLIP to automatically select the most confident samples for each class. +Utilizing these selected samples, we generate class prototypes, which serve as +the initialization for the learnable prototype model. After fine-tuning, the +prototype model prediction is combined with the original CLIP's prediction by a +residual connection to perform downstream recognition tasks. Our extensive +experimental results on image recognition and domain generalization show that +the proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter, +and also the state-of-the-art UPL method by large margins. + +
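+<p>
+ A condensed sketch of the prototype construction and residual combination
+described above, assuming precomputed, L2-normalized CLIP image features and
+zero-shot logits (editor's illustration; the confidence-based selection rule and
+mixing coefficient are simplified assumptions rather than the exact UP-Adapter
+procedure).
+</p>
+<pre>
+import torch
+import torch.nn.functional as F
+
+def build_prototypes(image_feats, clip_logits, num_classes, top_k=8):
+    """One prototype per class from the most CLIP-confident unlabeled images.
+
+    image_feats: (N, D) L2-normalized features; clip_logits: (N, C) zero-shot logits.
+    """
+    probs = clip_logits.softmax(dim=-1)
+    conf, pseudo = probs.max(dim=-1)              # confidence and pseudo-label per image
+    protos = []
+    for c in range(num_classes):
+        idx = (pseudo == c).nonzero(as_tuple=True)[0]
+        idx = idx[conf[idx].argsort(descending=True)[:top_k]]
+        if len(idx) == 0:
+            protos.append(torch.zeros(image_feats.shape[1]))
+        else:
+            protos.append(image_feats[idx].mean(dim=0))
+    return F.normalize(torch.stack(protos), dim=-1)  # (C, D)
+
+def adapter_logits(image_feats, prototypes, clip_logits, alpha=0.5):
+    """Residual combination of prototype similarities with the original CLIP logits."""
+    return clip_logits + alpha * (image_feats @ prototypes.t())
+
+feats = F.normalize(torch.randn(100, 512), dim=-1)   # toy: 100 images, 512-d features
+zs_logits = torch.randn(100, 10)                     # toy: 10 classes
+protos = build_prototypes(feats, zs_logits, num_classes=10)
+print(adapter_logits(feats, protos, zs_logits).shape)  # torch.Size([100, 10])
+</pre>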
+
+ comment: Accepted by PRCV 2023 +
+
+
+
+
+ + ☆ Can Authorship Representation Learning Capture Stylistic Features? ACL 2023 + + +
+ Automatically disentangling an author's style from the content of their +writing is a longstanding and possibly insurmountable problem in computational +linguistics. At the same time, the availability of large text corpora furnished +with author labels has recently enabled learning authorship representations in +a purely data-driven manner for authorship attribution, a task that ostensibly +depends to a greater extent on encoding writing style than encoding content. +However, success on this surrogate task does not ensure that such +representations capture writing style since authorship could also be correlated +with other latent variables, such as topic. In an effort to better understand +the nature of the information these representations convey, and specifically to +validate the hypothesis that they chiefly encode writing style, we +systematically probe these representations through a series of targeted +experiments. The results of these experiments suggest that representations +learned for the surrogate authorship prediction task are indeed sensitive to +writing style. As a consequence, authorship representations may be expected to +be robust to certain kinds of data shift, such as topic drift over time. +Additionally, our findings may open the door to downstream applications that +require stylistic representations, such as style transfer. + +
+
+ comment: appearing at TACL 2023 +
+
+
+
+
+ + ☆ Large Language Models Sensitivity to The Order of Options in + Multiple-Choice Questions + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in
+various NLP tasks. However, previous works have shown these models are
+sensitive to prompt wording, and to few-shot demonstrations and their order,
+posing challenges to fair assessment of these models. As these models become
+more powerful, it becomes imperative to understand and address these
+limitations. In this paper, we focus on LLMs' robustness on the task of
+multiple-choice questions -- a commonly adopted task to study the reasoning and
+fact-retrieving capabilities of LLMs. Investigating the sensitivity of LLMs
+towards the order of options in multiple-choice questions, we demonstrate a
+considerable performance gap of approximately 13% to 75% in LLMs on different
+benchmarks when answer options are reordered, even when using demonstrations
+in a few-shot setting. Through a detailed analysis, we conjecture that this
+sensitivity arises when LLMs are uncertain about the prediction between the
+top-2/3 choices, and that specific option placements may favor certain
+predictions among those top choices, depending on the question, due to
+positional bias. We also identify patterns in the top-2 choices that amplify or
+mitigate the model's bias toward option placement. We found that for amplifying
+bias, the optimal strategy involves positioning the top two choices as the
+first and last options. Conversely, to mitigate bias, we recommend placing
+these choices among the adjacent options. To validate our conjecture, we
+conduct various experiments and adopt two approaches to calibrate LLMs'
+predictions, leading to up to an 8 percentage point improvement across
+different models and benchmarks.
+
+</p>
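+<p>
+ A small harness sketch for measuring the order sensitivity discussed above: ask
+the same question with the options rotated and record whether the answer survives
+each permutation. The ask_model callable is a placeholder for whatever LLM API is
+being evaluated; the prompt template is an assumption.
+</p>
+<pre>
+from string import ascii_uppercase
+
+def build_prompt(question, options):
+    lines = [question] + [f"{ascii_uppercase[i]}. {opt}" for i, opt in enumerate(options)]
+    return "\n".join(lines + ["Answer with the letter only."])
+
+def order_sensitivity(question, options, correct, ask_model):
+    """Return per-rotation correctness for the same question with rotated options."""
+    results = []
+    for shift in range(len(options)):
+        rotated = options[shift:] + options[:shift]
+        letter = ask_model(build_prompt(question, rotated)).strip().upper()[:1]
+        idx = ascii_uppercase.find(letter) if letter else -1
+        picked = rotated[idx] if 0 <= idx < len(rotated) else None
+        results.append(picked == correct)
+    return results  # e.g. [True, True, False, True] reveals a positional bias
+
+# A model is order-robust on an item only if every rotation is answered correctly.
+</pre>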
+
+
+
+
+ + ☆ Sentence-Level Multimodal and Language-Agnostic Representations + + +
+ We introduce SONAR, a new multilingual and multimodal fixed-size sentence +embedding space. Our single text encoder, covering 200 languages, substantially +outperforms existing sentence embeddings such as LASER3 and LabSE on the xsim +and xsim++ multilingual similarity search tasks. Speech segments can be +embedded in the same SONAR embedding space using language-specific speech +encoders trained in a teacher-student setting on speech transcription data. Our +encoders outperform existing speech encoders on similarity search tasks. We +also provide a text decoder for 200 languages, which allows us to perform +text-to-text and speech-to-text machine translation, including for zero-shot +language and modality combinations. Our text-to-text results are competitive +compared to the state-of-the-art NLLB~1B model, despite the fixed-size +bottleneck representation. Our zero-shot speech-to-text translation results +compare favorably with strong supervised baselines such as Whisper. + +
+
+
+
+
+ + ☆ Aspect-oriented Opinion Alignment Network for Aspect-Based Sentiment + Classification ECAI 2023 + + +
+ Aspect-based sentiment classification is a crucial problem in fine-grained
+sentiment analysis, which aims to predict the sentiment polarity of the given
+aspect according to its context. Previous works have made remarkable progress
+in leveraging attention mechanisms to extract opinion words for different
+aspects. However, a persistent challenge is the effective management of
+semantic mismatches, which stem from attention mechanisms that fall short in
+adequately aligning opinion words with their corresponding aspect in
+multi-aspect sentences. To address this issue, we propose a novel
+Aspect-oriented Opinion Alignment Network (AOAN) to capture the contextual
+association between opinion words and the corresponding aspect. Specifically,
+we first introduce a neighboring span enhanced module which highlights various
+compositions of neighboring words and given aspects. In addition, we design a
+multi-perspective attention mechanism that aligns relevant opinion information
+with respect to the given aspect. Extensive experiments on three benchmark
+datasets demonstrate that our model achieves state-of-the-art results. The
+source code is available at https://github.com/AONE-NLP/ABSA-AOAN.
+
+</p>
+
+ comment: 8 pages, 5 figure, ECAI 2023 +
+
+
+
+
+ + ☆ A Survey on Large Language Model based Autonomous Agents + + +
+ Autonomous agents have long been a prominent research topic in the academic
+community. Previous research in this field often focuses on training agents
+with limited knowledge within isolated environments, which diverges
+significantly from human learning processes and thus makes it hard for the
+agents to achieve human-like decisions. Recently, through the acquisition of
+vast amounts of web knowledge, large language models (LLMs) have demonstrated
+remarkable potential in achieving human-level intelligence. This has sparked an
+upsurge in studies investigating autonomous agents based on LLMs. To harness
+the full potential of LLMs, researchers have devised diverse agent
+architectures tailored to different applications. In this paper, we present a
+comprehensive survey of these studies, delivering a systematic review of the
+field of autonomous agents from a holistic perspective. More specifically, our
+focus lies in the construction of LLM-based agents, for which we propose a
+unified framework that encompasses a majority of the previous work.
+Additionally, we provide a summary of the various applications of LLM-based AI
+agents in the domains of social science, natural science, and engineering.
+Lastly, we discuss the commonly employed evaluation strategies for LLM-based AI
+agents. Based on the previous studies, we also present several challenges and
+future directions in this field. To keep track of this field and continuously
+update our survey, we maintain a repository for the related references at
+https://github.com/Paitesanshi/LLM-Agent-Survey.
+
+</p>
+
+ comment: 32 pages, 3 figures +
+
+
+
+
+ + ☆ Extracting Relational Triples Based on Graph Recursive Neural Network + via Dynamic Feedback Forest Algorithm + + +
+ Extracting relational triples (subject, predicate, object) from text enables +the transformation of unstructured text data into structured knowledge. The +named entity recognition (NER) and the relation extraction (RE) are two +foundational subtasks in this knowledge generation pipeline. The integration of +subtasks poses a considerable challenge due to their disparate nature. This +paper presents a novel approach that converts the triple extraction task into a +graph labeling problem, capitalizing on the structural information of +dependency parsing and graph recursive neural networks (GRNNs). To integrate +subtasks, this paper proposes a dynamic feedback forest algorithm that connects +the representations of subtasks by inference operations during model training. +Experimental results demonstrate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ Convoifilter: A case study of doing cocktail party speech recognition + + +
+ This paper presents an end-to-end model designed to improve automatic speech +recognition (ASR) for a particular speaker in a crowded, noisy environment. The +model utilizes a single-channel speech enhancement module that isolates the +speaker's voice from background noise, along with an ASR module. Through this +approach, the model is able to decrease the word error rate (WER) of ASR from +80% to 26.4%. Typically, these two components are adjusted independently due to +variations in data requirements. However, speech enhancement can create +anomalies that decrease ASR efficiency. By implementing a joint fine-tuning +strategy, the model can reduce the WER from 26.4% in separate tuning to 14.5% +in joint tuning. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product + Summarization in E-commerce + + +
+ Given the long textual product information and the product image, Multi-Modal
+ Product Summarization (MMPS) aims to attract customers' interest and increase
+ their desire to purchase by highlighting product characteristics with a short
+ textual summary. Existing MMPS methods have achieved promising performance.
+ Nevertheless, several problems remain: 1) they lack end-to-end product
+ summarization, 2) they lack multi-grained multi-modal modeling, and 3) they
+ lack multi-modal attribute modeling. To address these issues, we propose an
+ end-to-end multi-grained multi-modal attribute-aware product summarization
+ method (M3PS) for generating high-quality product summaries in e-commerce. M3PS
+ jointly models product attributes and generates product summaries. Meanwhile,
+ we design several multi-grained multi-modal tasks to better guide the
+ multi-modal learning of M3PS. Furthermore, we model product attributes based on
+ both text and image modalities so that multi-modal product characteristics can
+ be manifested in the generated summaries. Extensive experiments on a real
+ large-scale Chinese e-commerce dataset demonstrate that our model outperforms
+ state-of-the-art product summarization methods w.r.t. several summarization
+ metrics.
+
+
+
+
+
+ + ☆ LEAP: Efficient and Automated Test Method for NLP Software + + +
+ The widespread adoption of DNNs in NLP software has highlighted the need for +robustness. Researchers proposed various automatic testing techniques for +adversarial test cases. However, existing methods suffer from two limitations: +weak error-discovering capabilities, with success rates ranging from 0% to +24.6% for BERT-based NLP software, and time inefficiency, taking 177.8s to +205.28s per test case, making them challenging for time-constrained scenarios. +To address these issues, this paper proposes LEAP, an automated test method +that uses LEvy flight-based Adaptive Particle swarm optimization integrated +with textual features to generate adversarial test cases. Specifically, we +adopt Levy flight for population initialization to increase the diversity of +generated test cases. We also design an inertial weight adaptive update +operator to improve the efficiency of LEAP's global optimization of +high-dimensional text examples and a mutation operator based on the greedy +strategy to reduce the search time. We conducted a series of experiments to +validate LEAP's ability to test NLP software and found that the average success +rate of LEAP in generating adversarial test cases is 79.1%, which is 6.1% +higher than the next best approach (PSOattack). While ensuring high success +rates, LEAP significantly reduces time overhead by up to 147.6s compared to +other heuristic-based methods. Additionally, the experimental results +demonstrate that LEAP can generate more transferable test cases and +significantly enhance the robustness of DNN-based systems. + +
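+ The abstract only sketches the mechanism; as a hedged illustration of the
+ Levy-flight idea used for population initialization (not LEAP's actual
+ implementation), heavy-tailed steps can be sampled with Mantegna's algorithm:
+
+     import math
+     import numpy as np
+
+     def levy_steps(n, dim, beta=1.5, seed=0):
+         """Sample Levy-flight steps via Mantegna's algorithm."""
+         rng = np.random.default_rng(seed)
+         sigma_u = (math.gamma(1 + beta) * math.sin(math.pi * beta / 2)
+                    / (math.gamma((1 + beta) / 2) * beta
+                       * 2 ** ((beta - 1) / 2))) ** (1 / beta)
+         u = rng.normal(0.0, sigma_u, size=(n, dim))
+         v = rng.normal(0.0, 1.0, size=(n, dim))
+         return u / np.abs(v) ** (1 / beta)
+
+     # Heavy-tailed steps spread the initial swarm widely, which is the
+     # intuition behind Levy-flight-based population initialization.
+     population = 0.05 * levy_steps(n=40, dim=300)
+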
+
+ comment: Accepted at ASE 2023 +
+
+
+
+
+ + ☆ Music Understanding LLaMA: Advancing Text-to-Music Generation with + Question Answering and Captioning + + +
+ Text-to-music generation (T2M-Gen) faces a major obstacle due to the scarcity +of large-scale publicly available music datasets with natural language +captions. To address this, we propose the Music Understanding LLaMA (MU-LLaMA), +capable of answering music-related questions and generating captions for music +files. Our model utilizes audio representations from a pretrained MERT model to +extract music features. However, obtaining a suitable dataset for training the +MU-LLaMA model remains challenging, as existing publicly accessible audio +question answering datasets lack the necessary depth for open-ended music +question answering. To fill this gap, we present a methodology for generating +question-answer pairs from existing audio captioning datasets and introduce the +MusicQA Dataset designed for answering open-ended music-related questions. The +experiments demonstrate that the proposed MU-LLaMA model, trained on our +designed MusicQA dataset, achieves outstanding performance in both music +question answering and music caption generation across various metrics, +outperforming current state-of-the-art (SOTA) models in both fields and +offering a promising advancement in the T2M-Gen research field. + +
+
+
+
+
+ + ☆ HopPG: Self-Iterative Program Generation for Multi-Hop Question + Answering over Heterogeneous Knowledge + + +
+ The semantic parsing-based method is an important research branch for
+ knowledge-based question answering. It usually generates executable programs
+ based on the question and then executes them to reason over a knowledge base
+ for answers. Benefiting from this inherent mechanism, it has advantages in
+ performance and interpretability. However, traditional semantic parsing
+ methods usually generate a complete program before executing it, which
+ struggles with multi-hop question answering over heterogeneous knowledge.
+ Firstly, a complete multi-hop program relies on multiple heterogeneous
+ supporting facts, and it is difficult for models to receive these facts
+ simultaneously. Secondly, these methods ignore the interaction information
+ between the previous-hop execution result and the current-hop program
+ generation. To alleviate these challenges, we propose a self-iterative
+ framework for multi-hop program generation (HopPG) over heterogeneous
+ knowledge, which leverages the previous-hop execution results to retrieve
+ supporting facts and generate subsequent programs iteratively. We evaluate our
+ model on MMQA-T^2. The experimental results show that HopPG outperforms
+ existing semantic-parsing-based baselines, especially on the multi-hop
+ questions.
+
+
+
+
+
+ + ☆ Evaluating Large Language Models on Graphs: Performance Insights and + Comparative Analysis + + +
+ Large Language Models (LLMs) have garnered considerable interest within both
+ academia and industry. Yet, the application of LLMs to graph data remains
+ under-explored. In this study, we evaluate the capabilities of four LLMs in
+ addressing several analytical problems with graph data. We employ four distinct
+ evaluation metrics: Comprehension, Correctness, Fidelity, and Rectification.
+ Our results show that: 1) LLMs effectively comprehend graph data in natural
+ language and reason with graph topology. 2) GPT models can generate logical and
+ coherent results, outperforming alternatives in correctness. 3) All examined
+ LLMs face challenges in structural reasoning, with techniques like zero-shot
+ chain-of-thought and few-shot prompting showing diminished efficacy. 4) GPT
+ models often produce erroneous answers in multi-answer tasks, raising concerns
+ in fidelity. 5) GPT models exhibit elevated confidence in their outputs,
+ potentially hindering their rectification capacities. Notably, GPT-4 has
+ demonstrated the capacity to rectify responses from GPT-3.5-turbo and its own
+ previous iterations. The code is available at:
+ https://github.com/Ayame1006/LLMtoGraph.
+
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+ + ☆ Diversity Measures: Domain-Independent Proxies for Failure in Language + Model Queries + + +
+ Error prediction in large language models often relies on domain-specific +information. In this paper, we present measures for quantification of error in +the response of a large language model based on the diversity of responses to a +given prompt - hence independent of the underlying application. We describe how +three such measures - based on entropy, Gini impurity, and centroid distance - +can be employed. We perform a suite of experiments on multiple datasets and +temperature settings to demonstrate that these measures strongly correlate with +the probability of failure. Additionally, we present empirical results +demonstrating how these measures can be applied to few-shot prompting, +chain-of-thought reasoning, and error detection. + +
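+ As a rough sketch of such diversity measures (not the authors' code), entropy
+ and Gini impurity can be computed over repeated sampled answers, and centroid
+ distance over their embeddings; the embeddings below are random stand-ins:
+
+     import math
+     from collections import Counter
+     import numpy as np
+
+     def entropy_and_gini(answers):
+         """Shannon entropy and Gini impurity over a multiset of sampled answers."""
+         probs = [c / len(answers) for c in Counter(answers).values()]
+         return (-sum(p * math.log(p) for p in probs),
+                 1.0 - sum(p * p for p in probs))
+
+     def centroid_distance(embeddings):
+         """Mean distance of response embeddings from their centroid."""
+         emb = np.asarray(embeddings, dtype=float)
+         return float(np.linalg.norm(emb - emb.mean(axis=0), axis=1).mean())
+
+     answers = ["42", "42", "41", "42", "not sure"]   # five samples, same prompt
+     print(entropy_and_gini(answers))
+     print(centroid_distance(np.random.default_rng(0).normal(size=(5, 8))))
+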
+
+
+
+
+ + ☆ ViCo: Engaging Video Comment Generation with Human Preference Rewards + + +
+ Engaging video comments play an important role in video social media, as they +are the carrier of feelings, thoughts, or humor of the audience. Preliminary +works have made initial exploration for video comment generation by adopting +caption-style encoder-decoder models. However, comment generation presents some +unique challenges distinct from caption generation, which makes these methods +somewhat less effective at generating engaging comments. In contrast to the +objective and descriptive nature of captions, comments tend to be inherently +subjective, making it hard to quantify and evaluate the engagement of comments. +Furthermore, the scarcity of truly engaging comments brings difficulty to +collecting enough high-quality training examples. In this paper, we propose +ViCo with three novel designs to tackle the above challenges for generating +engaging Video Comments. Firstly, to quantify the engagement of comments, we +utilize the number of "likes" each comment receives as a proxy of human +preference after an appropriate debiasing procedure. Secondly, to automatically +evaluate the engagement of comments, we train a reward model to align its +judgment to the above proxy. Our user studies indicate that this reward model +effectively aligns with human judgments. Lastly, to alleviate the scarcity of +high-quality comments, an initial generator is trained on readily available but +noisy data to generate comments. Then the reward model is employed to offer +feedback on the generated comments, thus optimizing the initial generator. To +facilitate the research of video commenting, we collect a large video +comment-dataset (ViCo-20k) with rich metadata from a popular video website. +Experiments on ViCo-20k show that the comments generated by our ViCo model +exhibit the best performance in terms of both quantitative and qualitative +results, particularly when engagement is considered. + +
+
+
+
+
+ + ☆ LLaMA-Reviewer: Advancing Code Review Automation with Large Language + Models through Parameter-Efficient Fine-Tuning (Practical Experience Report) + + +
+ The automation of code review activities, a long-standing pursuit in software +engineering, has been primarily addressed by numerous domain-specific +pre-trained models. Despite their success, these models frequently demand +extensive resources for pre-training from scratch. In contrast, Large Language +Models (LLMs) provide an intriguing alternative, given their remarkable +capabilities when supplemented with domain-specific knowledge. However, their +potential for automating code review tasks remains largely unexplored. + In response to this research gap, we present LLaMA-Reviewer, an innovative +framework that leverages the capabilities of LLaMA, a popular LLM, in the realm +of code review. Mindful of resource constraints, this framework employs +parameter-efficient fine-tuning (PEFT) methods, delivering high performance +while using less than 1% of trainable parameters. + An extensive evaluation of LLaMA-Reviewer is conducted on two diverse, +publicly available datasets. Notably, even with the smallest LLaMA base model +consisting of 6.7B parameters and a limited number of tuning epochs, +LLaMA-Reviewer equals the performance of existing code-review-focused models. + The ablation experiments provide insights into the influence of various +fine-tuning process components, including input representation, instruction +tuning, and different PEFT methods. To foster continuous progress in this +field, the code and all PEFT-weight plugins have been made open-source. + +
+
+ comment: Accepted to the 34th IEEE International Symposium on Software + Reliability Engineering (ISSRE 2023) +
+
+
+
+
+ + ☆ NLP-based detection of systematic anomalies among the narratives of + consumer complaints + + +
+ We develop an NLP-based procedure for detecting systematic nonmeritorious +consumer complaints, simply called systematic anomalies, among complaint +narratives. While classification algorithms are used to detect pronounced +anomalies, in the case of smaller and frequent systematic anomalies, the +algorithms may falter due to a variety of reasons, including technical ones as +well as natural limitations of human analysts. Therefore, as the next step +after classification, we convert the complaint narratives into quantitative +data, which are then analyzed using an algorithm for detecting systematic +anomalies. We illustrate the entire procedure using complaint narratives from +the Consumer Complaint Database of the Consumer Financial Protection Bureau. + +
+
+
+
+
+ + ☆ Anonymity at Risk? Assessing Re-Identification Capabilities of Large + Language Models + + +
+ Anonymity of both natural and legal persons in court rulings is a critical +aspect of privacy protection in the European Union and Switzerland. With the +advent of LLMs, concerns about large-scale re-identification of anonymized +persons are growing. In accordance with the Federal Supreme Court of +Switzerland, we explore the potential of LLMs to re-identify individuals in +court rulings by constructing a proof-of-concept using actual legal data from +the Swiss federal supreme court. Following the initial experiment, we +constructed an anonymized Wikipedia dataset as a more rigorous testing ground +to further investigate the findings. With the introduction and application of +the new task of re-identifying people in texts, we also introduce new metrics +to measure performance. We systematically analyze the factors that influence +successful re-identifications, identifying model size, input length, and +instruction tuning among the most critical determinants. Despite high +re-identification rates on Wikipedia, even the best LLMs struggled with court +decisions. The complexity is attributed to the lack of test datasets, the +necessity for substantial training resources, and data sparsity in the +information used for re-identification. In conclusion, this study demonstrates +that re-identification using LLMs may not be feasible for now, but as the +proof-of-concept on Wikipedia showed, it might become possible in the future. +We hope that our system can help enhance the confidence in the security of +anonymized decisions, thus leading to the courts being more confident to +publish decisions. + +
+
+
+
+
+ + ☆ Exploring the Effectiveness of GPT Models in Test-Taking: A Case Study + of the Driver's License Knowledge Test + + +
+ Large language models such as OpenAI's Generative Pre-trained Transformer
+ (GPT) models are proficient at answering questions, but their knowledge is
+ confined to the information present in their training data. This limitation
+ renders them ineffective when confronted with questions about recent
+ developments or non-public documents. Our research proposes a method that
+ enables GPT models to answer questions by employing context from an information
+ source not previously included in their training data. The methodology includes
+ preprocessing of contextual information, embedding of contexts and queries,
+ constructing prompts through the integration of context embeddings, and
+ generating answers using GPT models. We applied this method in a controlled
+ test scenario using the California Driver's Handbook as the information source.
+ The GPT-3 model achieved a 96% passing score on a set of 50 sample driving
+ knowledge test questions. In contrast, without context, the model's passing
+ score fell to 82%. However, the model still fails to answer some questions
+ correctly even when provided with a library of context, highlighting room for
+ improvement. The research also examined the impact of prompt length and context
+ format on the model's performance. Overall, the study provides insights into
+ the limitations and potential improvements for GPT models in question-answering
+ tasks.
+
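+ The described pipeline (embed contexts and the query, retrieve the most
+ similar chunks, build a prompt) can be sketched roughly as below; TF-IDF
+ stands in for the paper's embeddings and the final GPT call is omitted:
+
+     from sklearn.feature_extraction.text import TfidfVectorizer
+     from sklearn.metrics.pairwise import cosine_similarity
+
+     # Toy "handbook" chunks; a real pipeline would embed handbook sections.
+     chunks = [
+         "Always yield to pedestrians in a crosswalk.",
+         "The speed limit in a residential area is 25 mph unless posted otherwise.",
+         "Use your turn signal at least 100 feet before turning.",
+     ]
+     question = "How early should you signal before a turn?"
+
+     vectorizer = TfidfVectorizer().fit(chunks + [question])
+     scores = cosine_similarity(vectorizer.transform([question]),
+                                vectorizer.transform(chunks))[0]
+     top_chunks = [chunks[i] for i in scores.argsort()[::-1][:2]]
+
+     prompt = ("Answer using only the context below.\n\n"
+               "Context:\n" + "\n".join(top_chunks) +
+               f"\n\nQuestion: {question}\nAnswer:")
+     print(prompt)  # this prompt would then be sent to a GPT model
+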
+
+
+
+
+ + ☆ Towards an On-device Agent for Text Rewriting + + +
+ Large Language Models (LLMs) have demonstrated impressive capabilities for
+ text rewriting. Nonetheless, the large sizes of these models make them
+ impractical for on-device inference, which would otherwise allow for enhanced
+ privacy and economical inference. Creating a smaller yet potent language model
+ for text rewriting presents a formidable challenge because it requires
+ balancing the need for a small size with the need to retain the emergent
+ capabilities of the LLM, which requires costly data collection. To address the
+ above challenge, we introduce a new instruction tuning approach for building a
+ mobile-centric text rewriting model. Our strategies enable the generation of
+ high-quality training data without any human labeling. In addition, we propose
+ a heuristic reinforcement learning framework which substantially enhances
+ performance without requiring preference data. To further bridge the
+ performance gap with the larger server-side model, we propose an effective
+ approach that combines the mobile rewrite agent with the server model using a
+ cascade. To tailor the text rewriting tasks to mobile scenarios, we introduce
+ MessageRewriteEval, a benchmark that focuses on text rewriting for messages
+ through natural language instructions. Through empirical experiments, we
+ demonstrate that our on-device model surpasses the current state-of-the-art
+ LLMs in text rewriting while maintaining a significantly reduced model size.
+ Notably, we show that our proposed cascading approach improves model
+ performance.
+
+
+
+
+
+ + ☆ Few-shot Anomaly Detection in Text with Deviation Learning ICONIP 2023 + + +
+ Most current methods for detecting anomalies in text concentrate on +constructing models solely relying on unlabeled data. These models operate on +the presumption that no labeled anomalous examples are available, which +prevents them from utilizing prior knowledge of anomalies that are typically +present in small numbers in many real-world applications. Furthermore, these +models prioritize learning feature embeddings rather than optimizing anomaly +scores directly, which could lead to suboptimal anomaly scoring and inefficient +use of data during the learning process. In this paper, we introduce FATE, a +deep few-shot learning-based framework that leverages limited anomaly examples +and learns anomaly scores explicitly in an end-to-end method using deviation +learning. In this approach, the anomaly scores of normal examples are adjusted +to closely resemble reference scores obtained from a prior distribution. +Conversely, anomaly samples are forced to have anomalous scores that +considerably deviate from the reference score in the upper tail of the prior. +Additionally, our model is optimized to learn the distinct behavior of +anomalies by utilizing a multi-head self-attention layer and multiple instance +learning approaches. Comprehensive experiments on several benchmark datasets +demonstrate that our proposed approach attains a new level of state-of-the-art +performance. + +
+
+ comment: Accepted in ICONIP 2023 +
+
+
+
+
+ + ☆ Identifying depression-related topics in smartphone-collected + free-response speech recordings using an automatic speech recognition system + and a deep learning topic model + + +
+ Language use has been shown to correlate with depression, but large-scale
+ validation is needed. Traditional methods like clinic studies are expensive.
+ So, natural language processing has been employed on social media to predict
+ depression, but limitations remain: a lack of validated labels, biased user
+ samples, and missing context. Our study identified 29 topics in 3919
+ smartphone-collected speech recordings from 265 participants using the Whisper
+ tool and the BERTopic model. Six topics with a median PHQ-8 greater than or
+ equal to 10 were regarded as risk topics for depression: No Expectations,
+ Sleep, Mental Therapy, Haircut, Studying, and Coursework. To elucidate the
+ topic emergence and associations with depression, we compared behavioral (from
+ wearables) and linguistic characteristics across the identified topics. The
+ correlation between topic shifts and changes in depression severity over time
+ was also investigated, indicating the importance of longitudinally monitoring
+ language use. We also tested the BERTopic model on a similar smaller dataset
+ (356 speech recordings from 57 participants), obtaining some consistent
+ results. In summary, our findings demonstrate that specific speech topics may
+ indicate depression severity. The presented data-driven workflow provides a
+ practical approach to collecting and analyzing large-scale speech data from
+ real-world settings for digital health research.
+
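+ A minimal sketch of the transcribe-then-cluster workflow, assuming the
+ openai-whisper and bertopic packages, placeholder audio paths, and default
+ settings (the study's actual configuration is not given in the abstract):
+
+     import whisper
+     from bertopic import BERTopic
+
+     # Placeholder paths; the study used thousands of speech recordings.
+     audio_paths = ["rec_001.wav", "rec_002.wav"]
+
+     asr = whisper.load_model("base")
+     transcripts = [asr.transcribe(path)["text"] for path in audio_paths]
+
+     # Cluster transcripts into topics; PHQ-8 scores could then be aggregated
+     # per topic to flag possible depression-related topics (needs many docs).
+     topic_model = BERTopic(min_topic_size=2)
+     topics, probs = topic_model.fit_transform(transcripts)
+     print(topic_model.get_topic_info())
+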
+
+
+
+
+ + ☆ Halo: Estimation and Reduction of Hallucinations in Open-Source Weak + Large Language Models + + +
+ Large Language Models (LLMs) have revolutionized Natural Language Processing +(NLP). Although convenient for research and practical applications, open-source +LLMs with fewer parameters often suffer from severe hallucinations compared to +their larger counterparts. This paper focuses on measuring and reducing +hallucinations in BLOOM 7B, a representative of such weaker open-source LLMs +that are publicly available for research and commercial applications. We +introduce HaloCheck, a lightweight BlackBox knowledge-free framework designed +to quantify the severity of hallucinations in LLMs. Additionally, we explore +techniques like knowledge injection and teacher-student approaches to alleviate +hallucinations in low-parameter LLMs. Our experiments effectively demonstrate +the reduction of hallucinations in challenging domains for these LLMs. + +
+
+
+
+
+ + ☆ Knowledge Graph Prompting for Multi-Document Question Answering + + +
+ The 'pre-train, prompt, predict' paradigm of large language models (LLMs) has +achieved remarkable success in open-domain question answering (OD-QA). However, +few works explore this paradigm in the scenario of multi-document question +answering (MD-QA), a task demanding a thorough understanding of the logical +associations among the contents and structures of different documents. To fill +this crucial gap, we propose a Knowledge Graph Prompting (KGP) method to +formulate the right context in prompting LLMs for MD-QA, which consists of a +graph construction module and a graph traversal module. For graph construction, +we create a knowledge graph (KG) over multiple documents with nodes symbolizing +passages or document structures (e.g., pages/tables), and edges denoting the +semantic/lexical similarity between passages or intra-document structural +relations. For graph traversal, we design an LM-guided graph traverser that +navigates across nodes and gathers supporting passages assisting LLMs in MD-QA. +The constructed graph serves as the global ruler that regulates the +transitional space among passages and reduces retrieval latency. Concurrently, +the LM-guided traverser acts as a local navigator that gathers pertinent +context to progressively approach the question and guarantee retrieval quality. +Extensive experiments underscore the efficacy of KGP for MD-QA, signifying the +potential of leveraging graphs in enhancing the prompt design for LLMs. Our +code is at https://github.com/YuWVandy/KG-LLM-MDQA. + +
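+ The graph-construction step can be illustrated loosely as follows, with
+ TF-IDF cosine similarity standing in for the paper's lexical/semantic edge
+ criteria and the LM-guided traversal omitted:
+
+     import itertools
+     import networkx as nx
+     from sklearn.feature_extraction.text import TfidfVectorizer
+     from sklearn.metrics.pairwise import cosine_similarity
+
+     passages = {
+         "doc1_p1": "The company was founded in 1998 in Menlo Park.",
+         "doc2_p1": "Its founders met as graduate students at Stanford.",
+         "doc2_p2": "Revenue in 2004 came mostly from advertising.",
+     }
+
+     tfidf = TfidfVectorizer().fit_transform(passages.values())
+     sim = cosine_similarity(tfidf)
+
+     graph = nx.Graph()
+     graph.add_nodes_from(passages)
+     keys = list(passages)
+     for i, j in itertools.combinations(range(len(keys)), 2):
+         if sim[i, j] > 0.05:  # threshold chosen arbitrarily for illustration
+             graph.add_edge(keys[i], keys[j], weight=float(sim[i, j]))
+
+     # An LM-guided traverser would start from the passage most similar to the
+     # question and walk this graph to gather supporting context.
+     print(graph.edges(data=True))
+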
+
+
+
+
+ + ☆ Efficient Benchmarking (of Language Models) + + +
+ The increasing versatility of language models (LMs) has given rise to a new
+ class of benchmarks that comprehensively assess a broad range of capabilities.
+ Such benchmarks are associated with massive computational costs, reaching
+ thousands of GPU hours per model. However, the efficiency aspect of these
+ evaluation efforts has received little discussion in the literature. In this
+ work, we present the problem of Efficient Benchmarking, namely intelligently
+ reducing the computation costs of LM evaluation without compromising
+ reliability. Using the HELM benchmark as a test case, we investigate how
+ different benchmark design choices affect the computation-reliability
+ trade-off. We propose to evaluate the reliability of such decisions using a new
+ measure, Decision Impact on Reliability (DIoR for short). We find, for example,
+ that the current leader on HELM may change merely by removing a low-ranked
+ model from the benchmark, and observe that a handful of examples suffice to
+ obtain the correct benchmark ranking. Conversely, a slightly different choice
+ of HELM scenarios varies the ranking widely. Based on our findings, we outline
+ a set of concrete recommendations for more efficient benchmark design and
+ utilization practices, leading to dramatic cost savings with minimal loss of
+ benchmark reliability, often reducing computation by x100 or more.
+
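+ DIoR itself is defined in the paper; as a loose illustration of the underlying
+ question (how stable a leaderboard is when far fewer examples are used), one
+ can bootstrap small example subsets and check rank correlation with the full
+ ranking on toy per-example scores:
+
+     import numpy as np
+     from scipy.stats import kendalltau
+
+     rng = np.random.default_rng(0)
+     scores = rng.random((5, 200))        # toy scores: 5 models x 200 examples
+     full_means = scores.mean(axis=1)
+
+     taus = []
+     for _ in range(100):
+         idx = rng.choice(200, size=20, replace=False)  # keep only 20 examples
+         tau, _ = kendalltau(full_means, scores[:, idx].mean(axis=1))
+         taus.append(tau)
+     print(np.mean(taus))  # near 1.0 means the ranking is robust to subsampling
+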
+
+
+
+
+ + ☆ Learning to generate and corr- uh I mean repair language in real-time + + +
+ In conversation, speakers produce language incrementally, word by word, while +continuously monitoring the appropriateness of their own contribution in the +dynamically unfolding context of the conversation; and this often leads them to +repair their own utterance on the fly. This real-time language processing +capacity is furthermore crucial to the development of fluent and natural +conversational AI. In this paper, we use a previously learned Dynamic Syntax +grammar and the CHILDES corpus to develop, train and evaluate a probabilistic +model for incremental generation where input to the model is a purely semantic +generation goal concept in Type Theory with Records (TTR). We show that the +model's output exactly matches the gold candidate in 78% of cases with a +ROUGE-l score of 0.86. We further do a zero-shot evaluation of the ability of +the same model to generate self-repairs when the generation goal changes +mid-utterance. Automatic evaluation shows that the model can generate +self-repairs correctly in 85% of cases. A small human evaluation confirms the +naturalness and grammaticality of the generated self-repairs. Overall, these +results further highlight the generalisation power of grammar-based models and +lay the foundations for more controllable, and naturally interactive +conversational AI systems. + +
+
+ comment: Proceedings of the workshop on the Semantics and Pragmatics of + Dialogue, SemDial, Maribor, Slovenia (2023) +
+
+
+
+
+ + ♻ ☆ MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large + Language Models + + +
+ LLMs usually struggle to incorporate new knowledge, are prone to generating
+ hallucinations, and offer little transparency into their decision-making
+ process. In this paper, we explore how to prompt LLMs with knowledge graphs
+ (KGs), as a remedy that engages LLMs with up-to-date knowledge and elicits
+ their reasoning pathways. Specifically, we build a prompting pipeline that
+ endows LLMs with the capability of comprehending KG inputs and inferring over a
+ combination of implicit knowledge and retrieved external knowledge. In
+ addition, we investigate eliciting the mind map on which LLMs perform the
+ reasoning and generate the answers. We find that the produced mind map exhibits
+ the reasoning pathways of LLMs grounded in the ontology of knowledge, opening
+ up the prospect of probing and gauging LLM inference in production. Experiments
+ on three question-answering datasets also show that MindMap prompting leads to
+ a striking empirical gain. For instance, prompting GPT-3.5 with MindMap
+ consistently outperforms GPT-4. We also demonstrate that, with structured facts
+ retrieved from KGs, MindMap can outperform a series of
+ prompting-with-document-retrieval methods, benefiting from more accurate,
+ concise, and comprehensive knowledge from KGs.
+
+
+ comment: 7 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Exploring the Landscape of Natural Language Processing Research + + +
+ As an efficient approach to understand, generate, and process natural +language texts, research in natural language processing (NLP) has exhibited a +rapid spread and wide adoption in recent years. Given the increasing research +work in this area, several NLP-related approaches have been surveyed in the +research community. However, a comprehensive study that categorizes established +topics, identifies trends, and outlines areas for future research remains +absent. Contributing to closing this gap, we have systematically classified and +analyzed research papers in the ACL Anthology. As a result, we present a +structured overview of the research landscape, provide a taxonomy of fields of +study in NLP, analyze recent developments in NLP, summarize our findings, and +highlight directions for future work. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Fairness in Image Search: A Study of Occupational Stereotyping in Image + Retrieval and its Debiasing + + +
+ Multi-modal search engines have experienced significant growth and widespread
+ use in recent years, making them the second most common internet use. While
+ search engine systems offer a range of services, the image search field has
+ recently become a focal point in the information retrieval community, as the
+ adage goes, "a picture is worth a thousand words". Although popular search
+ engines like Google excel at image search accuracy and agility, there is an
+ ongoing debate over whether their search results can be biased in terms of
+ gender, language, demographics, socio-cultural aspects, and stereotypes. This
+ potential for bias can have a significant impact on individuals' perceptions
+ and influence their perspectives.
+ In this paper, we present our study on bias and fairness in web search, with
+ a focus on keyword-based image search. We first discuss several kinds of biases
+ that exist in search systems and why it is important to mitigate them. We
+ narrow down our study to assessing and mitigating occupational stereotypes in
+ image search, which is a prevalent fairness issue in image retrieval. For the
+ assessment of stereotypes, we take gender as an indicator. We explore various
+ open-source and proprietary APIs for gender identification from images. With
+ these, we examine the extent of gender bias in top-ranked image search results
+ obtained for several occupational keywords. To mitigate the bias, we then
+ propose a fairness-aware re-ranking algorithm that optimizes (a) relevance of
+ the search result with the keyword and (b) fairness w.r.t. the genders
+ identified. We experiment on 100 top-ranked images obtained for 10 occupational
+ keywords and consider random re-ranking and re-ranking based on relevance as
+ baselines. Our experimental results show that the fairness-aware re-ranking
+ algorithm produces rankings with better fairness scores and competitive
+ relevance scores than the baselines.
+
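+ The abstract does not spell out the re-ranking algorithm; a generic greedy
+ sketch that trades off relevance against gender balance (purely illustrative,
+ with made-up scores and labels) could look like this:
+
+     def rerank(items, lam=0.5, k=10):
+         """items: (id, relevance, gender) tuples; greedy relevance/parity trade-off."""
+         selected, remaining = [], list(items)
+         while remaining and len(selected) < k:
+             def score(item):
+                 counts = {"f": 0, "m": 0}
+                 for _, _, g in selected:
+                     counts[g] += 1
+                 counts[item[2]] += 1
+                 parity = 1.0 - abs(counts["f"] - counts["m"]) / (len(selected) + 1)
+                 return lam * item[1] + (1 - lam) * parity
+             best = max(remaining, key=score)
+             selected.append(best)
+             remaining.remove(best)
+         return selected
+
+     results = [("img1", 0.9, "m"), ("img2", 0.85, "m"), ("img3", 0.8, "f"),
+                ("img4", 0.7, "f"), ("img5", 0.6, "m")]
+     print([i for i, _, _ in rerank(results, k=4)])
+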
+
+ comment: 20 Pages, Work uses Proprietary Search Systems from the year 2021 +
+
+
+
+
+ + ♻ ☆ Taken by Surprise: Contrast effect for Similarity Scores + + +
+ Accurately evaluating the similarity of object vector embeddings is of
+ critical importance for natural language processing, information retrieval and
+ classification tasks. Popular similarity scores (e.g., cosine similarity) are
+ based on pairs of embedding vectors and disregard the distribution of the
+ ensemble from which the objects are drawn. Human perception of object
+ similarity significantly depends on the context in which the objects appear. In
+ this work we propose the $\textit{surprise score}$, an ensemble-normalized
+ similarity metric that encapsulates the contrast effect of human perception and
+ significantly improves the classification performance on zero- and few-shot
+ document classification tasks. This score quantifies the surprise of finding a
+ given similarity between two elements relative to the pairwise ensemble
+ similarities. We evaluate this metric on zero-/few-shot classification and
+ clustering tasks and typically find 10-15% better performance compared to raw
+ cosine similarity. Our code is available at
+ https://github.com/MeetElise/surprise-similarity.
+
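+ One way to read that definition, shown here as a hedged sketch rather than
+ the authors' exact formulation, is a z-score of a pair's cosine similarity
+ against all pairwise similarities in the ensemble:
+
+     import numpy as np
+
+     def surprise(a, b, ensemble):
+         """Z-score of cos(a, b) against all pairwise cosines in the ensemble."""
+         def cos(x, y):
+             return float(x @ y / (np.linalg.norm(x) * np.linalg.norm(y)))
+         pair = cos(a, b)
+         sims = [cos(ensemble[i], ensemble[j])
+                 for i in range(len(ensemble)) for j in range(i + 1, len(ensemble))]
+         return (pair - np.mean(sims)) / (np.std(sims) + 1e-9)
+
+     rng = np.random.default_rng(0)
+     docs = rng.normal(size=(20, 64))     # stand-in document embeddings
+     print(surprise(docs[0], docs[1], docs))
+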
+
+ comment: 9 pages, 2 figures and 4 tables +
+
+
+
+
+ + ♻ ☆ An Effective Method using Phrase Mechanism in Neural Machine Translation + + +
+ Machine translation is one of the essential tasks in Natural Language
+ Processing (NLP), with massive real-life applications, and it contributes to
+ other tasks in the NLP research community. Recently, Transformer-based methods
+ have attracted numerous researchers in this domain and achieved
+ state-of-the-art results for most language pairs. In this paper, we report an
+ effective method using a phrase mechanism, PhraseTransformer, to improve the
+ strong baseline Transformer model in constructing a Neural Machine Translation
+ (NMT) system for the Vietnamese-Chinese parallel corpora. Our experiments on
+ the MT dataset of the VLSP 2022 competition achieved BLEU scores of 35.3 for
+ Vietnamese-to-Chinese and 33.2 for Chinese-to-Vietnamese translation. Our code
+ is available at https://github.com/phuongnm94/PhraseTransformer.
+
+
+
+
+
+ + ♻ ☆ Using Large Language Models for Zero-Shot Natural Language Generation + from Knowledge Graphs + + +
+ In any system that uses structured knowledge graph (KG) data as its +underlying knowledge representation, KG-to-text generation is a useful tool for +turning parts of the graph data into text that can be understood by humans. +Recent work has shown that models that make use of pretraining on large amounts +of text data can perform well on the KG-to-text task even with relatively small +sets of training data on the specific graph-to-text task. In this paper, we +build on this concept by using large language models to perform zero-shot +generation based on nothing but the model's understanding of the triple +structure from what it can read. We show that ChatGPT achieves near +state-of-the-art performance on some measures of the WebNLG 2020 challenge, but +falls behind on others. Additionally, we compare factual, counter-factual and +fictional statements, and show that there is a significant connection between +what the LLM already knows about the data it is parsing and the quality of the +output text. + +
+
+ comment: 9 pages, 3 pages appendices, 1 figure, 4 tables (incl. appendices) +
+
+
+
+
+ + ♻ ☆ LibriSQA: Advancing Free-form and Open-ended Spoken Question Answering + with a Novel Dataset and Framework + + +
+ While Large Language Models (LLMs) have demonstrated commendable performance +across a myriad of domains and tasks, existing LLMs still exhibit a palpable +deficit in handling multimodal functionalities, especially for the Spoken +Question Answering (SQA) task which necessitates precise alignment and deep +interaction between speech and text features. To address the SQA challenge on +LLMs, we initially curated the free-form and open-ended LibriSQA dataset from +Librispeech, comprising Part I with natural conversational formats and Part II +encompassing multiple-choice questions followed by answers and analytical +segments. Both parts collectively include 107k SQA pairs that cover various +topics. Given the evident paucity of existing speech-text LLMs, we propose a +lightweight, end-to-end framework to execute the SQA task on the LibriSQA, +witnessing significant results. By reforming ASR into the SQA format, we +further substantiate our framework's capability in handling ASR tasks. Our +empirical findings bolster the LLMs' aptitude for aligning and comprehending +multimodal information, paving the way for the development of universal +multimodal LLMs. The dataset and demo can be found at +https://github.com/ZihanZhaoSJTU/LibriSQA. + +
+
+
+
+
+ + ♻ ☆ LARCH: Large Language Model-based Automatic Readme Creation with + Heuristics CIKM'23 + + +
+ Writing a readme is a crucial aspect of software development as it plays a
+ vital role in managing and reusing program code. Though it is a pain point for
+ many developers, automatically creating one remains a challenge even with the
+ recent advancements in large language models (LLMs), because it requires
+ generating an abstract description from thousands of lines of code. In this
+ demo paper, we show that LLMs are capable of generating coherent and
+ factually correct readmes if we can identify a code fragment that is
+ representative of the repository. Building upon this finding, we developed
+ LARCH (LLM-based Automatic Readme Creation with Heuristics) which leverages
+ representative code identification with heuristics and weak supervision.
+ Through human and automated evaluations, we illustrate that LARCH can generate
+ coherent and factually correct readmes in the majority of cases, outperforming
+ a baseline that does not rely on representative code identification. We have
+ made LARCH open-source and provided a cross-platform Visual Studio Code
+ interface and command-line interface, accessible at
+ https://github.com/hitachi-nlp/larch. A demo video showcasing LARCH's
+ capabilities is available at https://youtu.be/ZUKkh5ED-O4.
+
+
+ comment: This is a pre-print of a paper accepted at CIKM'23 Demo. Refer to the + DOI URL for the original publication +
+
+
+
+
+ + ♻ ☆ NollySenti: Leveraging Transfer Learning and Machine Translation for + Nigerian Movie Sentiment Classification ACL 2023 + + +
+ Africa has over 2000 indigenous languages, but they are under-represented in
+ NLP research due to a lack of datasets. In recent years, there has been
+ progress in developing labeled corpora for African languages. However, they are
+ often available in a single domain and may not generalize to other domains. In
+ this paper, we focus on the task of sentiment classification for cross-domain
+ adaptation. We create a new dataset, NollySenti, based on Nollywood movie
+ reviews for five languages widely spoken in Nigeria (English, Hausa, Igbo,
+ Nigerian-Pidgin, and Yoruba). We provide an extensive empirical evaluation
+ using classical machine learning methods and pre-trained language models.
+ Leveraging transfer learning, we compare the performance of cross-domain
+ adaptation from the Twitter domain and cross-lingual adaptation from the
+ English language. Our evaluation shows that transfer from English in the same
+ target domain leads to more than 5% improvement in accuracy compared to
+ transfer from Twitter in the same language. To further mitigate the domain
+ difference, we leverage machine translation (MT) from English to the other
+ Nigerian languages, which leads to a further improvement of 7% over
+ cross-lingual evaluation. While MT to low-resource languages is often of low
+ quality, through human evaluation we show that most of the translated sentences
+ preserve the sentiment of the original English reviews.
+
+
+ comment: Accepted to ACL 2023 (main conference) +
+
+
+
+
+ + ♻ ☆ PMET: Precise Model Editing in a Transformer + + +
+ Model editing techniques modify a minor proportion of knowledge in Large
+ Language Models (LLMs) at a relatively low cost and have demonstrated
+ notable success. Existing methods assume Transformer Layer (TL) hidden states
+ are values of key-value memories of the Feed-Forward Network (FFN). They
+ usually optimize the TL hidden states to memorize target knowledge and use it
+ to update the weights of the FFN in LLMs. However, the information flow of TL
+ hidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN,
+ and residual connections. Existing methods neglect the fact that the TL hidden
+ states contain information not specifically required for the FFN. Consequently,
+ the performance of model editing decreases. To achieve more precise model
+ editing, we analyze the hidden states of MHSA and FFN, finding that MHSA
+ encodes certain general knowledge extraction patterns. This implies that MHSA
+ weights do not require updating when new knowledge is introduced. Based on the
+ above findings, we introduce PMET, which simultaneously optimizes Transformer
+ Component (TC, namely MHSA and FFN) hidden states, while only using the
+ optimized TC hidden states of the FFN to precisely update the FFN weights. Our
+ experiments demonstrate that PMET exhibits state-of-the-art performance on both
+ the COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the
+ effectiveness of our enhancements, further reinforcing the finding that the
+ MHSA encodes certain general knowledge extraction patterns and indicating its
+ storage of a small amount of factual knowledge. Our code is available at
+ https://github.com/xpq-tech/PMET.git.
+
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ♻ ☆ Summary of ChatGPT-Related Research and Perspective Towards the Future + of Large Language Models + + +
+ This paper presents a comprehensive survey of ChatGPT-related (GPT-3.5 and +GPT-4) research, state-of-the-art large language models (LLM) from the GPT +series, and their prospective applications across diverse domains. Indeed, key +innovations such as large-scale pre-training that captures knowledge across the +entire world wide web, instruction fine-tuning and Reinforcement Learning from +Human Feedback (RLHF) have played significant roles in enhancing LLMs' +adaptability and performance. We performed an in-depth analysis of 194 relevant +papers on arXiv, encompassing trend analysis, word cloud representation, and +distribution analysis across various application domains. The findings reveal a +significant and increasing interest in ChatGPT-related research, predominantly +centered on direct natural language processing applications, while also +demonstrating considerable potential in areas ranging from education and +history to mathematics, medicine, and physics. This study endeavors to furnish +insights into ChatGPT's capabilities, potential implications, ethical concerns, +and offer direction for future advancements in this field. + +
+
+ comment: 21 pages, 4 figures, accepted by Meta-Radiology +
+
+
+
+
+ + ♻ ☆ WanJuan: A Comprehensive Multimodal Dataset for Advancing English and + Chinese Large Models + + +
+ The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the
+ development of large models, leading to the creation of numerous impressive
+ large language models (LLMs) and multimodal large language models (MLLMs).
+ These cutting-edge models owe their remarkable performance to high-quality
+ data. However, the details of the training data used in leading paradigms are
+ often kept confidential. This lack of transparency, coupled with the scarcity
+ of open-source data, impedes further developments within the community. As a
+ response, this paper presents "Wan Juan", a large-scale multimodal dataset
+ composed of both Chinese and English data, collected from a wide range of web
+ sources. The dataset incorporates text, image-text, and video modalities, with
+ a total volume exceeding 2TB. It was utilized in the training of InternLM, a
+ model that demonstrated significant advantages in multi-dimensional evaluations
+ when compared to models of a similar scale. All data can be accessed at
+ https://opendatalab.org.cn/WanJuan1.0.
+
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ WMFormer++: Nested Transformer for Visible Watermark Removal via Implicit + Joint Learning + + +
+ Watermarking serves as a widely adopted approach to safeguard media +copyright. In parallel, the research focus has extended to watermark removal +techniques, offering an adversarial means to enhance watermark robustness and +foster advancements in the watermarking field. Existing watermark removal +methods mainly rely on UNet with task-specific decoder branches--one for +watermark localization and the other for background image restoration. However, +watermark localization and background restoration are not isolated tasks; +precise watermark localization inherently implies regions necessitating +restoration, and the background restoration process contributes to more +accurate watermark localization. To holistically integrate information from +both branches, we introduce an implicit joint learning paradigm. This empowers +the network to autonomously navigate the flow of information between implicit +branches through a gate mechanism. Furthermore, we employ cross-channel +attention to facilitate local detail restoration and holistic structural +comprehension, while harnessing nested structures to integrate multi-scale +information. Extensive experiments are conducted on various challenging +benchmarks to validate the effectiveness of our proposed method. The results +demonstrate our approach's remarkable superiority, surpassing existing +state-of-the-art methods by a large margin. + +
+
+
+
+
+ + ♻ ☆ TrojText: Test-time Invisible Textual Trojan Insertion ICLR 2023 + + +
+ In Natural Language Processing (NLP), intelligent neuron models can be +susceptible to textual Trojan attacks. Such attacks occur when Trojan models +behave normally for standard inputs but generate malicious output for inputs +that contain a specific trigger. Syntactic-structure triggers, which are +invisible, are becoming more popular for Trojan attacks because they are +difficult to detect and defend against. However, these types of attacks require +a large corpus of training data to generate poisoned samples with the necessary +syntactic structures for Trojan insertion. Obtaining such data can be difficult +for attackers, and the process of generating syntactic poisoned triggers and +inserting Trojans can be time-consuming. This paper proposes a solution called +TrojText, which aims to determine whether invisible textual Trojan attacks can +be performed more efficiently and cost-effectively without training data. The +proposed approach, called the Representation-Logit Trojan Insertion (RLI) +algorithm, uses smaller sampled test data instead of large training data to +achieve the desired attack. The paper also introduces two additional +techniques, namely the accumulated gradient ranking (AGR) and Trojan Weights +Pruning (TWP), to reduce the number of tuned parameters and the attack +overhead. The TrojText approach was evaluated on three datasets (AG's News, +SST-2, and OLID) using three NLP models (BERT, XLNet, and DeBERTa). The +experiments demonstrated that the TrojText approach achieved a 98.35\% +classification accuracy for test sentences in the target class on the BERT +model for the AG's News dataset. The source code for TrojText is available at +https://github.com/UCF-ML-Research/TrojText. + +
+
+ comment: In The Eleventh International Conference on Learning Representations. + 2023 (ICLR 2023) +
+
+
+
+
+ + ♻ ☆ Survey on Sociodemographic Bias in Natural Language Processing + + +
+ Deep neural networks often learn unintended bias during training, which might +have harmful effects when deployed in real-world settings. This work surveys +214 papers related to sociodemographic bias in natural language processing +(NLP). In this study, we aim to provide a more comprehensive understanding of +the similarities and differences among approaches to sociodemographic bias in +NLP. To better understand the distinction between bias and real-world harm, we +turn to ideas from psychology and behavioral economics to propose a definition +for sociodemographic bias. We identify three main categories of NLP bias +research: types of bias, quantifying bias, and debiasing techniques. We +highlight the current trends in quantifying bias and debiasing techniques, +offering insights into their strengths and weaknesses. We conclude that current +approaches on quantifying bias face reliability issues, that many of the bias +metrics do not relate to real-world bias, and that debiasing techniques need to +focus more on training methods. Finally, we provide recommendations for future +work. + +
+
+ comment: 23 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Truveta Mapper: A Zero-shot Ontology Alignment Framework + + +
+ In this paper, a new perspective is suggested for unsupervised Ontology +Matching (OM) or Ontology Alignment (OA) by treating it as a translation task. +Ontologies are represented as graphs, and the translation is performed from a +node in the source ontology graph to a path in the target ontology graph. The +proposed framework, Truveta Mapper (TM), leverages a multi-task +sequence-to-sequence transformer model to perform alignment across multiple +ontologies in a zero-shot, unified and end-to-end manner. Multi-tasking enables +the model to implicitly learn the relationship between different ontologies via +transfer-learning without requiring any explicit cross-ontology manually +labeled data. This also enables the formulated framework to outperform existing +solutions for both runtime latency and alignment quality. The model is +pre-trained and fine-tuned only on publicly available text corpus and +inner-ontologies data. The proposed solution outperforms state-of-the-art +approaches, Edit-Similarity, LogMap, AML, BERTMap, and the recently presented +new OM frameworks in Ontology Alignment Evaluation Initiative (OAEI22), offers +log-linear complexity, and overall makes the OM task efficient and more +straightforward without much post-processing involving mapping extension or +mapping repair. We are open sourcing our solution. + +
+
+
+
+
+ + ♻ ☆ Blockwise Parallel Transformer for Long Context Large Models + + +
+ Transformers have emerged as the cornerstone of state-of-the-art natural +language processing models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands posed by the +self-attention mechanism and the large feedforward network in Transformers +limit their ability to handle long sequences, thereby creating challenges for +tasks involving multiple long sequences or long-term dependencies. We present a +distinct approach, Blockwise Parallel Transformer (BPT), that leverages +blockwise computation of self-attention and feedforward network fusion to +minimize memory costs. By processing longer input sequences while maintaining +memory efficiency, BPT enables training sequences up to 32 times longer than +vanilla Transformers and 2 to 4 times longer than previous memory-efficient +methods. Extensive experiments on language modeling and reinforcement learning +tasks demonstrate the effectiveness of BPT in reducing memory requirements and +improving performance. + +
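+ A minimal single-head NumPy sketch of the blockwise idea: queries are
+ processed in blocks so the full attention matrix is never materialized at
+ once (the actual BPT also blocks keys/values and fuses the feedforward
+ network, which this sketch does not attempt):
+
+     import numpy as np
+
+     def blockwise_attention(q, k, v, block=128):
+         """Process queries in blocks; only a (block x seq) score matrix is live."""
+         scale = 1.0 / np.sqrt(q.shape[-1])
+         out = np.empty_like(q)
+         for start in range(0, q.shape[0], block):
+             qb = q[start:start + block]
+             scores = qb @ k.T * scale                      # (block, seq)
+             scores -= scores.max(axis=-1, keepdims=True)   # numerical stability
+             weights = np.exp(scores)
+             weights /= weights.sum(axis=-1, keepdims=True)
+             out[start:start + block] = weights @ v
+         return out
+
+     q = k = v = np.random.randn(1024, 64)
+     y = blockwise_attention(q, k, v)
+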
+
+
+
+
+ + ♻ ☆ PyABSA: A Modularized Framework for Reproducible Aspect-based Sentiment + Analysis + + +
+ The advancement of aspect-based sentiment analysis (ABSA) has exposed the lack
+ of a user-friendly framework that can largely lower the difficulty of
+ reproducing state-of-the-art ABSA performance, especially for beginners. To
+ meet this demand, we present PyABSA, a modularized framework built on PyTorch
+ for reproducible ABSA. To facilitate ABSA research, PyABSA supports several
+ ABSA subtasks, including aspect term extraction, aspect sentiment
+ classification, and end-to-end aspect-based sentiment analysis. Concretely,
+ PyABSA integrates 29 models and 26 datasets. With just a few lines of code, the
+ result of a model on a specific dataset can be reproduced. With a modularized
+ design, PyABSA can also be flexibly extended to additional models, datasets,
+ and other related tasks. Besides, PyABSA highlights its data augmentation and
+ annotation features, which significantly address data scarcity. All are welcome
+ to have a try at https://github.com/yangheng95/PyABSA.
+
+
+
+
+
+ + ♻ ☆ Red-Teaming Large Language Models using Chain of Utterances for + Safety-Alignment + + +
+ Large language models (LLMs) have taken the world by storm with their
+ massive multi-tasking capabilities, achieved simply by optimizing over a
+ next-word prediction objective. With the emergence of their properties and
+ encoded knowledge, the risk of LLMs producing harmful outputs increases, making
+ them unfit for scalable deployment to the public. In this work, we propose a
+ new safety evaluation benchmark, RED-EVAL, that carries out red-teaming. We
+ show that even widely deployed models are susceptible to Chain of
+ Utterances-based (CoU) prompting, jailbreaking closed-source LLM-based systems
+ such as GPT-4 and ChatGPT to respond unethically to more than 65% and 73% of
+ harmful queries, respectively. We also demonstrate the consistency of RED-EVAL
+ across 8 open-source LLMs in generating harmful responses in more than 86% of
+ the red-teaming attempts. Next, we propose RED-INSTRUCT, an approach for the
+ safety alignment of LLMs. It consists of two phases: 1) HARMFULQA data
+ collection: leveraging CoU prompting, we collect a dataset that consists of
+ 1.9K harmful questions covering a wide range of topics, and 9.5K safe and 7.3K
+ harmful conversations from ChatGPT; 2) SAFE-ALIGN: we demonstrate how the
+ conversational dataset can be used for the safety alignment of LLMs by
+ minimizing the negative log-likelihood over helpful responses and penalizing
+ harmful responses via gradient ascent over the sample loss. Our model STARLING,
+ a fine-tuned Vicuna-7B, is observed to be more safely aligned when evaluated on
+ RED-EVAL and HHH benchmarks, while preserving the utility of the baseline
+ models (TruthfulQA, MMLU, and BBH).
+
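+ One plausible reading of the SAFE-ALIGN objective, sketched with toy
+ per-sample NLL values and not taken from the paper's code:
+
+     import torch
+
+     def safe_align_loss(nll_helpful, nll_harmful, alpha=0.1):
+         """Minimize NLL on helpful responses; penalize low NLL on harmful ones.
+
+         Negating the harmful-response NLL corresponds to gradient ascent on it.
+         """
+         return nll_helpful.mean() - alpha * nll_harmful.mean()
+
+     # Toy tensors standing in for per-sample NLL from a causal LM.
+     helpful = torch.tensor([2.1, 1.8, 2.4])
+     harmful = torch.tensor([0.9, 1.1, 0.7])
+     print(safe_align_loss(helpful, harmful))
+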
+
+
+
+
+ + ♻ ☆ Dive into Deep Learning + + +
+ This open-source book represents our attempt to make deep learning +approachable, teaching readers the concepts, the context, and the code. The +entire book is drafted in Jupyter notebooks, seamlessly integrating exposition +figures, math, and interactive examples with self-contained code. Our goal is +to offer a resource that could (i) be freely available for everyone; (ii) offer +sufficient technical depth to provide a starting point on the path to actually +becoming an applied machine learning scientist; (iii) include runnable code, +showing readers how to solve problems in practice; (iv) allow for rapid +updates, both by us and also by the community at large; (v) be complemented by +a forum for interactive discussion of technical details and to answer +questions. + +
+
+ comment: (HTML) https://D2L.ai (GitHub) https://github.com/d2l-ai/d2l-en/ +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 160 + +
+
+
+ + ☆ GRIP: Generating Interaction Poses Using Latent Consistency and Spatial + Cues + + +
+ Hands are dexterous and highly versatile manipulators that are central to how +humans interact with objects and their environment. Consequently, modeling +realistic hand-object interactions, including the subtle motion of individual +fingers, is critical for applications in computer graphics, computer vision, +and mixed reality. Prior work on capturing and modeling humans interacting with +objects in 3D focuses on the body and object motion, often ignoring hand pose. +In contrast, we introduce GRIP, a learning-based method that takes, as input, +the 3D motion of the body and the object, and synthesizes realistic motion for +both hands before, during, and after object interaction. As a preliminary step +before synthesizing the hand motion, we first use a network, ANet, to denoise +the arm motion. Then, we leverage the spatio-temporal relationship between the +body and the object to extract two types of novel temporal interaction cues, +and use them in a two-stage inference pipeline to generate the hand motion. In +the first stage, we introduce a new approach to enforce motion temporal +consistency in the latent space (LTC), and generate consistent interaction +motions. In the second stage, GRIP generates refined hand poses to avoid +hand-object penetrations. Given sequences of noisy body and object motion, GRIP +upgrades them to include hand-object interaction. Quantitative experiments and +perceptual studies demonstrate that GRIP outperforms baseline methods and +generalizes to unseen objects and motions from different motion-capture +datasets. + +
+
+ comment: This project was started during Omid Taheri's internship at Adobe, + in collaboration with the Max Planck Institute for Intelligent Systems +
+
+
+
+
+ + ☆ Delving into Motion-Aware Matching for Monocular 3D Object Tracking ICCV 2023 + + +
+ Recent advances in monocular 3D object detection facilitate the 3D
+multi-object tracking task based on low-cost camera sensors. In this paper, we
+find that the motion cue of objects across different time frames is critical
+for 3D multi-object tracking, yet it is less explored in existing
+monocular-based approaches. We therefore propose MoMA-M3T, a motion-aware
+framework for monocular 3D MOT that mainly consists of three motion-aware
+components. First, we represent the possible movement of an object relative to
+all object tracklets in the feature space as its motion features. Then, we
+further model the historical object tracklets along the time frame from a
+spatial-temporal perspective via a motion transformer. Finally, we propose a
+motion-aware matching module to associate historical object tracklets and
+current observations as final tracking results. We conduct extensive
+experiments on the nuScenes and KITTI datasets to demonstrate that our
+MoMA-M3T achieves competitive performance against state-of-the-art methods.
+Moreover, the proposed tracker is flexible and can be easily plugged into
+existing image-based 3D object detectors without re-training. Code and models
+are available at https://github.com/kuanchihhuang/MoMA-M3T.
+
+
+ comment: Accepted by ICCV 2023. Code is available at + https://github.com/kuanchihhuang/MoMA-M3T +
+
+
+
+
+ + ☆ StoryBench: A Multifaceted Benchmark for Continuous Story Visualization + + +
+ Generating video stories from text prompts is a complex task. In addition to +having high visual quality, videos need to realistically adhere to a sequence +of text prompts whilst being consistent throughout the frames. Creating a +benchmark for video generation requires data annotated over time, which +contrasts with the single caption used often in video datasets. To fill this +gap, we collect comprehensive human annotations on three existing datasets, and +introduce StoryBench: a new, challenging multi-task benchmark to reliably +evaluate forthcoming text-to-video models. Our benchmark includes three video +generation tasks of increasing difficulty: action execution, where the next +action must be generated starting from a conditioning video; story +continuation, where a sequence of actions must be executed starting from a +conditioning video; and story generation, where a video must be generated from +only text prompts. We evaluate small yet strong text-to-video baselines, and +show the benefits of training on story-like data algorithmically generated from +existing video captions. Finally, we establish guidelines for human evaluation +of video stories, and reaffirm the need of better automatic metrics for video +generation. StoryBench aims at encouraging future research efforts in this +exciting new area. + +
+
+
+
+
+ + ☆ GOPro: Generate and Optimize Prompts in CLIP using Self-Supervised + Learning BMVC 2023 + + +
+ Large-scale foundation models, such as CLIP, have demonstrated remarkable +success in visual recognition tasks by embedding images in a semantically rich +space. Self-supervised learning (SSL) has also shown promise in improving +visual recognition by learning invariant features. However, the combination of +CLIP with SSL is found to face challenges due to the multi-task framework that +blends CLIP's contrastive loss and SSL's loss, including difficulties with loss +weighting and inconsistency among different views of images in CLIP's output +space. To overcome these challenges, we propose a prompt learning-based model +called GOPro, which is a unified framework that ensures similarity between +various augmented views of input images in a shared image-text embedding space, +using a pair of learnable image and text projectors atop CLIP, to promote +invariance and generalizability. To automatically learn such prompts, we +leverage the visual content and style primitives extracted from pre-trained +CLIP and adapt them to the target task. In addition to CLIP's cross-domain +contrastive loss, we introduce a visual contrastive loss and a novel prompt +consistency loss, considering the different views of the images. GOPro is +trained end-to-end on all three loss objectives, combining the strengths of +CLIP and SSL in a principled manner. Empirical evaluations demonstrate that +GOPro outperforms the state-of-the-art prompting techniques on three +challenging domain generalization tasks across multiple benchmarks by a +significant margin. Our code is available at +https://github.com/mainaksingha01/GOPro. + +
+
+ comment: Accepted at BMVC 2023 +
+
+
+
+
+ + ☆ G3Reg: Pyramid Graph-based Global Registration using Gaussian Ellipsoid + Model + + +
+ This study introduces a novel framework, G3Reg, for fast and robust global +registration of LiDAR point clouds. In contrast to conventional complex +keypoints and descriptors, we extract fundamental geometric primitives +including planes, clusters, and lines (PCL) from the raw point cloud to obtain +low-level semantic segments. Each segment is formulated as a unified Gaussian +Ellipsoid Model (GEM) by employing a probability ellipsoid to ensure the ground +truth centers are encompassed with a certain degree of probability. Utilizing +these GEMs, we then present a distrust-and-verify scheme based on a Pyramid +Compatibility Graph for Global Registration (PAGOR). Specifically, we establish +an upper bound, which can be traversed based on the confidence level for +compatibility testing to construct the pyramid graph. Gradually, we solve +multiple maximum cliques (MAC) for each level of the graph, generating numerous +transformation candidates. In the verification phase, we adopt a precise and +efficient metric for point cloud alignment quality, founded on geometric +primitives, to identify the optimal candidate. The performance of the algorithm +is extensively validated on three publicly available datasets and a +self-collected multi-session dataset, without changing any parameter settings +in the experimental evaluation. The results exhibit superior robustness and +real-time performance of the G3Reg framework compared to state-of-the-art +methods. Furthermore, we demonstrate the potential for integrating individual +GEM and PAGOR components into other algorithmic frameworks to enhance their +efficacy. To advance further research and promote community understanding, we +have publicly shared the source code. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ SPANet: Frequency-balancing Token Mixer using Spectral Pooling + Aggregation Modulation ICCV 2023 + + +
+ Recent studies show that self-attentions behave like low-pass filters (as +opposed to convolutions) and enhancing their high-pass filtering capability +improves model performance. Contrary to this idea, we investigate existing +convolution-based models with spectral analysis and observe that improving the +low-pass filtering in convolution operations also leads to performance +improvement. To account for this observation, we hypothesize that utilizing +optimal token mixers that capture balanced representations of both high- and +low-frequency components can enhance the performance of models. We verify this +by decomposing visual features into the frequency domain and combining them in +a balanced manner. To handle this, we replace the balancing problem with a mask +filtering problem in the frequency domain. Then, we introduce a novel +token-mixer named SPAM and leverage it to derive a MetaFormer model termed as +SPANet. Experimental results show that the proposed method provides a way to +achieve this balance, and the balanced representations of both high- and +low-frequency components can improve the performance of models on multiple +computer vision tasks. Our code is available at +$\href{https://doranlyong.github.io/projects/spanet/}{\text{https://doranlyong.github.io/projects/spanet/}}$. + +
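+ The frequency balancing described above can be pictured as mask filtering in
+the Fourier domain: split a feature map into low- and high-frequency parts and
+recombine them with chosen weights. A minimal NumPy sketch under that reading
+(the radial cutoff and the fixed mixing weights are illustrative assumptions,
+not the SPAM token mixer itself):
+
+import numpy as np
+
+def frequency_balanced_mix(feature_map, cutoff=0.25, w_low=0.6, w_high=0.4):
+    """Split a 2D feature map into low/high-frequency parts and recombine."""
+    h, w = feature_map.shape
+    spectrum = np.fft.fftshift(np.fft.fft2(feature_map))
+    yy, xx = np.mgrid[:h, :w]
+    radius = np.hypot(yy - h / 2, xx - w / 2) / (0.5 * np.hypot(h, w))
+    low_mask = (radius <= cutoff).astype(float)            # keep low frequencies
+    low = np.fft.ifft2(np.fft.ifftshift(spectrum * low_mask)).real
+    high = feature_map - low                                # residual = high frequencies
+    return w_low * low + w_high * high
+
+mixed = frequency_balanced_mix(np.random.rand(32, 32))
+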
+
+ comment: Accepted paper at ICCV 2023 +
+
+
+
+
+ + ☆ EndoNet: model for automatic calculation of H-score on histological + slides + + +
+ The H-score is a semi-quantitative method used to assess the presence and
+distribution of proteins in tissue samples by combining the intensity of
+staining and the percentage of stained nuclei. It is widely used but
+time-consuming and can be limited in accuracy and precision. Computer-aided
+methods may help overcome these limitations and improve the efficiency of
+pathologists' workflows. In this work, we developed EndoNet, a model for the
+automatic calculation of the H-score on histological slides. Our proposed
+method uses neural networks and consists of two main parts. The first is a
+detection model which predicts keypoints at the centers of nuclei. The second
+is an H-score module which calculates the value of the H-score using the mean
+pixel values of the predicted keypoints. Our model was trained and validated
+on 1780 annotated tiles with a shape of 100x100 $\mu m$ and achieved 0.77 mAP
+on a test dataset. Moreover, the model can be adjusted to a specific
+specialist or a whole laboratory to reproduce their manner of calculating the
+H-score. Thus, EndoNet is effective and robust in the analysis of histology
+slides, and can improve and significantly accelerate the work of pathologists.
+
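+ For context, the conventional H-score turns the fraction of nuclei in each
+staining-intensity class into a single 0-300 value; the module above estimates
+those quantities from predicted keypoints. A minimal sketch of the standard
+formula, assuming per-nucleus intensity classes are already available (this is
+not the authors' pipeline):
+
+def h_score(intensity_classes):
+    """H-score = 1*%weak + 2*%moderate + 3*%strong, ranging from 0 to 300.
+    intensity_classes: list of per-nucleus labels in {0, 1, 2, 3}."""
+    n = len(intensity_classes)
+    pct = [100.0 * sum(1 for c in intensity_classes if c == k) / n for k in (1, 2, 3)]
+    return 1 * pct[0] + 2 * pct[1] + 3 * pct[2]
+
+print(h_score([0, 1, 1, 2, 3, 3, 3, 0]))  # toy example
+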
+
+
+
+
+ + ☆ Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog + Navigation + + +
+ This report details the method of the winning entry of the AVDN Challenge in
+ICCV 2023. The competition addresses the Aerial Navigation from Dialog History
+(ANDH) task, which requires a drone agent to associate dialog history with
+aerial observations to reach the destination. For better cross-modal grounding
+abilities of the drone agent, we propose a Target-Grounded Graph-Aware
+Transformer (TG-GAT) framework. Concretely, TG-GAT first leverages a
+graph-aware transformer to capture spatiotemporal dependencies, which is
+beneficial for navigation state tracking and robust action planning. In
+addition, an auxiliary visual grounding task is devised to boost the agent's
+awareness of referred landmarks. Moreover, a hybrid augmentation strategy
+based on large language models is utilized to mitigate data scarcity
+limitations. Our TG-GAT framework won the AVDN Challenge 2023, with 2.2% and
+3.0% absolute improvements over the baseline on the SPL and SR metrics,
+respectively. The code is available at
+https://github.com/yifeisu/avdn-challenge.
+
+
+
+
+
+ + ☆ Open Set Synthetic Image Source Attribution + + +
+ AI-generated images have become increasingly realistic and have garnered +significant public attention. While synthetic images are intriguing due to +their realism, they also pose an important misinformation threat. To address +this new threat, researchers have developed multiple algorithms to detect +synthetic images and identify their source generators. However, most existing +source attribution techniques are designed to operate in a closed-set scenario, +i.e. they can only be used to discriminate between known image generators. By +contrast, new image-generation techniques are rapidly emerging. To contend with +this, there is a great need for open-set source attribution techniques that can +identify when synthetic images have originated from new, unseen generators. To +address this problem, we propose a new metric learning-based approach. Our +technique works by learning transferrable embeddings capable of discriminating +between generators, even when they are not seen during training. An image is +first assigned to a candidate generator, then is accepted or rejected based on +its distance in the embedding space from known generators' learned reference +points. Importantly, we identify that initializing our source attribution +embedding network by pretraining it on image camera identification can improve +our embeddings' transferability. Through a series of experiments, we +demonstrate our approach's ability to attribute the source of synthetic images +in open-set scenarios. + +
+
+
+
+
+ + ☆ Multi-event Video-Text Retrieval ICCV2023 + + +
+ Video-Text Retrieval (VTR) is a crucial multi-modal task in an era of massive +video-text data on the Internet. A plethora of work characterized by using a +two-stream Vision-Language model architecture that learns a joint +representation of video-text pairs has become a prominent approach for the VTR +task. However, these models operate under the assumption of bijective +video-text correspondences and neglect a more practical scenario where video +content usually encompasses multiple events, while texts like user queries or +webpage metadata tend to be specific and correspond to single events. This +establishes a gap between the previous training objective and real-world +applications, leading to the potential performance degradation of earlier +models during inference. In this study, we introduce the Multi-event Video-Text +Retrieval (MeVTR) task, addressing scenarios in which each video contains +multiple different events, as a niche scenario of the conventional Video-Text +Retrieval Task. We present a simple model, Me-Retriever, which incorporates key +event video representation and a new MeVTR loss for the MeVTR task. +Comprehensive experiments show that this straightforward framework outperforms +other models in the Video-to-Text and Text-to-Video tasks, effectively +establishing a robust baseline for the MeVTR task. We believe this work serves +as a strong foundation for future studies. Code is available at +https://github.com/gengyuanmax/MeVTR. + +
+
+ comment: accepted to ICCV2023 +
+
+
+
+
+ + ☆ TrackFlow: Multi-Object Tracking with Normalizing Flows ICCV 2023 + + +
+ The field of multi-object tracking has recently seen a renewed interest in +the good old schema of tracking-by-detection, as its simplicity and strong +priors spare it from the complex design and painful babysitting of +tracking-by-attention approaches. In view of this, we aim at extending +tracking-by-detection to multi-modal settings, where a comprehensive cost has +to be computed from heterogeneous information e.g., 2D motion cues, visual +appearance, and pose estimates. More precisely, we follow a case study where a +rough estimate of 3D information is also available and must be merged with +other traditional metrics (e.g., the IoU). To achieve that, recent approaches +resort to either simple rules or complex heuristics to balance the contribution +of each cost. However, i) they require careful tuning of tailored +hyperparameters on a hold-out set, and ii) they imply these costs to be +independent, which does not hold in reality. We address these issues by +building upon an elegant probabilistic formulation, which considers the cost of +a candidate association as the negative log-likelihood yielded by a deep +density estimator, trained to model the conditional joint probability +distribution of correct associations. Our experiments, conducted on both +simulated and real benchmarks, show that our approach consistently enhances the +performance of several tracking-by-detection algorithms. + +
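+ The core idea is to replace hand-tuned cost mixing with the negative
+log-likelihood of a density model fitted on correct associations. The sketch
+below uses a single multivariate Gaussian purely as a stand-in for the paper's
+deep density estimator, and the feature choice is an assumption for
+illustration:
+
+import numpy as np
+from scipy.stats import multivariate_normal
+
+# Features of known-correct associations, e.g. (IoU, 2D motion error, depth error).
+correct = np.random.rand(500, 3) * [1.0, 0.2, 0.5]
+density = multivariate_normal(mean=correct.mean(axis=0), cov=np.cov(correct.T))
+
+def association_cost(candidate_features):
+    """Cost of a detection-tracklet pair = negative log-likelihood."""
+    return -density.logpdf(candidate_features)
+
+print(association_cost(np.array([0.8, 0.05, 0.1])))
+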
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ SwinFace: A Multi-task Transformer for Face Recognition, Expression + Recognition, Age Estimation and Attribute Estimation + + +
+ In recent years, vision transformers have been introduced into face +recognition and analysis and have achieved performance breakthroughs. However, +most previous methods generally train a single model or an ensemble of models +to perform the desired task, which ignores the synergy among different tasks +and fails to achieve improved prediction accuracy, increased data efficiency, +and reduced training time. This paper presents a multi-purpose algorithm for +simultaneous face recognition, facial expression recognition, age estimation, +and face attribute estimation (40 attributes including gender) based on a +single Swin Transformer. Our design, the SwinFace, consists of a single shared +backbone together with a subnet for each set of related tasks. To address the +conflicts among multiple tasks and meet the different demands of tasks, a +Multi-Level Channel Attention (MLCA) module is integrated into each +task-specific analysis subnet, which can adaptively select the features from +optimal levels and channels to perform the desired tasks. Extensive experiments +show that the proposed model has a better understanding of the face and +achieves excellent performance for all tasks. Especially, it achieves 90.97% +accuracy on RAF-DB and 0.22 $\epsilon$-error on CLAP2015, which are +state-of-the-art results on facial expression recognition and age estimation +respectively. The code and models will be made publicly available at +https://github.com/lxq1000/SwinFace. + +
+
+
+
+
+ + ☆ Unsupervised Prototype Adapter for Vision-Language Models + + +
+ Recently, large-scale pre-trained vision-language models (e.g. CLIP and +ALIGN) have demonstrated remarkable effectiveness in acquiring transferable +visual representations. To leverage the valuable knowledge encoded within these +models for downstream tasks, several fine-tuning approaches, including prompt +tuning methods and adapter-based methods, have been developed to adapt +vision-language models effectively with supervision. However, these methods +rely on the availability of annotated samples, which can be labor-intensive and +time-consuming to acquire, thus limiting scalability. To address this issue, in +this work, we design an unsupervised fine-tuning approach for vision-language +models called Unsupervised Prototype Adapter (UP-Adapter). Specifically, for +the unannotated target datasets, we leverage the text-image aligning capability +of CLIP to automatically select the most confident samples for each class. +Utilizing these selected samples, we generate class prototypes, which serve as +the initialization for the learnable prototype model. After fine-tuning, the +prototype model prediction is combined with the original CLIP's prediction by a +residual connection to perform downstream recognition tasks. Our extensive +experimental results on image recognition and domain generalization show that +the proposed unsupervised method outperforms 8-shot CoOp, 8-shot Tip-Adapter, +and also the state-of-the-art UPL method by large margins. + +
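+ The unsupervised adaptation described above reduces to: pseudo-label
+unlabelled images with CLIP, keep the most confident samples per class,
+average their features into prototypes, and blend prototype logits with CLIP's
+zero-shot logits through a residual connection. A NumPy sketch under those
+assumptions (feature extraction is abstracted away and the residual weight is
+illustrative):
+
+import numpy as np
+
+def build_prototypes(image_feats, clip_logits, top_k=8):
+    """image_feats: (N, D) L2-normalized features; clip_logits: (N, C) scores."""
+    pseudo, conf = clip_logits.argmax(axis=1), clip_logits.max(axis=1)
+    protos = np.zeros((clip_logits.shape[1], image_feats.shape[1]))
+    for c in range(clip_logits.shape[1]):
+        idx = np.where(pseudo == c)[0]
+        if idx.size == 0:
+            continue                                    # no confident sample for this class
+        idx = idx[np.argsort(-conf[idx])][:top_k]       # most confident samples of class c
+        protos[c] = image_feats[idx].mean(axis=0)
+    return protos / (np.linalg.norm(protos, axis=1, keepdims=True) + 1e-12)
+
+def predict(feat, protos, clip_logit_row, alpha=0.5):
+    return clip_logit_row + alpha * (protos @ feat)     # residual combination
+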
+
+ comment: Accepted by PRCV 2023 +
+
+
+
+
+ + ☆ LCCo: Lending CLIP to Co-Segmentation + + +
+ This paper studies co-segmenting the common semantic object in a set of
+images. Existing works either rely on carefully engineered networks to mine
+the implicit semantic information in visual features or require extra data
+(i.e., classification labels) for training. In this paper, we leverage the
+contrastive language-image pre-training framework (CLIP) for the task. With a
+backbone segmentation network that independently processes each image from the
+set, we introduce semantics from CLIP into the backbone features, refining
+them in a coarse-to-fine manner with three key modules: i) an image set
+feature correspondence module, encoding globally consistent semantic
+information of the image set; ii) a CLIP interaction module, using CLIP-mined
+common semantics of the image set to refine the backbone feature; iii) a CLIP
+regularization module, drawing CLIP towards this co-segmentation task,
+identifying the best CLIP semantic and using it to regularize the backbone
+feature. Experiments on four standard co-segmentation benchmark datasets show
+that our method outperforms state-of-the-art methods.
+
+
+
+
+
+ + ☆ Learning from Semantic Alignment between Unpaired Multiviews for + Egocentric Video Recognition + + +
+ We are concerned with a challenging scenario in unpaired multiview video
+learning. In this case, the model aims to learn comprehensive multiview
+representations while the cross-view semantic information exhibits variations.
+We propose Semantics-based Unpaired Multiview Learning (SUM-L) to tackle this
+unpaired multiview learning problem. The key idea is to build cross-view
+pseudo-pairs and perform view-invariant alignment by leveraging the semantic
+information of the videos. To facilitate the data efficiency of multiview
+learning, we further perform video-text alignment for first-person and
+third-person videos, to fully leverage the semantic knowledge to improve video
+representations. Extensive experiments on multiple benchmark datasets verify
+the effectiveness of our framework. Our method also outperforms multiple
+existing view-alignment methods, under a more challenging scenario than
+typical paired or unpaired multimodal or multiview learning. Our code is
+available at https://github.com/wqtwjt1996/SUM-L.
+
+
+
+
+
+ + ☆ Opening the Vocabulary of Egocentric Actions + + +
+ Human actions in egocentric videos are often hand-object interactions +composed from a verb (performed by the hand) applied to an object. Despite +their extensive scaling up, egocentric datasets still face two limitations - +sparsity of action compositions and a closed set of interacting objects. This +paper proposes a novel open vocabulary action recognition task. Given a set of +verbs and objects observed during training, the goal is to generalize the verbs +to an open vocabulary of actions with seen and novel objects. To this end, we +decouple the verb and object predictions via an object-agnostic verb encoder +and a prompt-based object encoder. The prompting leverages CLIP representations +to predict an open vocabulary of interacting objects. We create open vocabulary +benchmarks on the EPIC-KITCHENS-100 and Assembly101 datasets; whereas +closed-action methods fail to generalize, our proposed method is effective. In +addition, our object encoder significantly outperforms existing open-vocabulary +visual recognition methods in recognizing novel interacting objects. + +
+
+ comment: 20 pages, 7 figures; https://dibschat.github.io/openvocab-egoAR/ +
+
+
+
+
+ + ☆ Free Lunch for Gait Recognition: A Novel Relation Descriptor + + +
+ Gait recognition seeks correct matches for query individuals by their unique
+walking patterns at a long distance. However, current methods focus solely on
+individual gait features, disregarding inter-personal relationships. In this
+paper, we reconsider gait representation, asserting that gait is not just an
+aggregation of individual features, but also the relationships among different
+subjects' gait features once reference gaits are established. From this
+perspective, we redefine classifier weights as reference-anchored gaits,
+allowing each person's gait to be described by their relationship with these
+references. In our work, we call this novel descriptor the Relationship
+Descriptor (RD). The Relationship Descriptor offers two benefits: emphasizing
+meaningful features and enhancing robustness. To be specific, the normalized
+dot product between gait features and classifier weights signifies a
+similarity relation, where each dimension indicates the similarity between the
+test sample and each training ID's gait prototype. Despite its potential, the
+direct use of relationship descriptors poses dimensionality challenges since
+the dimension of RD depends on the training set's identity count. To address
+this, we propose a Farthest Anchored gaits Selection algorithm and a dimension
+reduction method to boost gait recognition performance. Our method can be
+built on top of off-the-shelf pre-trained classification-based models without
+extra parameters. We show that RD achieves higher recognition performance than
+directly using the extracted features. We evaluate the effectiveness of our
+method on the popular GREW, Gait3D, CASIA-B, and OU-MVLP datasets, showing
+that our method consistently outperforms the baselines and achieves
+state-of-the-art performance.
+
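+ In code, the descriptor is simply the vector of normalized dot products
+between a probe's gait feature and the classifier weights acting as reference
+gaits; matching then happens in this relation space. A small NumPy sketch of
+that idea (the Farthest Anchored gaits Selection and the dimension reduction
+steps are omitted here):
+
+import numpy as np
+
+def relationship_descriptor(gait_feature, classifier_weights):
+    """gait_feature: (D,); classifier_weights: (num_train_ids, D).
+    Returns a (num_train_ids,) similarity vector describing the probe through
+    its relation to every reference (training) identity."""
+    f = gait_feature / np.linalg.norm(gait_feature)
+    w = classifier_weights / np.linalg.norm(classifier_weights, axis=1, keepdims=True)
+    return w @ f
+
+def match(probe, gallery, weights):
+    rd_probe = relationship_descriptor(probe, weights)
+    rd_gallery = np.stack([relationship_descriptor(g, weights) for g in gallery])
+    return int(np.argmax(rd_gallery @ rd_probe))        # nearest gallery entry by RD
+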
+
+
+
+
+ + ☆ Composed Image Retrieval using Contrastive Learning and Task-oriented + CLIP-based Features + + +
+ Given a query composed of a reference image and a relative caption, the +Composed Image Retrieval goal is to retrieve images visually similar to the +reference one that integrates the modifications expressed by the caption. Given +that recent research has demonstrated the efficacy of large-scale vision and +language pre-trained (VLP) models in various tasks, we rely on features from +the OpenAI CLIP model to tackle the considered task. We initially perform a +task-oriented fine-tuning of both CLIP encoders using the element-wise sum of +visual and textual features. Then, in the second stage, we train a Combiner +network that learns to combine the image-text features integrating the bimodal +information and providing combined features used to perform the retrieval. We +use contrastive learning in both stages of training. Starting from the bare +CLIP features as a baseline, experimental results show that the task-oriented +fine-tuning and the carefully crafted Combiner network are highly effective and +outperform more complex state-of-the-art approaches on FashionIQ and CIRR, two +popular and challenging datasets for composed image retrieval. Code and +pre-trained models are available at https://github.com/ABaldrati/CLIP4Cir + +
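+ In the first stage the query is just the element-wise sum of the
+reference-image and caption embeddings, and retrieval ranks gallery images by
+cosine similarity; the learned Combiner of the second stage is not reproduced
+here. A minimal NumPy sketch of that baseline combination (dimensions are
+illustrative):
+
+import numpy as np
+
+def l2norm(x):
+    return x / np.linalg.norm(x, axis=-1, keepdims=True)
+
+def retrieve(ref_img_feat, caption_feat, gallery_feats, top_k=5):
+    """All inputs are CLIP-style embeddings of the same dimension."""
+    query = l2norm(ref_img_feat + caption_feat)          # element-wise sum combination
+    scores = l2norm(gallery_feats) @ query               # cosine-similarity ranking
+    return np.argsort(-scores)[:top_k]
+
+hits = retrieve(np.random.rand(512), np.random.rand(512), np.random.rand(1000, 512))
+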
+
+ comment: Accepted in ACM Transactions on Multimedia Computing Communications + and Applications (TOMM) +
+
+
+
+
+ + ☆ Pose2Gait: Extracting Gait Features from Monocular Video of Individuals + with Dementia MICCAI 2023 + + +
+ Video-based ambient monitoring of gait for older adults with dementia has the +potential to detect negative changes in health and allow clinicians and +caregivers to intervene early to prevent falls or hospitalizations. Computer +vision-based pose tracking models can process video data automatically and +extract joint locations; however, publicly available models are not optimized +for gait analysis on older adults or clinical populations. In this work we +train a deep neural network to map from a two dimensional pose sequence, +extracted from a video of an individual walking down a hallway toward a +wall-mounted camera, to a set of three-dimensional spatiotemporal gait features +averaged over the walking sequence. The data of individuals with dementia used +in this work was captured at two sites using a wall-mounted system to collect +the video and depth information used to train and evaluate our model. Our +Pose2Gait model is able to extract velocity and step length values from the +video that are correlated with the features from the depth camera, with +Spearman's correlation coefficients of .83 and .60 respectively, showing that +three dimensional spatiotemporal features can be predicted from monocular +video. Future work remains to improve the accuracy of other features, such as +step time and step width, and test the utility of the predicted values for +detecting meaningful changes in gait during longitudinal ambient monitoring. + +
+
+ comment: 14 pages, 3 figures. Code is available at + https://github.com/TaatiTeam/pose2gait_public . To be published at the + Ambient Intelligence for Health Care Workshop at MICCAI 2023 +
+
+
+
+
+ + ☆ Expecting The Unexpected: Towards Broad Out-Of-Distribution Detection + + +
+ Improving the reliability of deployed machine learning systems often involves +developing methods to detect out-of-distribution (OOD) inputs. However, +existing research often narrowly focuses on samples from classes that are +absent from the training set, neglecting other types of plausible distribution +shifts. This limitation reduces the applicability of these methods in +real-world scenarios, where systems encounter a wide variety of anomalous +inputs. In this study, we categorize five distinct types of distribution shifts +and critically evaluate the performance of recent OOD detection methods on each +of them. We publicly release our benchmark under the name BROAD (Benchmarking +Resilience Over Anomaly Diversity). Our findings reveal that while these +methods excel in detecting unknown classes, their performance is inconsistent +when encountering other types of distribution shifts. In other words, they only +reliably detect unexpected inputs that they have been specifically designed to +expect. As a first step toward broad OOD detection, we learn a generative model +of existing detection scores with a Gaussian mixture. By doing so, we present +an ensemble approach that offers a more consistent and comprehensive solution +for broad OOD detection, demonstrating superior performance compared to +existing methods. Our code to download BROAD and reproduce our experiments is +publicly available. + +
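+ The ensemble step described above fits a generative model over the scores of
+existing detectors, so that an input is flagged when its score vector is
+unlikely under that model. A scikit-learn sketch of the idea; the number of
+mixture components and the threshold percentile are assumptions:
+
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+# Rows = in-distribution validation samples, columns = scores from existing detectors.
+id_scores = np.random.rand(2000, 4)
+gmm = GaussianMixture(n_components=3, random_state=0).fit(id_scores)
+
+def broad_ood_score(score_vector):
+    """Lower log-likelihood under the mixture => more anomalous."""
+    return -gmm.score_samples(score_vector.reshape(1, -1))[0]
+
+threshold = np.percentile(-gmm.score_samples(id_scores), 95)
+is_ood = broad_ood_score(np.array([0.9, 0.1, 0.8, 0.95])) > threshold
+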
+
+
+
+
+ + ☆ IT3D: Improved Text-to-3D Generation with Explicit View Synthesis + + +
+ Recent strides in Text-to-3D techniques have been propelled by distilling +knowledge from powerful large text-to-image diffusion models (LDMs). +Nonetheless, existing Text-to-3D approaches often grapple with challenges such +as over-saturation, inadequate detailing, and unrealistic outputs. This study +presents a novel strategy that leverages explicitly synthesized multi-view +images to address these issues. Our approach involves the utilization of +image-to-image pipelines, empowered by LDMs, to generate posed high-quality +images based on the renderings of coarse 3D models. Although the generated +images mostly alleviate the aforementioned issues, challenges such as view +inconsistency and significant content variance persist due to the inherent +generative nature of large diffusion models, posing extensive difficulties in +leveraging these images effectively. To overcome this hurdle, we advocate +integrating a discriminator alongside a novel Diffusion-GAN dual training +strategy to guide the training of 3D models. For the incorporated +discriminator, the synthesized multi-view images are considered real data, +while the renderings of the optimized 3D models function as fake data. We +conduct a comprehensive set of experiments that demonstrate the effectiveness +of our method over baseline approaches. + +
+
+ comment: Project Page: https://github.com/buaacyw/IT3D-text-to-3D +
+
+
+
+
+
+ ☆ Dynamic Open Vocabulary Enhanced Safe-landing with Intelligence
+ (DOVESEI) IROS 2023
+
+
+
+ This work targets what we consider to be the foundational step for urban
+airborne robots, a safe landing. Our attention is directed toward what we deem
+the most crucial aspect of the safe landing perception stack: segmentation. We
+present a streamlined reactive UAV system that employs visual servoing by
+harnessing the capabilities of open vocabulary image segmentation. This
+approach can adapt to various scenarios with minimal adjustments, bypassing
+the need for extensive data accumulation to refine internal models, thanks to
+its open vocabulary methodology. Given the limitations imposed by local
+authorities, our primary focus centers on operations originating from
+altitudes of 100 meters. This choice is deliberate, as numerous preceding
+works have dealt with altitudes up to 30 meters, aligning with the
+capabilities of small stereo cameras. Consequently, we leave the remaining 20m
+to be navigated using conventional 3D path planning methods. Utilizing
+monocular cameras and image segmentation, our findings demonstrate the
+system's capability to successfully execute landing maneuvers at altitudes as
+low as 20 meters. However, this approach is vulnerable to intermittent and
+occasionally abrupt fluctuations in the segmentation between frames in a video
+stream. To address this challenge, we enhance the image segmentation output by
+introducing what we call a dynamic focus: a masking mechanism that
+self-adjusts according to the current landing stage. This dynamic focus guides
+the control system to avoid regions beyond the drone's safety radius projected
+onto the ground, thus mitigating the problems caused by these fluctuations.
+Through the implementation of this supplementary layer, our experiments show
+an almost tenfold improvement in the landing success rate compared to using
+global segmentation. All the source code is open source and available online
+(github.com/MISTLab/DOVESEI).
+
+
+ comment: Submitted to IROS 2023 The Last-Mile Robotics Workshop
+
+
+
+
+
+ + ☆ Multitemporal analysis in Google Earth Engine for detecting urban + changes using optical data and machine learning algorithms + + +
+ The aim of this work is to perform a multitemporal analysis using the Google
+Earth Engine (GEE) platform for the detection of changes in urban areas using
+optical data and specific machine learning (ML) algorithms. As a case study,
+Cairo, Egypt, has been identified as one of the five most populous megacities
+of the last decade in the world. Classification and change detection analysis
+of the region of interest (ROI) have been carried out from July 2013 to July
+2021. Results demonstrate the validity of the proposed method in identifying
+changed and unchanged urban areas over the selected period. Furthermore, this
+work aims to highlight the growing significance of GEE as an efficient
+cloud-based solution for managing large quantities of satellite data.
+
+
+ comment: 4 pages, 6 figures, 2023 InGARSS Conference +
+
+
+
+
+ + ☆ Food Image Classification and Segmentation with Attention-based Multiple + Instance Learning + + +
+ The demand for accurate food quantification has increased in the recent +years, driven by the needs of applications in dietary monitoring. At the same +time, computer vision approaches have exhibited great potential in automating +tasks within the food domain. Traditionally, the development of machine +learning models for these problems relies on training data sets with +pixel-level class annotations. However, this approach introduces challenges +arising from data collection and ground truth generation that quickly become +costly and error-prone since they must be performed in multiple settings and +for thousands of classes. To overcome these challenges, the paper presents a +weakly supervised methodology for training food image classification and +semantic segmentation models without relying on pixel-level annotations. The +proposed methodology is based on a multiple instance learning approach in +combination with an attention-based mechanism. At test time, the models are +used for classification and, concurrently, the attention mechanism generates +semantic heat maps which are used for food class segmentation. In the paper, we +conduct experiments on two meta-classes within the FoodSeg103 data set to +verify the feasibility of the proposed approach and we explore the functioning +properties of the attention mechanism. + +
+
+ comment: Accepted for presentation at 18th International Workshop on Semantic + and Social Media Adaptation & Personalization (SMAP 2023) +
+
+
+
+
+ + ☆ Towards Discriminative Representations with Contrastive Instances for + Real-Time UAV Tracking + + +
+ Maintaining high efficiency and high precision are two fundamental
+challenges in UAV tracking due to the constraints of computing resources,
+battery capacity, and UAV maximum load. Discriminative correlation filter
+(DCF)-based trackers can yield high efficiency on a single CPU but with
+inferior precision. Lightweight deep learning (DL)-based trackers can achieve
+a good balance between efficiency and precision, but their performance gains
+are limited by the compression rate. A high compression rate often leads to
+poor discriminative representations. To this end, this paper aims to enhance
+the discriminative power of feature representations from a new
+feature-learning perspective. Specifically, we attempt to learn more
+discriminative representations with contrastive instances for UAV tracking in
+a simple yet effective manner, which not only requires no manual annotations
+but also allows for developing and deploying a lightweight model. We are the
+first to explore contrastive learning for UAV tracking. Extensive experiments
+on four UAV benchmarks, including UAV123@10fps, DTB70, UAVDT and VisDrone2018,
+show that the proposed DRCI tracker significantly outperforms state-of-the-art
+UAV tracking methods.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2308.10262 +
+
+
+
+
+ + ☆ Masked Momentum Contrastive Learning for Zero-shot Semantic + Understanding + + +
+ Self-supervised pretraining (SSP) has emerged as a popular technique in
+machine learning, enabling the extraction of meaningful feature
+representations without labelled data. In the realm of computer vision,
+pretrained vision transformers (ViTs) have played a pivotal role in advancing
+transfer learning. Nonetheless, the escalating cost of finetuning these large
+models has posed a challenge due to the explosion of model size. This study
+endeavours to evaluate the effectiveness of pure self-supervised learning
+(SSL) techniques in computer vision tasks, obviating the need for finetuning,
+with the intention of emulating human-like capabilities in generalisation and
+recognition of unseen objects. To this end, we propose an evaluation protocol
+for zero-shot segmentation based on a prompting patch. Given a point on the
+target object as a prompt, the algorithm calculates the similarity map between
+the selected patch and all other patches; a simple threshold is then applied
+to this map to segment the target. A second evaluation measures intra-object
+and inter-object similarity to gauge the discriminative ability of SSP ViTs.
+Insights from prompt-based zero-shot segmentation and from the discriminative
+abilities of SSP led to the design of a simple SSP approach, termed MMC. This
+approach combines Masked image modelling, to encourage similarity of local
+features; Momentum-based self-distillation, to transfer semantics from global
+to local features; and global Contrast, to promote the semantics of global
+features, thereby enhancing the discriminative representations of SSP ViTs.
+Consequently, our proposed method significantly reduces the overlap of
+intra-object and inter-object similarities, thereby facilitating effective
+object segmentation within an image. Our experiments reveal that MMC delivers
+top-tier results in zero-shot semantic segmentation across various datasets.
+
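+ The zero-shot segmentation protocol above needs only ViT patch features and
+one prompted location: compute the similarity of every patch to the prompted
+patch and threshold the resulting map. A NumPy sketch of that evaluation (the
+threshold value and feature shapes are assumptions):
+
+import numpy as np
+
+def prompt_segment(patch_feats, prompt_idx, grid_hw, threshold=0.6):
+    """patch_feats: (N, D) patch embeddings; prompt_idx: index of the prompt patch.
+    Returns a boolean mask over the patch grid marking patches similar to the prompt."""
+    feats = patch_feats / np.linalg.norm(patch_feats, axis=1, keepdims=True)
+    sim = feats @ feats[prompt_idx]                      # cosine similarity to the prompt
+    return (sim >= threshold).reshape(grid_hw)
+
+mask = prompt_segment(np.random.rand(196, 384), prompt_idx=100, grid_hw=(14, 14))
+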
+
+
+
+
+ + ☆ Revisiting and Exploring Efficient Fast Adversarial Training via LAW: + Lipschitz Regularization and Auto Weight Averaging + + +
+ Fast Adversarial Training (FAT) not only improves model robustness but also
+reduces the training cost of standard adversarial training. However, fast
+adversarial training often suffers from Catastrophic Overfitting (CO), which
+results in poor robustness performance. Catastrophic Overfitting describes the
+phenomenon of a sudden and significant decrease in robust accuracy during fast
+adversarial training. Many effective techniques have been developed to prevent
+Catastrophic Overfitting and improve model robustness from different
+perspectives. However, these techniques adopt inconsistent training settings
+and require different training costs, i.e., training time and memory costs,
+leading to unfair comparisons. In this paper, we conduct a comprehensive study
+of over 10 fast adversarial training methods in terms of adversarial
+robustness and training costs. We revisit the effectiveness and efficiency of
+fast adversarial training techniques in preventing Catastrophic Overfitting
+from the perspective of model local nonlinearity and propose an effective
+Lipschitz regularization method for fast adversarial training. Furthermore, we
+explore the effect of data augmentation and weight averaging in fast
+adversarial training and propose a simple yet effective auto weight averaging
+method to improve robustness further. By assembling these techniques, we
+propose an FGSM-based fast adversarial training method equipped with Lipschitz
+regularization and Auto Weight averaging, abbreviated as FGSM-LAW.
+Experimental evaluations on four benchmark databases demonstrate the
+superiority of the proposed method over state-of-the-art fast adversarial
+training methods and advanced standard adversarial training methods.
+
+
+
+
+
+ + ☆ SDeMorph: Towards Better Facial De-morphing from Single Morph + + +
+ Face Recognition Systems (FRS) are vulnerable to morph attacks. A face morph
+is created by combining multiple identities with the intention of fooling the
+FRS into matching the morph with multiple identities. Current Morph Attack
+Detection (MAD) methods can detect a morph but are unable to recover the
+identities used to create it with satisfactory outcomes. Existing work in
+de-morphing is mostly reference-based, i.e., it requires the availability of
+one identity to recover the other. Sudipta et al. \cite{ref9} proposed a
+reference-free de-morphing technique, but the visual realism of the outputs
+produced was limited. In this work, we propose SDeMorph (Stably Diffused
+De-morpher), a novel de-morphing method that is reference-free and recovers
+the identities of the bona fides. Our method produces feature-rich outputs
+that are of notably high quality in terms of definition and facial fidelity.
+Our method utilizes Denoising Diffusion Probabilistic Models (DDPM) by
+destroying the input morphed signal and then reconstructing it using a
+branched UNet. Experiments on the ASML, FRLL-FaceMorph, FRLL-MorDIFF, and SMDD
+datasets support the effectiveness of the proposed method.
+
+
+
+
+
+ + ☆ Learning a More Continuous Zero Level Set in Unsigned Distance Fields + through Level Set Projection ICCV2023 + + +
+ Recent methods represent shapes with open surfaces using unsigned distance
+functions (UDFs). They train neural networks to learn UDFs and reconstruct
+surfaces from the gradients around the zero level set of the UDF. However,
+these networks struggle to learn the zero level set, where the UDF is not
+differentiable, which leads to large errors in unsigned distances and
+gradients around the zero level set, resulting in highly fragmented and
+discontinuous surfaces. To resolve this problem, we propose to learn a more
+continuous zero level set in UDFs with level set projections. Our insight is
+to guide the learning of the zero level set using the other, non-zero level
+sets via a projection procedure. Our idea is inspired by the observation that
+the non-zero level sets are much smoother and more continuous than the zero
+level set. We pull the non-zero level sets onto the zero level set with
+gradient constraints which align gradients over different level sets and
+correct unsigned distance errors on the zero level set, leading to a smoother
+and more continuous unsigned distance field. We conduct comprehensive
+experiments in surface reconstruction for point clouds, real scans or depth
+maps, and further explore the performance in unsupervised point cloud
+upsampling and unsupervised point normal estimation with the learned UDF,
+which demonstrate our non-trivial improvements over the state-of-the-art
+methods. Code is available at https://github.com/junshengzhou/LevelSetUDF .
+
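+ The projection at the heart of the method moves a point from a non-zero
+level set onto the zero level set along the negated UDF gradient,
+q = p - UDF(p) * grad(p)/|grad(p)|. A toy NumPy sketch where an analytic
+unsigned distance to a sphere surface stands in for the learned network:
+
+import numpy as np
+
+def udf(p, center=np.zeros(3), radius=1.0):
+    """Unsigned distance to a sphere surface (stand-in for a learned UDF)."""
+    return np.abs(np.linalg.norm(p - center) - radius)
+
+def grad_udf(p, eps=1e-4):
+    g = np.zeros(3)
+    for i in range(3):
+        d = np.zeros(3)
+        d[i] = eps
+        g[i] = (udf(p + d) - udf(p - d)) / (2 * eps)     # central finite differences
+    return g
+
+def project_to_zero_level_set(p):
+    g = grad_udf(p)
+    return p - udf(p) * g / (np.linalg.norm(g) + 1e-12)
+
+q = project_to_zero_level_set(np.array([0.3, 0.4, 1.5]))  # lands near the sphere
+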
+
+ comment: To appear at ICCV2023. Code is available at + https://github.com/junshengzhou/LevelSetUDF +
+
+
+
+
+ + ☆ PoseGraphNet++: Enriching 3D Human Pose with Orientation Estimation + + +
+ Existing kinematic skeleton-based 3D human pose estimation methods only +predict joint positions. Although this is sufficient to compute the yaw and +pitch of the bone rotations, the roll around the axis of the bones remains +unresolved by these methods. In this paper, we propose a novel 2D-to-3D lifting +Graph Convolution Network named PoseGraphNet++ to predict the complete human +pose including the joint positions and the bone orientations. We employ node +and edge convolutions to utilize the joint and bone features. Our model is +evaluated on multiple benchmark datasets, and its performance is either on par +with or better than the state-of-the-art in terms of both position and rotation +metrics. Through extensive ablation studies, we show that PoseGraphNet++ +benefits from exploiting the mutual relationship between the joints and the +bones. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ TurboViT: Generating Fast Vision Transformers via Generative + Architecture Search + + +
+ Vision transformers have shown unprecedented levels of performance in +tackling various visual perception tasks in recent years. However, the +architectural and computational complexity of such network architectures have +made them challenging to deploy in real-world applications with +high-throughput, low-memory requirements. As such, there has been significant +research recently on the design of efficient vision transformer architectures. +In this study, we explore the generation of fast vision transformer +architecture designs via generative architecture search (GAS) to achieve a +strong balance between accuracy and architectural and computational efficiency. +Through this generative architecture search process, we create TurboViT, a +highly efficient hierarchical vision transformer architecture design that is +generated around mask unit attention and Q-pooling design patterns. The +resulting TurboViT architecture design achieves significantly lower +architectural computational complexity (>2.47$\times$ smaller than FasterViT-0 +while achieving same accuracy) and computational complexity (>3.4$\times$ fewer +FLOPs and 0.9% higher accuracy than MobileViT2-2.0) when compared to 10 other +state-of-the-art efficient vision transformer network architecture designs +within a similar range of accuracy on the ImageNet-1K dataset. Furthermore, +TurboViT demonstrated strong inference latency and throughput in both +low-latency and batch processing scenarios (>3.21$\times$ lower latency and +>3.18$\times$ higher throughput compared to FasterViT-0 for low-latency +scenario). These promising results demonstrate the efficacy of leveraging +generative architecture search for generating efficient transformer +architecture designs for high-throughput scenarios. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ ScanNet++: A High-Fidelity Dataset of 3D Indoor Scenes ICCV 2023 + + +
+ We present ScanNet++, a large-scale dataset that couples together capture of +high-quality and commodity-level geometry and color of indoor scenes. Each +scene is captured with a high-end laser scanner at sub-millimeter resolution, +along with registered 33-megapixel images from a DSLR camera, and RGB-D streams +from an iPhone. Scene reconstructions are further annotated with an open +vocabulary of semantics, with label-ambiguous scenarios explicitly annotated +for comprehensive semantic understanding. ScanNet++ enables a new real-world +benchmark for novel view synthesis, both from high-quality RGB capture, and +importantly also from commodity-level images, in addition to a new benchmark +for 3D semantic scene understanding that comprehensively encapsulates diverse +and ambiguous semantic labeling scenarios. Currently, ScanNet++ contains 460 +scenes, 280,000 captured DSLR images, and over 3.7M iPhone RGBD frames. + +
+
+ comment: ICCV 2023. Video: https://youtu.be/E6P9e2r6M8I , Project page: + https://cy94.github.io/scannetpp/ +
+
+
+
+
+ + ☆ MatFuse: Controllable Material Generation with Diffusion Models + + +
+ Creating high quality and realistic materials in computer graphics is a +challenging and time-consuming task, which requires great expertise. In this +paper, we present MatFuse, a novel unified approach that harnesses the +generative power of diffusion models (DM) to simplify the creation of SVBRDF +maps. Our DM-based pipeline integrates multiple sources of conditioning, such +as color palettes, sketches, and pictures, enabling fine-grained control and +flexibility in material synthesis. This design allows for the combination of +diverse information sources (e.g., sketch + image embedding), enhancing +creative possibilities in line with the principle of compositionality. We +demonstrate the generative capabilities of the proposed method under various +conditioning settings; on the SVBRDF estimation task, we show that our method +yields performance comparable to state-of-the-art approaches, both +qualitatively and quantitatively. + +
+
+
+
+
+ + ☆ Non-Redundant Combination of Hand-Crafted and Deep Learning Radiomics: + Application to the Early Detection of Pancreatic Cancer MICCAI 2023 + + +
+ We address the problem of learning Deep Learning Radiomics (DLR) that are not +redundant with Hand-Crafted Radiomics (HCR). To do so, we extract DLR features +using a VAE while enforcing their independence with HCR features by minimizing +their mutual information. The resulting DLR features can be combined with +hand-crafted ones and leveraged by a classifier to predict early markers of +cancer. We illustrate our method on four early markers of pancreatic cancer and +validate it on a large independent test set. Our results highlight the value of +combining non-redundant DLR and HCR features, as evidenced by an improvement in +the Area Under the Curve compared to baseline methods that do not address +redundancy or solely rely on HCR features. + +
+
+ comment: CaPTion workshop MICCAI 2023 +
+
+
+
+
+ + ☆ Targeted Data Augmentation for bias mitigation + + +
+ The development of fair and ethical AI systems requires careful consideration +of bias mitigation, an area often overlooked or ignored. In this study, we +introduce a novel and efficient approach for addressing biases called Targeted +Data Augmentation (TDA), which leverages classical data augmentation techniques +to tackle the pressing issue of bias in data and models. Unlike the laborious +task of removing biases, our method proposes to insert biases instead, +resulting in improved performance. To identify biases, we annotated two diverse +datasets: a dataset of clinical skin lesions and a dataset of male and female +faces. These bias annotations are published for the first time in this study, +providing a valuable resource for future research. Through Counterfactual Bias +Insertion, we discovered that biases associated with the frame, ruler, and +glasses had a significant impact on models. By randomly introducing biases +during training, we mitigated these biases and achieved a substantial decrease +in bias measures, ranging from two-fold to more than 50-fold, while maintaining +a negligible increase in the error rate. + +
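+ Targeted Data Augmentation, as described, amounts to randomly pasting a
+known bias artefact (e.g. a dark frame of the kind found in dermoscopy images)
+into training images with some probability, so the model learns to ignore it.
+A small NumPy sketch under those assumptions; the artefact, probability, and
+thickness are illustrative:
+
+import numpy as np
+
+def insert_frame_bias(image, probability=0.5, thickness=6, rng=None):
+    """Randomly draw a dark frame around an (H, W, C) image."""
+    rng = rng or np.random.default_rng()
+    if rng.random() > probability:
+        return image
+    augmented = image.copy()
+    augmented[:thickness, :, :] = 0
+    augmented[-thickness:, :, :] = 0
+    augmented[:, :thickness, :] = 0
+    augmented[:, -thickness:, :] = 0
+    return augmented
+
+batch = [insert_frame_bias(img) for img in np.random.rand(8, 224, 224, 3)]
+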
+
+
+
+
+ + ☆ DALNet: A Rail Detection Network Based on Dynamic Anchor Line + + +
+ Rail detection is one of the key components of intelligent trains. In this
+paper, motivated by anchor line-based lane detection methods, we propose a
+rail detection network called DALNet based on a dynamic anchor line. Aiming to
+solve the problem that a predefined anchor line is image-agnostic, we design a
+novel dynamic anchor line mechanism. It utilizes a dynamic anchor line
+generator to dynamically generate an appropriate anchor line for each rail
+instance based on the position and shape of the rails in the input image.
+These dynamically generated anchor lines can be considered better position
+references for accurately localizing the rails than predefined anchor lines.
+In addition, we present DL-Rail, a challenging urban rail detection dataset
+with high-quality annotations and scenario diversity. DL-Rail contains 7000
+pairs of images and annotations along with scene tags, and it is expected to
+encourage the development of rail detection research. We extensively compare
+DALNet with many competitive lane detection methods. The results show that our
+DALNet achieves state-of-the-art performance on our DL-Rail rail detection
+dataset and on the popular Tusimple and LLAMAS lane detection benchmarks. The
+code will be released at \url{https://github.com/Yzichen/mmLaneDet}.
+
+
+
+
+
+ + ☆ Boundary-RL: Reinforcement Learning for Weakly-Supervised Prostate + Segmentation in TRUS Images MICCAI + + +
+ We propose Boundary-RL, a novel weakly supervised segmentation method that +utilises only patch-level labels for training. We envision the segmentation as +a boundary detection problem, rather than a pixel-level classification as in +previous works. This outlook on segmentation may allow for boundary delineation +under challenging scenarios such as where noise artefacts may be present within +the region-of-interest (ROI) boundaries, where traditional pixel-level +classification-based weakly supervised methods may not be able to effectively +segment the ROI. Particularly of interest, ultrasound images, where intensity +values represent acoustic impedance differences between boundaries, may also +benefit from the boundary delineation approach. Our method uses reinforcement +learning to train a controller function to localise boundaries of ROIs using a +reward derived from a pre-trained boundary-presence classifier. The classifier +indicates when an object boundary is encountered within a patch, as the +controller modifies the patch location in a sequential Markov decision process. +The classifier itself is trained using only binary patch-level labels of object +presence, which are the only labels used during training of the entire boundary +delineation framework, and serves as a weak signal to inform the boundary +delineation. The use of a controller function ensures that a sliding window +over the entire image is not necessary. It also prevents possible +false-positive or -negative cases by minimising number of patches passed to the +boundary-presence classifier. We evaluate our proposed approach for a +clinically relevant task of prostate gland segmentation on trans-rectal +ultrasound images. We show improved performance compared to other tested weakly +supervised methods, using the same labels e.g., multiple instance learning. + +
+
+ comment: Accepted to MICCAI Workshop MLMI 2023 (14th International Conference + on Machine Learning in Medical Imaging) +
+
+
+
+
+ + ☆ Enhancing Interpretable Object Abstraction via Clustering-based Slot + Initialization + + +
+ Object-centric representations using slots have shown the advances towards +efficient, flexible and interpretable abstraction from low-level perceptual +features in a compositional scene. Current approaches randomize the initial +state of slots followed by an iterative refinement. As we show in this paper, +the random slot initialization significantly affects the accuracy of the final +slot prediction. Moreover, current approaches require a predetermined number of +slots from prior knowledge of the data, which limits the applicability in the +real world. In our work, we initialize the slot representations with clustering +algorithms conditioned on the perceptual input features. This requires an +additional layer in the architecture to initialize the slots given the +identified clusters. We design permutation invariant and permutation +equivariant versions of this layer to enable the exchangeable slot +representations after clustering. Additionally, we employ mean-shift clustering +to automatically identify the number of slots for a given scene. We evaluate +our method on object discovery and novel view synthesis tasks with various +datasets. The results show that our method outperforms prior works +consistently, especially for complex scenes. + +
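+ Clustering-based slot initialization can be illustrated with mean-shift over
+per-location perceptual features: the discovered cluster centres become the
+initial slots, and their count fixes the number of slots for the scene. A
+scikit-learn sketch; feature extraction and the permutation-equivariant
+initialization layer are outside its scope:
+
+import numpy as np
+from sklearn.cluster import MeanShift
+
+def init_slots(feature_map, bandwidth=None):
+    """feature_map: (H, W, D) perceptual features of one scene.
+    Returns (num_slots, D) initial slot vectors; num_slots is data-dependent."""
+    feats = feature_map.reshape(-1, feature_map.shape[-1])
+    ms = MeanShift(bandwidth=bandwidth).fit(feats)       # mean-shift finds the modes
+    return ms.cluster_centers_
+
+slots = init_slots(np.random.rand(16, 16, 32))
+print(slots.shape)  # (num_slots, 32), number of slots inferred from the data
+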
+
+
+
+
+ + ☆ Towards Clip-Free Quantized Super-Resolution Networks: How to Tame + Representative Images BMVC 2023 + + +
+ Super-resolution (SR) networks have been investigated for a while, with their +mobile and lightweight versions gaining noticeable popularity recently. +Quantization, the procedure of decreasing the precision of network parameters +(mostly FP32 to INT8), is also utilized in SR networks for establishing mobile +compatibility. This study focuses on a very important but mostly overlooked +post-training quantization (PTQ) step: representative dataset (RD), which +adjusts the quantization range for PTQ. We propose a novel pipeline (clip-free +quantization pipeline, CFQP) backed up with extensive experimental +justifications to cleverly augment RD images by only using outputs of the FP32 +model. Using the proposed pipeline for RD, we can successfully eliminate +unwanted clipped activation layers, which nearly all mobile SR methods utilize +to make the model more robust to PTQ in return for a large overhead in runtime. +Removing clipped activations with our method significantly benefits overall +increased stability, decreased inference runtime up to 54% on some SR models, +better visual quality results compared to INT8 clipped models - and outperforms +even some FP32 non-quantized models, both in runtime and visual quality, +without the need for retraining with clipped activation. + +
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ☆ How Much Temporal Long-Term Context is Needed for Action Segmentation? ICCV 2023 + + +
+ Modeling long-term context in videos is crucial for many fine-grained tasks +including temporal action segmentation. An interesting question that is still +open is how much long-term temporal context is needed for optimal performance. +While transformers can model the long-term context of a video, this becomes +computationally prohibitive for long videos. Recent works on temporal action +segmentation thus combine temporal convolutional networks with self-attentions +that are computed only for a local temporal window. While these approaches show +good results, their performance is limited by their inability to capture the +full context of a video. In this work, we try to answer how much long-term +temporal context is required for temporal action segmentation by introducing a +transformer-based model that leverages sparse attention to capture the full +context of a video. We compare our model with the current state of the art on +three datasets for temporal action segmentation, namely 50Salads, Breakfast, +and Assembly101. Our experiments show that modeling the full context of a video +is necessary to obtain the best performance for temporal action segmentation. + +
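As a rough illustration of how sparse attention lets every frame reach the full video context, the sketch below builds a local-plus-strided attention mask and applies it to dense attention scores. It only demonstrates the masking pattern (the exact pattern in the paper may differ), and it is not an efficient implementation, since the dense score matrix is still materialised.

```python
import torch

def sparse_attention_mask(T, local=16, stride=64):
    """Boolean mask (True = attend): a local window plus strided global tokens."""
    idx = torch.arange(T)
    local_mask = (idx[None, :] - idx[:, None]).abs() < local
    global_mask = (idx[None, :] % stride) == 0
    return local_mask | global_mask

T, D = 1024, 64                          # number of frames and feature dim (hypothetical)
q = k = v = torch.randn(1, T, D)
mask = sparse_attention_mask(T)

scores = (q @ k.transpose(-2, -1)) / D ** 0.5
scores = scores.masked_fill(~mask, float("-inf"))
out = torch.softmax(scores, dim=-1) @ v  # each frame attends locally and to global anchors
print(out.shape, "attended fraction:", mask.float().mean().item())
```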
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Exemplar-Free Continual Transformer with Convolutions ICCV 2023 + + +
+ Continual Learning (CL) involves training a machine learning model in a +sequential manner to learn new information while retaining previously learned +tasks without the presence of previous training data. Although there has been +significant interest in CL, most recent CL approaches in computer vision have +focused on convolutional architectures only. However, with the recent success +of vision transformers, there is a need to explore their potential for CL. +Although there have been some recent CL approaches for vision transformers, +they either store training instances of previous tasks or require a task +identifier during test time, which can be limiting. This paper proposes a new +exemplar-free approach for class/task incremental learning called ConTraCon, +which does not require task-id to be explicitly present during inference and +avoids the need for storing previous training instances. The proposed approach +leverages the transformer architecture and involves re-weighting the key, +query, and value weights of the multi-head self-attention layers of a +transformer trained on a similar task. The re-weighting is done using +convolution, which enables the approach to maintain low parameter requirements +per task. Additionally, an image augmentation-based entropic task +identification approach is used to predict tasks without requiring task-ids +during inference. Experiments on four benchmark datasets demonstrate that the +proposed approach outperforms several competitive approaches while requiring +fewer parameters. + +
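A minimal sketch of the convolution-based re-weighting idea: a pretrained projection matrix is treated as a one-channel image and passed through a small per-task kernel, so each new task adds only a handful of parameters. Sizes and initialisation are placeholder assumptions, not the paper's configuration.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
d_model = 64
# Key-projection weights of a multi-head self-attention layer trained on an earlier task.
W_key_task0 = torch.randn(d_model, d_model)

# Per-task re-weighting: a small learnable convolution kernel transforms the old
# weight matrix into the new task's weight matrix (few parameters per task).
kernel_task1 = torch.nn.Parameter(0.1 * torch.randn(1, 1, 3, 3))
W_key_task1 = F.conv2d(W_key_task0[None, None], kernel_task1, padding=1)[0, 0]

print(W_key_task1.shape)                                   # same shape as the original weights
print("extra parameters for the new task:", kernel_task1.numel())
```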
+
+ comment: Accepted in ICCV 2023 +
+
+
+
+
+ + ☆ Semantic RGB-D Image Synthesis + + +
+ Collecting diverse sets of training images for RGB-D semantic image +segmentation is not always possible. In particular, when robots need to operate +in privacy-sensitive areas like homes, the collection is often limited to a +small set of locations. As a consequence, the annotated images lack diversity +in appearance and approaches for RGB-D semantic image segmentation tend to +overfit the training data. In this paper, we thus introduce semantic RGB-D +image synthesis to address this problem. It requires synthesising a +realistic-looking RGB-D image for a given semantic label map. Current +approaches, however, are uni-modal and cannot cope with multi-modal data. +Indeed, we show that extending uni-modal approaches to multi-modal data does +not perform well. In this paper, we therefore propose a generator for +multi-modal data that separates modal-independent information of the semantic +layout from the modal-dependent information that is needed to generate an RGB +and a depth image, respectively. Furthermore, we propose a discriminator that +ensures semantic consistency between the label maps and the generated images +and perceptual similarity between the real and generated images. Our +comprehensive experiments demonstrate that the proposed method outperforms +previous uni-modal methods by a large margin and that the accuracy of an +approach for RGB-D semantic segmentation can be significantly improved by +mixing real and generated images during training. + +
+
+
+
+
+ + ☆ Integration of Sentinel-1 and Sentinel-2 data for Earth surface + classification using Machine Learning algorithms implemented on Google Earth + Engine + + +
+ In this study, Synthetic Aperture Radar (SAR) and optical data are both considered for Earth surface classification. Specifically, the integration of Sentinel-1 (S-1) and Sentinel-2 (S-2) data is carried out through supervised Machine Learning (ML) algorithms implemented on the Google Earth Engine (GEE) platform for the classification of a particular region of interest. The achieved results demonstrate how, in this case, radar and optical remote sensing provide complementary information, benefiting surface cover classification and generally leading to increased mapping accuracy. In addition, this paper works in the direction of proving the emerging role of GEE as an effective cloud-based tool for handling large amounts of satellite data.
+
+ comment: 4 pages, 7 figures, IEEE InGARSS conference +
+
+
+
+
+ + ☆ GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive + Language-Image Pre-training ICCV2023 + + +
+ Cross-modal pre-training has shown impressive performance on a wide range of downstream tasks, benefiting from massive image-text pairs collected from the Internet. In practice, online data are growing constantly, highlighting the importance of the ability of a pre-trained model to learn from data that is continuously growing. Existing works on cross-modal pre-training mainly focus on training a network with a fixed architecture. However, it is impractical to limit the model capacity when considering the continuously growing nature of pre-training data in real-world applications. On the other hand, it is important to utilize the knowledge in the current model to obtain efficient training and better performance. To address the above issues, in this paper we propose GrowCLIP, a data-driven automatic model growing algorithm for contrastive language-image pre-training with continuous image-text pairs as input. Specifically, we adopt a dynamic growth space and seek out the optimal architecture at each growth step to adapt to online learning scenarios. A shared encoder is proposed in our growth space to enhance the degree of cross-modal fusion. Besides, we explore the effect of growth in different dimensions, which could provide future references for the design of cross-modal model architectures. Finally, we employ parameter inheriting with momentum (PIM) to maintain the previous knowledge and address the issue of the local minimum dilemma. Compared with the existing methods, GrowCLIP improves average top-1 accuracy by 2.3% on zero-shot image classification across 9 downstream tasks. As for zero-shot image retrieval, GrowCLIP improves top-1 image-to-text recall by 1.2% on the Flickr30K dataset.
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Object Detection Difficulty: Suppressing Over-aggregation for Faster and + Better Video Object Detection ACM MM2023 + + +
+ Current video object detection (VOD) models often encounter issues with +over-aggregation due to redundant aggregation strategies, which perform feature +aggregation on every frame. This results in suboptimal performance and +increased computational complexity. In this work, we propose an image-level +Object Detection Difficulty (ODD) metric to quantify the difficulty of +detecting objects in a given image. The derived ODD scores can be used in the +VOD process to mitigate over-aggregation. Specifically, we train an ODD +predictor as an auxiliary head of a still-image object detector to compute the +ODD score for each image based on the discrepancies between detection results +and ground-truth bounding boxes. The ODD score enhances the VOD system in two +ways: 1) it enables the VOD system to select superior global reference frames, +thereby improving overall accuracy; and 2) it serves as an indicator in the +newly designed ODD Scheduler to eliminate the aggregation of frames that are +easy to detect, thus accelerating the VOD process. Comprehensive experiments +demonstrate that, when utilized for selecting global reference frames, ODD-VOD +consistently enhances the accuracy of Global-frame-based VOD models. When +employed for acceleration, ODD-VOD consistently improves the frames per second +(FPS) by an average of 73.3% across 8 different VOD models without sacrificing +accuracy. When combined, ODD-VOD attains state-of-the-art performance when +competing with many VOD methods in both accuracy and speed. Our work represents +a significant advancement towards making VOD more practical for real-world +applications. + +
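The two uses of the difficulty score can be sketched with a few lines of plain Python: pick global reference frames from the per-frame scores and skip feature aggregation for frames flagged as easy. The selection rule, threshold, and counts below are illustrative guesses, not the paper's exact scheduler.

```python
import random

random.seed(0)
# Hypothetical per-frame Object Detection Difficulty scores from the auxiliary head.
odd_scores = [random.random() for _ in range(300)]

EASY_THRESHOLD = 0.3      # assumed value; frames below it skip feature aggregation
NUM_GLOBAL_REFS = 8       # assumed number of global reference frames

# 1) Global reference frames: here we prefer the easiest frames, assuming they give
#    clean reference features (a plausible rule, not necessarily the paper's).
global_refs = sorted(range(len(odd_scores)), key=lambda i: odd_scores[i])[:NUM_GLOBAL_REFS]

# 2) ODD Scheduler: only aggregate features for frames whose score marks them as hard.
frames_to_aggregate = [i for i, s in enumerate(odd_scores) if s >= EASY_THRESHOLD]

print("reference frames:", global_refs)
print(f"aggregating {len(frames_to_aggregate)}/{len(odd_scores)} frames")
```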
+
+ comment: 11 pages, 6 figures, accepted by ACM MM2023 +
+
+
+
+
+ + ☆ CiteTracker: Correlating Image and Text for Visual Tracking ICCV 2023 + + +
+ Existing visual tracking methods typically take an image patch as the +reference of the target to perform tracking. However, a single image patch +cannot provide a complete and precise concept of the target object as images +are limited in their ability to abstract and can be ambiguous, which makes it +difficult to track targets with drastic variations. In this paper, we propose +the CiteTracker to enhance target modeling and inference in visual tracking by +connecting images and text. Specifically, we develop a text generation module +to convert the target image patch into a descriptive text containing its class +and attribute information, providing a comprehensive reference point for the +target. In addition, a dynamic description module is designed to adapt to +target variations for more effective target representation. We then associate +the target description and the search image using an attention-based +correlation module to generate the correlated features for target state +reference. Extensive experiments on five diverse datasets are conducted to +evaluate the proposed algorithm and the favorable performance against the +state-of-the-art methods demonstrates the effectiveness of the proposed +tracking method. + +
+
+ comment: accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Using and Abusing Equivariance + + +
+ In this paper we show how Group Equivariant Convolutional Neural Networks use +subsampling to learn to break equivariance to their symmetries. We focus on 2D +rotations and reflections and investigate the impact of broken equivariance on +network performance. We show that a change in the input dimension of a network +as small as a single pixel can be enough for commonly used architectures to +become approximately equivariant, rather than exactly. We investigate the +impact of networks not being exactly equivariant and find that approximately +equivariant networks generalise significantly worse to unseen symmetries +compared to their exactly equivariant counterparts. However, when the +symmetries in the training data are not identical to the symmetries of the +network, we find that approximately equivariant networks are able to relax +their own equivariant constraints, causing them to match or outperform exactly +equivariant networks on common benchmark datasets. + +
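The effect of subsampling on equivariance can be checked directly by comparing "rotate then apply the network" against "apply the network then rotate". The toy network below uses only pointwise convolutions and average pooling, so it is exactly rotation-equivariant for even input sizes, and the single-pixel size change breaks that; this is a simplified stand-in for the G-CNN setting in the paper, not the authors' architecture.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
# Pointwise convs are rotation-equivariant; the stride-2 pooling is the subsampling step.
net = nn.Sequential(nn.Conv2d(1, 8, 1), nn.ReLU(), nn.AvgPool2d(2), nn.Conv2d(8, 8, 1))

def equivariance_gap(size):
    x = torch.randn(1, 1, size, size)
    rot_after = torch.rot90(net(x), 1, dims=(-2, -1))       # rot(f(x))
    rot_before = net(torch.rot90(x, 1, dims=(-2, -1)))      # f(rot(x))
    return (rot_after - rot_before).abs().mean().item()

with torch.no_grad():
    for size in (64, 65):    # a single-pixel change in input size
        print(size, f"mean |rot(f(x)) - f(rot(x))| = {equivariance_gap(size):.6f}")
```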
+
+
+
+
+ + ☆ Approaching human 3D shape perception with neurally mappable models + + +
+ Humans effortlessly infer the 3D shape of objects. What computations underlie +this ability? Although various computational models have been proposed, none of +them capture the human ability to match object shape across viewpoints. Here, +we ask whether and how this gap might be closed. We begin with a relatively +novel class of computational models, 3D neural fields, which encapsulate the +basic principles of classic analysis-by-synthesis in a deep neural network +(DNN). First, we find that a 3D Light Field Network (3D-LFN) supports 3D +matching judgments well aligned to humans for within-category comparisons, +adversarially-defined comparisons that accentuate the 3D failure cases of +standard DNN models, and adversarially-defined comparisons for algorithmically +generated shapes with no category structure. We then investigate the source of +the 3D-LFN's ability to achieve human-aligned performance through a series of +computational experiments. Exposure to multiple viewpoints of objects during +training and a multi-view learning objective are the primary factors behind +model-human alignment; even conventional DNN architectures come much closer to +human behavior when trained with multi-view objectives. Finally, we find that +while the models trained with multi-view learning objectives are able to +partially generalize to new object categories, they fall short of human +alignment. This work provides a foundation for understanding human shape +inferences within neurally mappable computational architectures and highlights +important questions for future work. + +
+
+
+
+
+ + ☆ BHSD: A 3D Multi-Class Brain Hemorrhage Segmentation Dataset + + +
+ Intracranial hemorrhage (ICH) is a pathological condition characterized by +bleeding inside the skull or brain, which can be attributed to various factors. +Identifying, localizing and quantifying ICH has important clinical +implications, in a bleed-dependent manner. While deep learning techniques are +widely used in medical image segmentation and have been applied to the ICH +segmentation task, existing public ICH datasets do not support the multi-class +segmentation problem. To address this, we develop the Brain Hemorrhage +Segmentation Dataset (BHSD), which provides a 3D multi-class ICH dataset +containing 192 volumes with pixel-level annotations and 2200 volumes with +slice-level annotations across five categories of ICH. To demonstrate the +utility of the dataset, we formulate a series of supervised and semi-supervised +ICH segmentation tasks. We provide experimental results with state-of-the-art +models as reference benchmarks for further model developments and evaluations +on this dataset. + +
+
+
+
+
+ + ☆ Improving Knot Prediction in Wood Logs with Longitudinal Feature + Propagation + + +
+ The quality of a wood log in the wood industry depends heavily on the presence of both outer and inner defects, including inner knots that result from the growth of tree branches. Today, locating the inner knots requires the use of expensive equipment such as X-ray scanners. In this paper, we address the task of predicting the location of inner defects from the outer shape of the logs. The dataset is built by extracting both the contours and the knots with X-ray measurements. We propose to solve this binary segmentation task by leveraging convolutional recurrent neural networks. Once the neural network is trained, inference can be performed from the outer shape measured with cheap devices such as laser profilers. We demonstrate the effectiveness of our approach on fir and spruce tree species and perform an ablation on the recurrence to demonstrate its importance.

+
+
+
+
+ + ☆ PCMC-T1: Free-breathing myocardial T1 mapping with + Physically-Constrained Motion Correction MICCAI 2023 + + +
+ T1 mapping is a quantitative magnetic resonance imaging (qMRI) technique that +has emerged as a valuable tool in the diagnosis of diffuse myocardial diseases. +However, prevailing approaches have relied heavily on breath-hold sequences to +eliminate respiratory motion artifacts. This limitation hinders accessibility +and effectiveness for patients who cannot tolerate breath-holding. Image +registration can be used to enable free-breathing T1 mapping. Yet, inherent +intensity differences between the different time points make the registration +task challenging. We introduce PCMC-T1, a physically-constrained deep-learning +model for motion correction in free-breathing T1 mapping. We incorporate the +signal decay model into the network architecture to encourage +physically-plausible deformations along the longitudinal relaxation axis. We +compared PCMC-T1 to baseline deep-learning-based image registration approaches +using a 5-fold experimental setup on a publicly available dataset of 210 +patients. PCMC-T1 demonstrated superior model fitting quality (R2: 0.955) and +achieved the highest clinical impact (clinical score: 3.93) compared to +baseline methods (0.941, 0.946 and 3.34, 3.62 respectively). Anatomical +alignment results were comparable (Dice score: 0.9835 vs. 0.984, 0.988). Our +code and trained models are available at https://github.com/eyalhana/PCMC-T1. + +
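The signal decay model referred to in the abstract can be illustrated with the standard three-parameter inversion-recovery fit, S(TI) = A - B*exp(-TI/T1*), followed by the Look-Locker correction T1 = T1*(B/A - 1) and the R² goodness of fit. This is a conventional voxel-wise fit for context only; how the model is embedded in the network architecture is specific to the paper, and the numbers below are synthetic.

```python
import numpy as np
from scipy.optimize import curve_fit

def signal_model(ti, a, b, t1_star):
    """Three-parameter inversion-recovery model commonly used for T1 mapping."""
    return a - b * np.exp(-ti / t1_star)

rng = np.random.default_rng(0)
ti = np.array([100., 180., 260., 900., 1000., 1800., 2600., 3400.])   # inversion times in ms
true_signal = signal_model(ti, a=1.0, b=2.0, t1_star=1100.0)
measured = true_signal + rng.normal(scale=0.02, size=ti.shape)         # noisy voxel signal

(a, b, t1_star), _ = curve_fit(signal_model, ti, measured, p0=(1.0, 2.0, 1000.0))
fitted = signal_model(ti, a, b, t1_star)
r2 = 1 - np.sum((measured - fitted) ** 2) / np.sum((measured - measured.mean()) ** 2)
t1 = t1_star * (b / a - 1)            # Look-Locker correction for the apparent T1
print(f"T1* = {t1_star:.0f} ms, corrected T1 = {t1:.0f} ms, R^2 = {r2:.3f}")
```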
+
+ comment: Accepted to MICCAI 2023 +
+
+
+
+
+ + ☆ CNN based Cuneiform Sign Detection Learned from Annotated 3D Renderings + and Mapped Photographs with Illumination Augmentation ICCV23 + + +
+ Motivated by the challenges of the Digital Ancient Near Eastern Studies (DANES) community, we develop digital tools for processing cuneiform script, a 3D script imprinted into clay tablets that was used for more than three millennia and for at least eight major languages. It consists of thousands of characters that have changed over time and space. Photographs are the most common representations usable for machine learning, while ink drawings are prone to interpretation; 3D datasets, which are becoming available, are best suited. We created and used the HeiCuBeDa and MaiCuBeDa datasets, which consist of around 500 annotated tablets. For our novel OCR-like approach to mixed image data, we provide an additional mapping tool for transferring annotations between 3D renderings and photographs. Our sign localization uses a RepPoints detector to predict the locations of characters as bounding boxes. We use image data from GigaMesh's MSII (curvature, see https://gigamesh.eu) based rendering, Phong-shaded 3D models, and photographs, as well as illumination augmentation. The results show that using rendered 3D images for sign detection performs better than other work on photographs. In addition, our approach gives reasonably good results for photographs only, while it is best used for mixed datasets. More importantly, the Phong renderings, and especially the MSII renderings, improve the results on photographs, which is the largest dataset on a global scale.
+
+ comment: This paper was accepted to ICCV23 and includes the DOI for an Open + Access Dataset with annotated cuneiform script +
+
+
+
+
+ + ☆ HMD-NeMo: Online 3D Avatar Motion Generation From Sparse Observations ICCV 2023 + + +
+ Generating both plausible and accurate full body avatar motion is the key to +the quality of immersive experiences in mixed reality scenarios. Head-Mounted +Devices (HMDs) typically only provide a few input signals, such as head and +hands 6-DoF. Recently, different approaches achieved impressive performance in +generating full body motion given only head and hands signal. However, to the +best of our knowledge, all existing approaches rely on full hand visibility. +While this is the case when, e.g., using motion controllers, a considerable +proportion of mixed reality experiences do not involve motion controllers and +instead rely on egocentric hand tracking. This introduces the challenge of +partial hand visibility owing to the restricted field of view of the HMD. In +this paper, we propose the first unified approach, HMD-NeMo, that addresses +plausible and accurate full body motion generation even when the hands may be +only partially visible. HMD-NeMo is a lightweight neural network that predicts +the full body motion in an online and real-time fashion. At the heart of +HMD-NeMo is the spatio-temporal encoder with novel temporally adaptable mask +tokens that encourage plausible motion in the absence of hand observations. We +perform extensive analysis of the impact of different components in HMD-NeMo +and introduce a new state-of-the-art on AMASS dataset through our evaluation. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Video BagNet: short temporal receptive fields increase robustness in + long-term action recognition + + +
+ Previous work on long-term video action recognition relies on deep +3D-convolutional models that have a large temporal receptive field (RF). We +argue that these models are not always the best choice for temporal modeling in +videos. A large temporal receptive field allows the model to encode the exact +sub-action order of a video, which causes a performance decrease when testing +videos have a different sub-action order. In this work, we investigate whether +we can improve the model robustness to the sub-action order by shrinking the +temporal receptive field of action recognition models. For this, we design +Video BagNet, a variant of the 3D ResNet-50 model with the temporal receptive +field size limited to 1, 9, 17 or 33 frames. We analyze Video BagNet on +synthetic and real-world video datasets and experimentally compare models with +varying temporal receptive fields. We find that short receptive fields are +robust to sub-action order changes, while larger temporal receptive fields are +sensitive to the sub-action order. + +
+
+
+
+
+ + ☆ Are current long-term video understanding datasets long-term? + + +
+ Many real-world applications, from sport analysis to surveillance, benefit +from automatic long-term action recognition. In the current deep learning +paradigm for automatic action recognition, it is imperative that models are +trained and tested on datasets and tasks that evaluate if such models actually +learn and reason over long-term information. In this work, we propose a method +to evaluate how suitable a video dataset is to evaluate models for long-term +action recognition. To this end, we define a long-term action as excluding all +the videos that can be correctly recognized using solely short-term +information. We test this definition on existing long-term classification tasks +on three popular real-world datasets, namely Breakfast, CrossTask and LVU, to +determine if these datasets are truly evaluating long-term recognition. Our +study reveals that these datasets can be effectively solved using shortcuts +based on short-term information. Following this finding, we encourage long-term +action recognition researchers to make use of datasets that need long-term +information to be solved. + +
+
+
+
+
+ + ☆ LOCATE: Self-supervised Object Discovery via Flow-guided Graph-cut and + Bootstrapped Self-training BMVC + + +
+ Learning object segmentation in image and video datasets without human +supervision is a challenging problem. Humans easily identify moving salient +objects in videos using the gestalt principle of common fate, which suggests +that what moves together belongs together. Building upon this idea, we propose +a self-supervised object discovery approach that leverages motion and +appearance information to produce high-quality object segmentation masks. +Specifically, we redesign the traditional graph cut on images to include motion +information in a linear combination with appearance information to produce edge +weights. Remarkably, this step produces object segmentation masks comparable to +the current state-of-the-art on multiple benchmarks. To further improve +performance, we bootstrap a segmentation network trained on these preliminary +masks as pseudo-ground truths to learn from its own outputs via self-training. +We demonstrate the effectiveness of our approach, named LOCATE, on multiple +standard video object segmentation, image saliency detection, and object +segmentation benchmarks, achieving results on par with and, in many cases +surpassing state-of-the-art methods. We also demonstrate the transferability of +our approach to novel domains through a qualitative study on in-the-wild +images. Additionally, we present extensive ablation analysis to support our +design choices and highlight the contribution of each component of our proposed +method. + +
+
+ comment: Accepted to the British Machine Vision Conference (BMVC) 2023 +
+
+
+
+
+ + ☆ Affordance segmentation of hand-occluded containers from exocentric + images ICCV + + +
+ Visual affordance segmentation identifies the surfaces of an object an agent +can interact with. Common challenges for the identification of affordances are +the variety of the geometry and physical properties of these surfaces as well +as occlusions. In this paper, we focus on occlusions of an object that is +hand-held by a person manipulating it. To address this challenge, we propose an +affordance segmentation model that uses auxiliary branches to process the +object and hand regions separately. The proposed model learns affordance +features under hand-occlusion by weighting the feature map through hand and +object segmentation. To train the model, we annotated the visual affordances of +an existing dataset with mixed-reality images of hand-held containers in +third-person (exocentric) images. Experiments on both real and mixed-reality +images show that our model achieves better affordance segmentation and +generalisation than existing models. + +
+
+ comment: Paper accepted to Workshop on Assistive Computer Vision and Robotics + (ACVR) in International Conference on Computer Vision (ICCV) 2023; 10 pages, + 4 figures, 2 tables. Data, code, and trained models are available at + https://apicis.github.io/projects/acanet.html +
+
+
+
+
+ + ☆ LDP-Feat: Image Features with Local Differential Privacy ICCV + + +
+ Modern computer vision services often require users to share raw feature +descriptors with an untrusted server. This presents an inherent privacy risk, +as raw descriptors may be used to recover the source images from which they +were extracted. To address this issue, researchers recently proposed +privatizing image features by embedding them within an affine subspace +containing the original feature as well as adversarial feature samples. In this +paper, we propose two novel inversion attacks to show that it is possible to +(approximately) recover the original image features from these embeddings, +allowing us to recover privacy-critical image content. In light of such +successes and the lack of theoretical privacy guarantees afforded by existing +visual privacy methods, we further propose the first method to privatize image +features via local differential privacy, which, unlike prior approaches, +provides a guaranteed bound for privacy leakage regardless of the strength of +the attacks. In addition, our method yields strong performance in visual +localization as a downstream task while enjoying the privacy guarantee. + +
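For background on the kind of guarantee local differential privacy provides, the sketch below applies the generic Laplace mechanism to a clipped feature descriptor: clipping bounds the sensitivity, and the noise scale sensitivity/epsilon yields an epsilon-LDP guarantee regardless of the attack. This is a textbook baseline for intuition, not the mechanism proposed in the paper.

```python
import numpy as np

rng = np.random.default_rng(0)

def privatize_feature(f, epsilon, clip=1.0):
    """Laplace mechanism on a feature vector (a generic LDP baseline).

    Clipping bounds the worst-case L1 change between any two inputs, and adding
    Laplace noise scaled by sensitivity / epsilon gives epsilon-LDP.
    """
    f = np.clip(f, -clip, clip)
    sensitivity = 2.0 * clip * f.size
    noise = rng.laplace(scale=sensitivity / epsilon, size=f.shape)
    return f + noise

descriptor = rng.normal(size=128).astype(np.float32)   # e.g. a local image feature
for eps in (0.5, 5.0, 50.0):
    noisy = privatize_feature(descriptor, eps)
    print(f"epsilon={eps:5.1f}  mean abs perturbation={np.abs(noisy - descriptor).mean():10.2f}")
```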
+
+ comment: 11 pages, 4 figures, to be published in International Conference on + Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ DiffCloth: Diffusion Based Garment Synthesis and Manipulation via + Structural Cross-modal Semantic Alignment ICCV2023 + + +
+ Cross-modal garment synthesis and manipulation will significantly benefit the way fashion designers generate garments and modify their designs via flexible linguistic interfaces. Current approaches follow the general text-to-image paradigm and mine cross-modal relations via simple cross-attention modules, neglecting the structural correspondence between visual and textual representations in the fashion design domain. In this work, we instead introduce DiffCloth, a diffusion-based pipeline for cross-modal garment synthesis and manipulation, which empowers diffusion models with flexible compositionality in the fashion domain by structurally aligning the cross-modal semantics. Specifically, we formulate the part-level cross-modal alignment as a bipartite matching problem between the linguistic Attribute-Phrases (AP) and the visual garment parts which are obtained via constituency parsing and semantic segmentation, respectively. To mitigate the issue of attribute confusion, we further propose a semantic-bundled cross-attention to preserve the spatial structure similarities between the attention maps of attribute adjectives and part nouns in each AP. Moreover, DiffCloth allows for manipulation of the generated results by simply replacing APs in the text prompts. The manipulation-irrelevant regions are recognized by blended masks obtained from the bundled attention maps of the APs and kept unchanged. Extensive experiments on the CM-Fashion benchmark demonstrate that DiffCloth both yields state-of-the-art garment synthesis results by leveraging the inherent structural information and supports flexible manipulation with region consistency.
+
+ comment: accepted by ICCV2023 +
+
+
+
+
+ + ☆ Masked Cross-image Encoding for Few-shot Segmentation + + +
+ Few-shot segmentation (FSS) is a dense prediction task that aims to infer the +pixel-wise labels of unseen classes using only a limited number of annotated +images. The key challenge in FSS is to classify the labels of query pixels +using class prototypes learned from the few labeled support exemplars. Prior +approaches to FSS have typically focused on learning class-wise descriptors +independently from support images, thereby ignoring the rich contextual +information and mutual dependencies among support-query features. To address +this limitation, we propose a joint learning method termed Masked Cross-Image +Encoding (MCE), which is designed to capture common visual properties that +describe object details and to learn bidirectional inter-image dependencies +that enhance feature interaction. MCE is more than a visual representation +enrichment module; it also considers cross-image mutual dependencies and +implicit guidance. Experiments on FSS benchmarks PASCAL-$5^i$ and COCO-$20^i$ +demonstrate the advanced meta-learning ability of the proposed method. + +
+
+ comment: conference +
+
+
+
+
+ + ☆ ConcatPlexer: Additional Dim1 Batching for Faster ViTs + + +
+ Transformers have demonstrated tremendous success not only in the natural language processing (NLP) domain but also in the field of computer vision, igniting various creative approaches and applications. Yet, the superior performance and modeling flexibility of transformers came with a severe increase in computation costs, and hence several works have proposed methods to reduce this burden. Inspired by a cost-cutting method originally proposed for language models, Data Multiplexing (DataMUX), we propose a novel approach for efficient visual recognition that employs additional dim1 batching (i.e., concatenation) to greatly improve throughput with little compromise in accuracy. We first introduce a naive adaptation of DataMUX for vision models, the Image Multiplexer, and devise novel components to overcome its weaknesses, rendering our final model, ConcatPlexer, at the sweet spot between inference speed and accuracy. ConcatPlexer was trained on the ImageNet1K and CIFAR100 datasets, achieving 23.5% fewer GFLOPs than ViT-B/16 with 69.5% and 83.4% validation accuracy, respectively.
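The basic "dim1 batching" idea can be sketched by concatenating the patch tokens of several images along the token dimension and running a single transformer forward pass, with one prediction head per multiplexed slot. The encoder, token counts, and heads below are placeholder stand-ins, not the ConcatPlexer architecture.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
D, TOKENS_PER_IMAGE, MULTIPLEX = 192, 196, 2      # hypothetical sizes

encoder = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=D, nhead=4, batch_first=True), num_layers=2)
heads = nn.ModuleList(nn.Linear(D, 100) for _ in range(MULTIPLEX))   # one head per slot

def tokenize(images):                   # stand-in for a patch-embedding stem
    return torch.randn(images.shape[0], TOKENS_PER_IMAGE, D)

img_a = torch.zeros(4, 3, 224, 224)
img_b = torch.zeros(4, 3, 224, 224)
tokens = torch.cat([tokenize(img_a), tokenize(img_b)], dim=1)   # concatenation along dim 1
features = encoder(tokens)                                      # one forward pass, two images

logits_a = heads[0](features[:, :TOKENS_PER_IMAGE].mean(dim=1))
logits_b = heads[1](features[:, TOKENS_PER_IMAGE:].mean(dim=1))
print(tokens.shape, logits_a.shape, logits_b.shape)
```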
+
+
+
+
+ + ☆ Novel-view Synthesis and Pose Estimation for Hand-Object Interaction + from Sparse Views + + +
+ Hand-object interaction understanding and the barely addressed novel view synthesis of such interactions are highly desired in immersive communication, yet they are challenging due to the high deformation of the hand and heavy occlusions between the hand and the object. In this paper, we propose a neural rendering and pose estimation system for hand-object interaction from sparse views, which can also enable 3D hand-object interaction editing. We share the inspiration from recent scene understanding work showing that a scene-specific model built beforehand can significantly improve and unblock vision tasks, especially when inputs are sparse, and extend it to the dynamic hand-object interaction scenario, proposing to solve the problem in two stages. We first learn the shape and appearance prior knowledge of hands and objects separately with neural representations at the offline stage. During the online stage, we design a rendering-based joint model fitting framework to understand the dynamic hand-object interaction with the pre-built hand and object models as well as interaction priors, which thereby overcomes penetration and separation issues between hand and object and also enables novel view synthesis. In order to obtain stable contact during the hand-object interaction process in a sequence, we propose a stable contact loss that keeps the contact region consistent. Experiments demonstrate that our method outperforms the state-of-the-art methods. Code and dataset are available on the project webpage https://iscas3dv.github.io/HO-NeRF.
+
+
+
+
+ + ☆ ViLLA: Fine-Grained Vision-Language Representation Learning from + Real-World Data ICCV 2023 + + +
+ Vision-language models (VLMs), such as CLIP and ALIGN, are generally trained +on datasets consisting of image-caption pairs obtained from the web. However, +real-world multimodal datasets, such as healthcare data, are significantly more +complex: each image (e.g. X-ray) is often paired with text (e.g. physician +report) that describes many distinct attributes occurring in fine-grained +regions of the image. We refer to these samples as exhibiting high pairwise +complexity, since each image-text pair can be decomposed into a large number of +region-attribute pairings. The extent to which VLMs can capture fine-grained +relationships between image regions and textual attributes when trained on such +data has not been previously evaluated. The first key contribution of this work +is to demonstrate through systematic evaluations that as the pairwise +complexity of the training dataset increases, standard VLMs struggle to learn +region-attribute relationships, exhibiting performance degradations of up to +37% on retrieval tasks. In order to address this issue, we introduce ViLLA as +our second key contribution. ViLLA, which is trained to capture fine-grained +region-attribute relationships from complex datasets, involves two components: +(a) a lightweight, self-supervised mapping model to decompose image-text +samples into region-attribute pairs, and (b) a contrastive VLM to learn +representations from generated region-attribute pairs. We demonstrate with +experiments across four domains (synthetic, product, medical, and natural +images) that ViLLA outperforms comparable VLMs on fine-grained reasoning tasks, +such as zero-shot object detection (up to 3.6 AP50 points on COCO and 0.6 mAP +points on LVIS) and retrieval (up to 14.2 R-Precision points). + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Knowledge-Aware Prompt Tuning for Generalizable Vision-Language Models ICCV 2023 + + +
+ Pre-trained vision-language models, e.g., CLIP, working with manually +designed prompts have demonstrated great capacity of transfer learning. +Recently, learnable prompts achieve state-of-the-art performance, which however +are prone to overfit to seen classes, failing to generalize to unseen classes. +In this paper, we propose a Knowledge-Aware Prompt Tuning (KAPT) framework for +vision-language models. Our approach takes inspiration from human intelligence +in which external knowledge is usually incorporated into recognizing novel +categories of objects. Specifically, we design two complementary types of +knowledge-aware prompts for the text encoder to leverage the distinctive +characteristics of category-related external knowledge. The discrete prompt +extracts the key information from descriptions of an object category, and the +learned continuous prompt captures overall contexts. We further design an +adaptation head for the visual encoder to aggregate salient attentive visual +cues, which establishes discriminative and task-aware visual representations. +We conduct extensive experiments on 11 widely-used benchmark datasets and the +results verify the effectiveness in few-shot image classification, especially +in generalizing to unseen categories. Compared with the state-of-the-art CoCoOp +method, KAPT exhibits favorable performance and achieves an absolute gain of +3.22% on new classes and 2.57% in terms of harmonic mean. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ MEGA: Multimodal Alignment Aggregation and Distillation For Cinematic + Video Segmentation ICCV 2023 + + +
+ Previous research has studied the task of segmenting cinematic videos into +scenes and into narrative acts. However, these studies have overlooked the +essential task of multimodal alignment and fusion for effectively and +efficiently processing long-form videos (>60min). In this paper, we introduce +Multimodal alignmEnt aGgregation and distillAtion (MEGA) for cinematic +long-video segmentation. MEGA tackles the challenge by leveraging multiple +media modalities. The method coarsely aligns inputs of variable lengths and +different modalities with alignment positional encoding. To maintain temporal +synchronization while reducing computation, we further introduce an enhanced +bottleneck fusion layer which uses temporal alignment. Additionally, MEGA +employs a novel contrastive loss to synchronize and transfer labels across +modalities, enabling act segmentation from labeled synopsis sentences on video +shots. Our experimental results show that MEGA outperforms state-of-the-art +methods on MovieNet dataset for scene segmentation (with an Average Precision +improvement of +1.19%) and on TRIPOD dataset for act segmentation (with a Total +Agreement improvement of +5.51%) + +
+
+ comment: ICCV 2023 accepted +
+
+
+
+
+ + ☆ ReFit: Recurrent Fitting Network for 3D Human Recovery ICCV 2023 + + +
+ We present Recurrent Fitting (ReFit), a neural network architecture for +single-image, parametric 3D human reconstruction. ReFit learns a +feedback-update loop that mirrors the strategy of solving an inverse problem +through optimization. At each iterative step, it reprojects keypoints from the +human model to feature maps to query feedback, and uses a recurrent-based +updater to adjust the model to fit the image better. Because ReFit encodes +strong knowledge of the inverse problem, it is faster to train than previous +regression models. At the same time, ReFit improves state-of-the-art +performance on standard benchmarks. Moreover, ReFit applies to other +optimization settings, such as multi-view fitting and single-view shape +fitting. Project website: https://yufu-wang.github.io/refit_humans/ + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ A three in one bottom-up framework for simultaneous semantic + segmentation, instance segmentation and classification of multi-organ nuclei + in digital cancer histology + + +
+ Simultaneous segmentation and classification of nuclei in digital histology play an essential role in computer-assisted cancer diagnosis; however, it remains challenging. The highest achieved binary and multi-class Panoptic Quality (PQ) remains as low as 0.68 bPQ and 0.49 mPQ, respectively. This is due to high staining variability, variability across the tissue, rough clinical conditions, overlapping nuclei, and nuclear class imbalance. Generic deep-learning methods usually rely on end-to-end models, which fail to address these problems specifically associated with digital histology. In our previous work, DAN-NucNet, we resolved these issues for semantic segmentation with an end-to-end model. This work extends our previous model to simultaneous instance segmentation and classification. We introduce additional decoder heads with independent weighted losses, which produce semantic segmentation, edge proposals, and classification maps. We use the outputs from the three-head model to apply post-processing that produces the final segmentation and classification. Our multi-stage approach utilizes edge proposals and semantic segmentations, in contrast to the direct segmentation and classification strategies followed by most state-of-the-art methods. Because of this, we demonstrate a significant performance improvement in producing high-quality instance segmentation and nuclei classification. We have achieved a 0.841 Dice score for semantic segmentation, 0.713 bPQ for instance segmentation, and 0.633 mPQ for nuclei classification. Our proposed framework is generalized across 19 types of tissues. Furthermore, the framework is less complex than the state-of-the-art.
+
+
+
+
+ + ☆ ViCo: Engaging Video Comment Generation with Human Preference Rewards + + +
+ Engaging video comments play an important role in video social media, as they +are the carrier of feelings, thoughts, or humor of the audience. Preliminary +works have made initial exploration for video comment generation by adopting +caption-style encoder-decoder models. However, comment generation presents some +unique challenges distinct from caption generation, which makes these methods +somewhat less effective at generating engaging comments. In contrast to the +objective and descriptive nature of captions, comments tend to be inherently +subjective, making it hard to quantify and evaluate the engagement of comments. +Furthermore, the scarcity of truly engaging comments brings difficulty to +collecting enough high-quality training examples. In this paper, we propose +ViCo with three novel designs to tackle the above challenges for generating +engaging Video Comments. Firstly, to quantify the engagement of comments, we +utilize the number of "likes" each comment receives as a proxy of human +preference after an appropriate debiasing procedure. Secondly, to automatically +evaluate the engagement of comments, we train a reward model to align its +judgment to the above proxy. Our user studies indicate that this reward model +effectively aligns with human judgments. Lastly, to alleviate the scarcity of +high-quality comments, an initial generator is trained on readily available but +noisy data to generate comments. Then the reward model is employed to offer +feedback on the generated comments, thus optimizing the initial generator. To +facilitate the research of video commenting, we collect a large video +comment-dataset (ViCo-20k) with rich metadata from a popular video website. +Experiments on ViCo-20k show that the comments generated by our ViCo model +exhibit the best performance in terms of both quantitative and qualitative +results, particularly when engagement is considered. + +
+
+
+
+
+ + ☆ Hierarchical Point-based Active Learning for Semi-supervised Point Cloud + Semantic Segmentation ICCV + + +
+ Impressive performance on point cloud semantic segmentation has been achieved +by fully-supervised methods with large amounts of labelled data. As it is +labour-intensive to acquire large-scale point cloud data with point-wise +labels, many attempts have been made to explore learning 3D point cloud +segmentation with limited annotations. Active learning is one of the effective +strategies to achieve this purpose but is still under-explored. The most recent +methods of this kind measure the uncertainty of each pre-divided region for +manual labelling but they suffer from redundant information and require +additional efforts for region division. This paper aims at addressing this +issue by developing a hierarchical point-based active learning strategy. +Specifically, we measure the uncertainty for each point by a hierarchical +minimum margin uncertainty module which considers the contextual information at +multiple levels. Then, a feature-distance suppression strategy is designed to +select important and representative points for manual labelling. Besides, to +better exploit the unlabelled data, we build a semi-supervised segmentation +framework based on our active strategy. Extensive experiments on the S3DIS and +ScanNetV2 datasets demonstrate that the proposed framework achieves 96.5% and +100% performance of fully-supervised baseline with only 0.07% and 0.1% training +data, respectively, outperforming the state-of-the-art weakly-supervised and +active learning methods. The code will be available at +https://github.com/SmiletoE/HPAL. + +
+
+ comment: International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Improving Misaligned Multi-modality Image Fusion with One-stage + Progressive Dense Registration + + +
+ Misalignments between multi-modality images pose challenges in image fusion, manifesting as structural distortions and edge ghosts. Existing efforts commonly resort to registering first and fusing later, typically employing two cascaded stages for registration, i.e., coarse registration and fine registration. Both stages directly estimate the respective target deformation fields. In this paper, we argue that the separated two-stage registration is not compact and that the direct estimation of the target deformation fields is not accurate enough. To address these challenges, we propose a Cross-modality Multi-scale Progressive Dense Registration (C-MPDR) scheme, which accomplishes the coarse-to-fine registration exclusively using a one-stage optimization, thus improving the fusion performance of misaligned multi-modality images. Specifically, two pivotal components are involved: a dense Deformation Field Fusion (DFF) module and a Progressive Feature Fine (PFF) module. The DFF aggregates the predicted multi-scale deformation sub-fields at the current scale, while the PFF progressively refines the remaining misaligned features. Both work together to accurately estimate the final deformation fields. In addition, we develop a Transformer-Conv-based Fusion (TCF) subnetwork that considers local and long-range feature dependencies, allowing us to capture more informative features from the registered infrared and visible images for the generation of high-quality fused images. Extensive experimental analysis demonstrates the superiority of the proposed method in the fusion of misaligned cross-modality images.
+
+
+
+
+ + ☆ Decoupled Contrastive Multi-view Clustering with High-order Random Walks + + +
+ Recently, some robust contrastive multi-view clustering (MvC) methods have been proposed, which construct data pairs from neighborhoods to alleviate the false negative issue, i.e., some intra-cluster samples being wrongly treated as negative pairs. Although promising performance has been achieved by these methods, the false negative issue is still far from addressed, and the false positive issue emerges because all in- and out-of-neighborhood samples are simply treated as positive and negative, respectively. To address these issues, we propose a novel robust method, dubbed decoupled contrastive multi-view clustering with high-order random walks (DIVIDE). In brief, DIVIDE leverages random walks to progressively identify data pairs in a global instead of local manner. As a result, DIVIDE can identify in-neighborhood negatives and out-of-neighborhood positives. Moreover, DIVIDE embraces a novel MvC architecture to perform inter- and intra-view contrastive learning in different embedding spaces, thus boosting clustering performance and improving robustness against missing views. To verify the efficacy of DIVIDE, we carry out extensive experiments on four benchmark datasets, comparing with nine state-of-the-art MvC methods in both complete and incomplete MvC settings.
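The high-order random-walk idea can be sketched by building a row-stochastic transition matrix from pairwise similarities and taking its t-th power: affinities then reflect multi-step connectivity rather than immediate neighborhoods, which is what allows out-of-neighborhood positives and in-neighborhood negatives to be identified. Temperature, walk length, and the toy data below are assumptions, not the paper's settings.

```python
import numpy as np

rng = np.random.default_rng(0)
# Toy embeddings: two clusters of 20 samples each in 16 dimensions.
x = np.concatenate([rng.normal(0, 1, (20, 16)), rng.normal(4, 1, (20, 16))])
x /= np.linalg.norm(x, axis=1, keepdims=True)

sim = x @ x.T                                    # cosine-similarity graph
np.fill_diagonal(sim, 0)
p = np.exp(sim / 0.1)
p /= p.sum(axis=1, keepdims=True)                # row-stochastic transition matrix

t = 4                                            # walk length (hypothetical)
walk = np.linalg.matrix_power(p, t)              # high-order, global affinities

anchor = 0
positives = np.argsort(-walk[anchor])[:5]        # likely same cluster, even if far locally
negatives = np.argsort(walk[anchor])[:5]         # likely other cluster, even if nearby
print("positives:", positives, "\nnegatives:", negatives)
```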
+
+
+
+
+ + ☆ A Preliminary Investigation into Search and Matching for Tumour + Discrimination in WHO Breast Taxonomy Using Deep Networks + + +
+ Breast cancer is one of the most common cancers affecting women worldwide. It comprises a group of malignant neoplasms with a variety of biological, clinical, and histopathological characteristics. There are more than 35 different histological forms of breast lesions that can be classified and diagnosed histologically according to cell morphology, growth, and architecture patterns. Recently, deep learning, in the field of artificial intelligence, has drawn a lot of attention for the computerized representation of medical images. Searchable digital atlases can provide pathologists with patch-matching tools that allow them to search among evidently diagnosed and treated archival cases, a technology that may be regarded as a computational second opinion. In this study, we indexed and analyzed the WHO breast taxonomy (Classification of Tumours, 5th Ed.) spanning 35 tumour types. We visualized all tumour types using deep features extracted from a state-of-the-art deep learning model pre-trained on millions of diagnostic histopathology images from the TCGA repository. Furthermore, we test the concept of a digital "atlas" as a reference for search and matching with rare test cases. The patch similarity search within the WHO breast taxonomy data reached over 88% accuracy when validating through a "majority vote" and more than 91% accuracy when validating using the top-n tumour types. These results show for the first time that complex relationships among common and rare breast lesions can be investigated using an indexed digital archive.
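The search-and-vote step described above reduces to nearest-neighbour retrieval over pre-extracted deep features followed by a majority vote. The sketch below uses random placeholder features and labels in place of the actual atlas and feature extractor, so it only illustrates the retrieval-and-voting mechanics.

```python
from collections import Counter

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
# Placeholder deep features for indexed atlas patches and their tumour-type labels.
atlas_features = rng.normal(size=(1000, 512))
atlas_labels = rng.integers(0, 35, size=1000)            # 35 tumour types in the taxonomy

index = NearestNeighbors(n_neighbors=10, metric="cosine").fit(atlas_features)

def diagnose(query_feature):
    """Retrieve similar archival patches and vote on the tumour type."""
    _, idx = index.kneighbors(query_feature[None, :])
    votes = Counter(atlas_labels[idx[0]].tolist())
    return votes.most_common(1)[0][0], votes

query = rng.normal(size=512)
label, votes = diagnose(query)
print("majority-vote tumour type:", label, "votes:", dict(votes))
```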
+
+
+
+
+ + ☆ SwinV2DNet: Pyramid and Self-Supervision Compounded Feature Learning for + Remote Sensing Images Change Detection + + +
+ Among current mainstream change detection networks, the transformer is deficient in the ability to capture accurate low-level details, while the convolutional neural network (CNN) lacks the capacity to understand global information and establish remote spatial relationships. Meanwhile, both of the widely used early fusion and late fusion frameworks are unable to learn complete change features well. Therefore, based on Swin Transformer V2 (Swin V2) and VGG16, we propose an end-to-end compounded dense network, SwinV2DNet, to inherit the advantages of both transformer and CNN and overcome the shortcomings of existing networks in feature learning. Firstly, it captures the change relationship features through the densely connected Swin V2 backbone and provides the low-level pre-changed and post-changed features through a CNN branch. Based on these three change features, we accomplish accurate change detection results. Secondly, combining transformer and CNN, we propose a mixed feature pyramid (MFP) which provides inter-layer interaction information and intra-layer multi-scale information for complete feature learning. MFP is a plug-and-play module which is experimentally proven to be effective in other change detection networks as well. Furthermore, we impose a self-supervision strategy to guide a new CNN branch, which solves the untrainable problem of the CNN branch and provides semantic change information for the features of the encoder. State-of-the-art (SOTA) change detection scores and fine-grained change maps were obtained compared with other advanced methods on four commonly used public remote sensing datasets. The code is available at https://github.com/DalongZ/SwinV2DNet.
+
+
+
+
+ + ☆ Domain Generalization via Rationale Invariance ICCV 2023 + + +
+ This paper offers a new perspective to ease the challenge of domain +generalization, which involves maintaining robust results even in unseen +environments. Our design focuses on the decision-making process in the final +classifier layer. Specifically, we propose treating the element-wise +contributions to the final results as the rationale for making a decision and +representing the rationale for each sample as a matrix. For a well-generalized +model, we suggest the rationale matrices for samples belonging to the same +category should be similar, indicating the model relies on domain-invariant +clues to make decisions, thereby ensuring robust results. To implement this +idea, we introduce a rationale invariance loss as a simple regularization +technique, requiring only a few lines of code. Our experiments demonstrate that +the proposed approach achieves competitive results across various datasets, +despite its simplicity. Code is available at +\url{https://github.com/liangchen527/RIDG}. + +
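The rationale matrix and the invariance penalty can be sketched directly: the element-wise products of a sample's features with the classifier weights are its per-class contribution matrix, and the loss pulls each sample's matrix towards its class mean. The batch-mean reference below is a simplification (a running class mean is a more likely implementation detail), and all sizes are illustrative.

```python
import torch

torch.manual_seed(0)
B, D, C = 16, 32, 5                       # batch size, feature dim, classes (hypothetical)
features = torch.randn(B, D)
labels = torch.randint(0, C, (B,))
classifier = torch.nn.Linear(D, C, bias=False)

# Rationale matrix: element-wise contributions of each feature to each class logit.
# Shape (B, C, D); summing over the last dim recovers the logits.
rationale = classifier.weight.unsqueeze(0) * features.unsqueeze(1)
assert torch.allclose(rationale.sum(-1), classifier(features), atol=1e-5)

# Rationale invariance loss: each sample's rationale should match its class mean.
loss = torch.zeros(())
for c in range(C):
    members = rationale[labels == c]
    if len(members) > 1:
        loss = loss + ((members - members.mean(dim=0)) ** 2).mean()
print("rationale invariance loss:", float(loss))
```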
+
+ comment: Accepted in ICCV 2023 +
+
+
+
+
+ + ☆ TOPIC: A Parallel Association Paradigm for Multi-Object Tracking under + Complex Motions and Diverse Scenes + + +
+ Video data and algorithms have been driving advances in multi-object tracking +(MOT). While existing MOT datasets focus on occlusion and appearance +similarity, complex motion patterns are widespread yet overlooked. To address +this issue, we introduce a new dataset called BEE23 to highlight complex +motions. Identity association algorithms have long been the focus of MOT +research. Existing trackers can be categorized into two association paradigms: +single-feature paradigm (based on either motion or appearance feature) and +serial paradigm (one feature serves as secondary while the other is primary). +However, these paradigms are incapable of fully utilizing different features. +In this paper, we propose a parallel paradigm and present the Two rOund +Parallel matchIng meChanism (TOPIC) to implement it. The TOPIC leverages both +motion and appearance features and can adaptively select the preferable one as +the assignment metric based on motion level. Moreover, we provide an +Attention-based Appearance Reconstruct Module (AARM) to reconstruct appearance +feature embeddings, thus enhancing the representation of appearance features. +Comprehensive experiments show that our approach achieves state-of-the-art +performance on four public datasets and BEE23. Notably, our proposed parallel +paradigm surpasses the performance of existing association paradigms by a large +margin, e.g., reducing false negatives by 12% to 51% compared to the +single-feature association paradigm. The introduced dataset and association +paradigm in this work offers a fresh perspective for advancing the MOT field. +The source code and dataset are available at +https://github.com/holmescao/TOPICTrack. + +
+
+
+
+
+ + ☆ Exploring Unsupervised Cell Recognition with Prior Self-activation Maps MICCAI 2023 + + +
+ The success of supervised deep learning models on cell recognition tasks +relies on detailed annotations. Many previous works have managed to reduce the +dependency on labels. However, considering the large number of cells contained +in a patch, costly and inefficient labeling is still inevitable. To this end, +we explored label-free methods for cell recognition. Prior self-activation maps +(PSM) are proposed to generate pseudo masks as training targets. To be +specific, an activation network is trained with self-supervised learning. The +gradient information in the shallow layers of the network is aggregated to +generate prior self-activation maps. Afterward, a semantic clustering module is +then introduced as a pipeline to transform PSMs to pixel-level semantic pseudo +masks for downstream tasks. We evaluated our method on two histological +datasets: MoNuSeg (cell segmentation) and BCData (multi-class cell detection). +Compared with other fully-supervised and weakly-supervised methods, our method +can achieve competitive performance without any manual annotations. Our simple +but effective framework can also achieve multi-class cell detection which can +not be done by existing unsupervised methods. The results show the potential of +PSMs that might inspire other research to deal with the hunger for labels in +medical area. + +
+
+ comment: MICCAI 2023. arXiv admin note: substantial text overlap with + arXiv:2210.07862 +
+
+
+
+
+ + ☆ High Dynamic Range Imaging of Dynamic Scenes with Saturation + Compensation but without Explicit Motion Compensation WACV 2022 + + +
+ High dynamic range (HDR) imaging is a highly challenging task since a large +amount of information is lost due to the limitations of camera sensors. For HDR +imaging, some methods capture multiple low dynamic range (LDR) images with +altering exposures to aggregate more information. However, these approaches +introduce ghosting artifacts when significant inter-frame motions are present. +Moreover, although multi-exposure images are given, we have little information +in severely over-exposed areas. Most existing methods focus on motion +compensation, i.e., alignment of multiple LDR shots to reduce the ghosting +artifacts, but they still produce unsatisfying results. These methods also +rather overlook the need to restore the saturated areas. In this paper, we +generate well-aligned multi-exposure features by reformulating a motion +alignment problem into a simple brightness adjustment problem. In addition, we +propose a coarse-to-fine merging strategy with explicit saturation +compensation. The saturated areas are reconstructed with similar well-exposed +content using adaptive contextual attention. We demonstrate that our method +outperforms the state-of-the-art methods regarding qualitative and quantitative +evaluations. + +
+
+ comment: WACV 2022 +
+
+
+
+
+ + ☆ Efficient View Synthesis with Neural Radiance Distribution Field ICCV2023 + + +
+ Recent work on Neural Radiance Fields (NeRF) has demonstrated significant +advances in high-quality view synthesis. A major limitation of NeRF is its low +rendering efficiency due to the need for multiple network forwardings to render +a single pixel. Existing methods to improve NeRF either reduce the number of +required samples or optimize the implementation to accelerate the network +forwarding. Despite these efforts, the problem of multiple sampling persists +due to the intrinsic representation of radiance fields. In contrast, Neural +Light Fields (NeLF) reduce the computation cost of NeRF by querying only one +single network forwarding per pixel. To achieve a close visual quality to NeRF, +existing NeLF methods require significantly larger network capacities which +limits their rendering efficiency in practice. In this work, we propose a new +representation called Neural Radiance Distribution Field (NeRDF) that targets +efficient view synthesis in real-time. Specifically, we use a small network +similar to NeRF while preserving the rendering speed with a single network +forwarding per pixel as in NeLF. The key is to model the radiance distribution +along each ray with frequency basis and predict frequency weights using the +network. Pixel values are then computed via volume rendering on radiance +distributions. Experiments show that our proposed method offers a better +trade-off among speed, quality, and network size than existing methods: we +achieve a ~254x speed-up over NeRF with similar network size, with only a +marginal performance decline. Our project page is at +yushuang-wu.github.io/NeRDF. + +
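+ A toy numpy illustration of compositing one pixel from predicted frequency weights (the cosine basis, activations, and weight values below are assumptions for exposition, not the NeRDF architecture):
+
+ # One ray: reconstruct density and color along depth from frequency weights,
+ # then apply standard volume rendering to get the pixel color.
+ import numpy as np
+
+ def render_pixel(freq_w_sigma, freq_w_rgb, num_samples=64, near=0.0, far=1.0):
+     t = np.linspace(near, far, num_samples)                  # sample depths along the ray
+     K = len(freq_w_sigma)
+     basis = np.cos(np.outer(t, np.arange(K)) * np.pi)        # (num_samples, K) frequency basis
+     sigma = np.log1p(np.exp(basis @ freq_w_sigma))           # softplus -> non-negative density
+     rgb = 1.0 / (1.0 + np.exp(-(basis @ freq_w_rgb)))        # (num_samples, 3) colors in [0, 1]
+     delta = np.diff(t, append=far + (far - near) / num_samples)
+     alpha = 1.0 - np.exp(-sigma * delta)
+     trans = np.cumprod(np.concatenate([[1.0], 1.0 - alpha[:-1]]))
+     weights = alpha * trans                                  # volume-rendering weights
+     return (weights[:, None] * rgb).sum(axis=0)              # composited pixel color
+
+ rng = np.random.default_rng(0)
+ print(render_pixel(rng.normal(size=8), rng.normal(size=(8, 3))))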
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Hey That's Mine Imperceptible Watermarks are Preserved in Diffusion + Generated Outputs + + +
+ Generative models have seen an explosion in popularity with the release of +huge generative Diffusion models like Midjourney and Stable Diffusion to the +public. Because of this new ease of access, questions surrounding the automated +collection of data and issues regarding content ownership have started to +build. In this paper we present new work which aims to provide ways of +protecting content when shared to the public. We show that a generative +Diffusion model trained on data that has been imperceptibly watermarked will +generate new images with these watermarks present. We further show that if a +given watermark is correlated with a certain feature of the training data, the +generated images will also have this correlation. Using statistical tests we +show that we are able to determine whether a model has been trained on marked +data, and what data was marked. As a result our system offers a solution to +protect intellectual property when sharing content online. + +
+
+
+
+
+ + ☆ Random Word Data Augmentation with CLIP for Zero-Shot Anomaly Detection BMVC2023 + + +
+ This paper presents a novel method that leverages a visual-language model, +CLIP, as a data source for zero-shot anomaly detection. Tremendous efforts have +been put towards developing anomaly detectors due to their potential industrial +applications. Considering the difficulty in acquiring various anomalous samples +for training, most existing methods train models with only normal samples and +measure discrepancies from the distribution of normal samples during inference, +which requires training a model for each object category. The problem of this +inefficient training requirement has been tackled by designing a CLIP-based +anomaly detector that applies prompt-guided classification to each part of an +image in a sliding window manner. However, the method still suffers from the +labor of careful prompt ensembling with known object categories. To overcome +the issues above, we propose leveraging CLIP as a data source for training. Our +method generates text embeddings with the text encoder in CLIP with typical +prompts that include words of normal and anomaly. In addition to these words, +we insert several randomly generated words into prompts, which enables the +encoder to generate a diverse set of normal and anomalous samples. Using the +generated embeddings as training data, a feed-forward neural network learns to +extract features of normal and anomaly from CLIP's embeddings, and as a result, +a category-agnostic anomaly detector can be obtained without any training +images. Experimental results demonstrate that our method achieves +state-of-the-art performance without laborious prompt ensembling in zero-shot +setups. + +
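+ The random-word prompt construction can be pictured with a small sketch (templates and vocabulary here are made up; in the paper such prompts are passed through CLIP's text encoder to synthesize normal and anomalous training embeddings):
+
+ # Build prompts that contain the state word plus randomly generated words.
+ import random
+ import string
+
+ def random_word(length=6):
+     return "".join(random.choices(string.ascii_lowercase, k=length))
+
+ def build_prompts(n, anomalous=False):
+     state = "anomaly" if anomalous else "normal"
+     return [f"a photo of a {state} {random_word()} {random_word()} object" for _ in range(n)]
+
+ normal_prompts = build_prompts(4, anomalous=False)
+ anomaly_prompts = build_prompts(4, anomalous=True)
+ print(normal_prompts[0], "|", anomaly_prompts[0])
+ # Each prompt would then be tokenized and encoded with CLIP's text encoder, and
+ # the resulting embeddings used to train a small feed-forward anomaly classifier.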
+
+ comment: Accepted to BMVC2023 +
+
+
+
+
+ + ☆ LAN-HDR: Luminance-based Alignment Network for High Dynamic Range Video + Reconstruction ICCV 2023 + + +
+ As demands for high-quality videos continue to rise, high-resolution and +high-dynamic range (HDR) imaging techniques are drawing attention. To generate +an HDR video from low dynamic range (LDR) images, one of the critical steps is +the motion compensation between LDR frames, for which most existing works +employed the optical flow algorithm. However, these methods suffer from flow +estimation errors when saturation or complicated motions exist. In this paper, +we propose an end-to-end HDR video composition framework, which aligns LDR +frames in the feature space and then merges aligned features into an HDR frame, +without relying on pixel-domain optical flow. Specifically, we propose a +luminance-based alignment network for HDR (LAN-HDR) consisting of an alignment +module and a hallucination module. The alignment module aligns a frame to the +adjacent reference by evaluating luminance-based attention, excluding color +information. The hallucination module generates sharp details, especially for +washed-out areas due to saturation. The aligned and hallucinated features are +then blended adaptively to complement each other. Finally, we merge the +features to generate a final HDR frame. In training, we adopt a temporal loss, +in addition to frame reconstruction losses, to enhance temporal consistency and +thus reduce flickering. Extensive experiments demonstrate that our method +performs better or comparable to state-of-the-art methods on several +benchmarks. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Development of a Novel Quantum Pre-processing Filter to Improve Image + Classification Accuracy of Neural Network Models + + +
+ This paper proposes a novel quantum pre-processing filter (QPF) to improve +the image classification accuracy of neural network (NN) models. A simple four-qubit +quantum circuit that uses Y rotation gates for encoding and two +controlled NOT gates for creating correlation among the qubits is applied as a +feature extraction filter prior to passing data into the fully connected NN +architecture. By applying the QPF approach, the results show that the image +classification accuracy based on the MNIST (10 handwritten digits) and the +EMNIST (47 classes of handwritten digits and letters) datasets can be improved, from +92.5% to 95.4% and from 68.9% to 75.9%, respectively. These improvements were +obtained without introducing extra model parameters or optimizations in the +machine learning process. However, tests performed on the developed QPF +approach against the relatively complex GTSRB dataset with 43 distinct classes of +real-life traffic sign images showed a degradation in the classification +accuracy. Considering this result, further research into the understanding and +the design of a more suitable quantum circuit approach for image classification +neural networks could be explored utilizing the baseline method proposed in +this paper. + 
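+ A hypothetical Qiskit reconstruction of such a four-qubit filter, with RY angle encoding followed by two CNOTs (the exact gate placement and angle scaling are assumptions, since the abstract does not specify them):
+
+ # Encode four pixel values with RY rotations, then entangle pairs with CNOTs.
+ import numpy as np
+ from qiskit import QuantumCircuit
+
+ def quantum_preprocessing_filter(pixels):
+     """pixels: four values in [0, 1] taken from a local image patch."""
+     qc = QuantumCircuit(4)
+     for i, p in enumerate(pixels):
+         qc.ry(float(p) * np.pi, i)   # angle-encode each pixel with an RY gate
+     qc.cx(0, 1)                      # two CNOTs create correlation among qubits
+     qc.cx(2, 3)
+     return qc
+
+ print(quantum_preprocessing_filter([0.1, 0.5, 0.7, 0.9]))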
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ CAME: Contrastive Automated Model Evaluation ICCV2023 + + +
+ The Automated Model Evaluation (AutoEval) framework entertains the +possibility of evaluating a trained machine learning model without resorting to +a labeled testing set. Despite the promise and some decent results, the +existing AutoEval methods heavily rely on computing distribution shifts between +the unlabelled testing set and the training set. We believe this reliance on +the training set becomes another obstacle in shipping this technology to +real-world ML development. In this work, we propose Contrastive Automatic Model +Evaluation (CAME), a novel AutoEval framework that removes the training +set from the loop. The core idea of CAME is based on a theoretical analysis that +links the model performance with a contrastive loss. Further, with extensive +empirical validation, we manage to set up a predictable relationship between +the two, simply by reasoning over the unlabeled/unseen testing set. The resulting +framework CAME establishes new SOTA results for AutoEval by surpassing prior +work significantly. + 
+
+ comment: ICCV2023 main conference +
+
+
+
+
+ + ☆ Classification of the lunar surface pattern by AI architectures: Does AI + see a rabbit in the Moon? + + +
+ In Asian countries, there is a tradition that a rabbit (the Moon rabbit) +lives on the Moon. As the origin of this tradition, usually, two reasons are +mentioned. One reason is that the color pattern of the lunar surface is similar +to the shape of a rabbit. The other reason is that both the Moon and rabbit are +symbols of fertility because the Moon appears and disappears (i.e., waxing and +waning) cyclically, and rabbits bear children frequently. Considering the +latter reason, is the lunar surface color pattern not similar to a rabbit? +Here, the similarity between a rabbit and the lunar surface pattern was evaluated +using seven AI architectures. In the test by CLIP, assuming that people look at +the Moon in the early evening frequently, the lunar surface is more similar to +a rabbit than a face in low-latitude regions, while it can be classified as a +face as latitude increases, which is consistent with the facts that the oldest literature +about the Moon rabbit was written in India and that there is a tradition of +seeing a human face in the Moon in Europe. Tested with ImageNet weights, ConvNeXt and +CLIP sometimes classified the lunar surface pattern as a rabbit with relatively +high probabilities. Cultures are generated by our attitude to the environment. +Both dynamic and static similarities may be required to induce our imagination. + 
+
+ comment: 15 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Recursive Video Lane Detection ICCV 2023 + + +
+ A novel algorithm to detect road lanes in videos, called recursive video lane +detector (RVLD), is proposed in this paper, which propagates the state of a +current frame recursively to the next frame. RVLD consists of an intra-frame +lane detector (ILD) and a predictive lane detector (PLD). First, we design ILD +to localize lanes in a still frame. Second, we develop PLD to exploit the +information of the previous frame for lane detection in a current frame. To +this end, we estimate a motion field and warp the previous output to the +current frame. Using the warped information, we refine the feature map of the +current frame to detect lanes more reliably. Experimental results show that +RVLD outperforms existing detectors on video lane datasets. Our codes are +available at https://github.com/dongkwonjin/RVLD. + +
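+ The warping of the previous frame's information with an estimated motion field can be sketched in a few lines of PyTorch (tensor shapes and the zero flow below are placeholders, not the RVLD implementation):
+
+ # Warp previous-frame features to the current frame with a dense flow field.
+ import torch
+ import torch.nn.functional as F
+
+ def warp(prev_feat, flow):
+     """prev_feat: (N, C, H, W); flow: (N, 2, H, W) in pixels (dx, dy)."""
+     n, _, h, w = prev_feat.shape
+     ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
+     grid = torch.stack((xs, ys), dim=0).float().unsqueeze(0) + flow
+     # Normalize sampling coordinates to [-1, 1] as required by grid_sample.
+     grid_x = 2.0 * grid[:, 0] / max(w - 1, 1) - 1.0
+     grid_y = 2.0 * grid[:, 1] / max(h - 1, 1) - 1.0
+     grid = torch.stack((grid_x, grid_y), dim=-1)              # (N, H, W, 2)
+     return F.grid_sample(prev_feat, grid, align_corners=True)
+
+ prev = torch.randn(1, 8, 16, 32)
+ flow = torch.zeros(1, 2, 16, 32)        # zero flow -> identity warp
+ print(torch.allclose(warp(prev, flow), prev, atol=1e-5))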
+
+ comment: ICCV 2023 accepted +
+
+
+
+
+ + ☆ MosaiQ: Quantum Generative Adversarial Networks for Image Generation on + NISQ Computers ICCV'23 + + +
+ Quantum machine learning and vision have come to the fore recently, with +hardware advances enabling rapid advancement in the capabilities of quantum +machines. Recently, quantum image generation has been explored with many +potential advantages over non-quantum techniques; however, previous techniques +have suffered from poor quality and robustness. To address these problems, we +introduce MosaiQ, a high-quality quantum image generation GAN framework that +can be executed on today's Noisy Intermediate-Scale Quantum (NISQ) +computers. + 
+
+ comment: Accepted to appear at ICCV'23 +
+
+
+
+
+ + ☆ Video OWL-ViT: Temporally-consistent open-world localization in video ICCV 2023 + + +
+ We present an architecture and a training recipe that adapts pre-trained +open-world image models to localization in videos. Understanding the open +visual world (without being constrained by fixed label spaces) is crucial for +many real-world vision tasks. Contrastive pre-training on large image-text +datasets has recently led to significant improvements for image-level tasks. +For more structured tasks involving object localization applying pre-trained +models is more challenging. This is particularly true for video tasks, where +task-specific data is limited. We show successful transfer of open-world models +by building on the OWL-ViT open-vocabulary detection model and adapting it to +video by adding a transformer decoder. The decoder propagates object +representations recurrently through time by using the output tokens for one +frame as the object queries for the next. Our model is end-to-end trainable on +video data and enjoys improved temporal consistency compared to +tracking-by-detection baselines, while retaining the open-world capabilities of +the backbone detector. We evaluate our model on the challenging TAO-OW +benchmark and demonstrate that open-world capabilities, learned from +large-scale image-text pre-training, can be transferred successfully to +open-world localization across diverse videos. + +
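+ A toy sketch of the recurrent query propagation described above, using a generic transformer decoder (dimensions, token counts, and the random per-frame features are stand-ins, not the OWL-ViT components):
+
+ # Output tokens of one frame become the object queries for the next frame.
+ import torch
+ import torch.nn as nn
+
+ d_model, num_queries, num_frames = 64, 5, 3
+ layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=4, batch_first=True)
+ decoder = nn.TransformerDecoder(layer, num_layers=2)
+
+ queries = torch.zeros(1, num_queries, d_model)          # initial object queries
+ frame_feats = torch.randn(num_frames, 1, 100, d_model)  # per-frame encoder tokens
+ outputs = []
+ for t in range(num_frames):
+     out = decoder(tgt=queries, memory=frame_feats[t])    # decode the current frame
+     outputs.append(out)
+     queries = out                                        # propagate tokens through time
+ print(len(outputs), outputs[-1].shape)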
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Addressing Fairness and Explainability in Image Classification Using + Optimal Transport + + +
+ Algorithmic Fairness and the explainability of potentially unfair outcomes +are crucial for establishing trust and accountability of Artificial +Intelligence systems in domains such as healthcare and policing. Though +significant advances have been made in each of the fields separately, achieving +explainability in fairness applications remains challenging, particularly so in +domains where deep neural networks are used. At the same time, ethical +data-mining has become ever more relevant, as it has been shown countless times +that fairness-unaware algorithms result in biased outcomes. Current approaches +focus on mitigating biases in the outcomes of the model, but few attempts have +been made to try to explain \emph{why} a model is biased. To bridge this gap, +we propose a comprehensive approach that leverages optimal transport theory to +uncover the causes and implications of biased regions in images, which easily +extends to tabular data as well. Through the use of Wasserstein barycenters, we +obtain scores that are independent of a sensitive variable but keep their +marginal orderings. This step ensures predictive accuracy but also helps us to +recover the regions most associated with the generation of the biases. Our +findings hold significant implications for the development of trustworthy and +unbiased AI systems, fostering transparency, accountability, and fairness in +critical decision-making scenarios across diverse domains. + +
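+ In one dimension, the Wasserstein-barycenter repair of scores can be sketched as mapping each score to the average of the per-group quantile functions evaluated at its within-group rank, which removes group information while preserving orderings inside each group (synthetic data below; the paper's setting is multivariate and image-based):
+
+ import numpy as np
+
+ def barycenter_repair(scores, groups):
+     scores, groups = np.asarray(scores, float), np.asarray(groups)
+     uniq = np.unique(groups)
+     repaired = np.empty_like(scores)
+     for g in uniq:
+         idx = np.where(groups == g)[0]
+         s = scores[idx]
+         # Empirical within-group ranks in (0, 1).
+         ranks = (np.argsort(np.argsort(s)) + 0.5) / len(s)
+         # Barycenter quantile = average of all group quantile functions.
+         q = np.mean([np.quantile(scores[groups == h], ranks) for h in uniq], axis=0)
+         repaired[idx] = q
+     return repaired
+
+ rng = np.random.default_rng(0)
+ scores = np.concatenate([rng.normal(0.4, 0.1, 50), rng.normal(0.6, 0.1, 50)])
+ groups = np.array([0] * 50 + [1] * 50)
+ print(barycenter_repair(scores, groups)[:5])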
+
+
+
+
+ + ☆ PatchBackdoor: Backdoor Attack against Deep Neural Networks without + Model Modification ACM MM 2023 + + +
+ Backdoor attack is a major threat to deep learning systems in safety-critical +scenarios, which aims to trigger misbehavior of neural network models under +attacker-controlled conditions. However, most backdoor attacks have to modify +the neural network models through training with poisoned data and/or direct +model editing, which leads to a common but false belief that backdoor attack +can be easily avoided by properly protecting the model. In this paper, we show +that backdoor attacks can be achieved without any model modification. Instead +of injecting backdoor logic into the training data or the model, we propose to +place a carefully-designed patch (namely backdoor patch) in front of the +camera, which is fed into the model together with the input images. The patch +can be trained to behave normally at most of the time, while producing wrong +prediction when the input image contains an attacker-controlled trigger object. +Our main techniques include an effective training method to generate the +backdoor patch and a digital-physical transformation modeling method to enhance +the feasibility of the patch in real deployments. Extensive experiments show +that PatchBackdoor can be applied to common deep learning models (VGG, +MobileNet, ResNet) with an attack success rate of 93% to 99% on classification +tasks. Moreover, we implement PatchBackdoor in real-world scenarios and show +that the attack is still threatening. + +
+
+ comment: accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ CLIP Multi-modal Hashing: A new baseline CLIPMH ICASSP2024 + + +
+ The multi-modal hashing method is widely used in multimedia retrieval. It can +fuse multi-source data to generate a binary hash code. However, the current +multi-modal methods have the problem of low retrieval accuracy. The reason is +that the individual backbone networks have limited feature expression +capabilities and are not jointly pre-trained on large-scale unsupervised +multi-modal data. To solve this problem, we propose a new baseline CLIP +Multi-modal Hashing (CLIPMH) method. It uses the CLIP model to extract text and +image features, which are then fused to generate the hash code. CLIP improves the +expressiveness of each modal feature. In this way, it can greatly improve the +retrieval performance of multi-modal hashing methods. In comparison to +state-of-the-art unsupervised and supervised multi-modal hashing methods, +experiments reveal that the proposed CLIPMH can significantly enhance +performance (a maximum increase of 8.38%). CLIP also has great advantages over +the text and visual backbone networks commonly used before. + 
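+ A minimal PyTorch sketch of the fuse-then-hash step on precomputed CLIP embeddings (the fusion MLP, code length, and tanh relaxation are illustrative choices, not the exact CLIPMH architecture):
+
+ import torch
+ import torch.nn as nn
+
+ class FuseAndHash(nn.Module):
+     def __init__(self, dim=512, code_bits=64):
+         super().__init__()
+         self.fuse = nn.Sequential(nn.Linear(2 * dim, dim), nn.ReLU(), nn.Linear(dim, code_bits))
+
+     def forward(self, img_emb, txt_emb):
+         logits = self.fuse(torch.cat([img_emb, txt_emb], dim=-1))
+         return torch.tanh(logits)                           # relaxed codes for training
+
+     def binarize(self, img_emb, txt_emb):
+         return torch.sign(self.forward(img_emb, txt_emb))   # {-1, +1} hash code
+
+ model = FuseAndHash()
+ img, txt = torch.randn(4, 512), torch.randn(4, 512)   # stand-ins for CLIP features
+ print(model.binarize(img, txt).shape)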
+
+ comment: submit to ICASSP2024 +
+
+
+
+
+ + ☆ Time Does Tell: Self-Supervised Time-Tuning of Dense Image + Representations + + +
+ Spatially dense self-supervised learning is a rapidly growing problem domain +with promising applications for unsupervised segmentation and pretraining for +dense downstream tasks. Despite the abundance of temporal data in the form of +videos, this information-rich source has been largely overlooked. Our paper +aims to address this gap by proposing a novel approach that incorporates +temporal consistency in dense self-supervised learning. While methods designed +solely for images face difficulties in achieving even the same performance on +videos, our method improves the representation quality not only for videos but +also for images. Our approach, which we call time-tuning, starts from +image-pretrained models and fine-tunes them with a novel self-supervised +temporal-alignment clustering loss on unlabeled videos. This effectively +facilitates the transfer of high-level information from videos to image +representations. Time-tuning improves the state-of-the-art by 8-10% for +unsupervised semantic segmentation on videos and matches it for images. We +believe this method paves the way for further self-supervised scaling by +leveraging the abundant availability of videos. The implementation can be found +at https://github.com/SMSD75/Timetuning. + 
+
+
+
+
+ + ☆ Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer + with Mixture-of-View-Experts ICCV2023 + + +
+ Cross-scene generalizable NeRF models, which can directly synthesize novel +views of unseen scenes, have become a new spotlight of the NeRF field. Several +existing attempts rely on increasingly end-to-end "neuralized" architectures, +i.e., replacing scene representation and/or rendering modules with performant +neural networks such as transformers, and turning novel view synthesis into a +feed-forward inference pipeline. While those feedforward "neuralized" +architectures still do not fit diverse scenes well out of the box, we propose +to bridge them with the powerful Mixture-of-Experts (MoE) idea from large +language models (LLMs), which has demonstrated superior generalization ability +by balancing between larger overall model capacity and flexible per-instance +specialization. Starting from a recent generalizable NeRF architecture called +GNT, we first demonstrate that MoE can be neatly plugged in to enhance the +model. We further customize a shared permanent expert and a geometry-aware +consistency loss to enforce cross-scene consistency and spatial smoothness +respectively, which are essential for generalizable view synthesis. Our +proposed model, dubbed GNT with Mixture-of-View-Experts (GNT-MOVE), has +experimentally shown state-of-the-art results when transferring to unseen +scenes, indicating remarkably better cross-scene generalization in both +zero-shot and few-shot settings. Our codes are available at +https://github.com/VITA-Group/GNT-MOVE. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ An extensible point-based method for data chart value detection + + +
+ We present an extensible method for identifying semantic points to reverse +engineer (i.e. extract the values of) data charts, particularly those in +scientific articles. Our method uses a point proposal network (akin to region +proposal networks for object detection) to directly predict the position of +points of interest in a chart, and it is readily extensible to multiple chart +types and chart elements. We focus on complex bar charts in the scientific +literature, on which our model is able to detect salient points with an +accuracy of 0.8705 F1 (@1.5-cell max deviation); it achieves 0.9810 F1 on +synthetically-generated charts similar to those used in prior works. We also +explore training exclusively on synthetic data with novel augmentations, +reaching surprisingly competent performance in this way (0.6621 F1) on real +charts with widely varying appearance, and we further demonstrate our unchanged +method applied directly to synthetic pie charts (0.8343 F1). Datasets, trained +models, and evaluation code are available at +https://github.com/BNLNLP/PPN_model. + +
+
+
+
+
+ + ☆ Coarse-to-Fine Multi-Scene Pose Regression with Transformers + + +
+ Absolute camera pose regressors estimate the position and orientation of a +camera given the captured image alone. Typically, a convolutional backbone with +a multi-layer perceptron (MLP) head is trained using images and pose labels to +embed a single reference scene at a time. Recently, this scheme was extended to +learn multiple scenes by replacing the MLP head with a set of fully connected +layers. In this work, we propose to learn multi-scene absolute camera pose +regression with Transformers, where encoders are used to aggregate activation +maps with self-attention and decoders transform latent features and scenes +encoding into pose predictions. This allows our model to focus on general +features that are informative for localization, while embedding multiple scenes +in parallel. We extend our previous MS-Transformer approach +\cite{shavit2021learning} by introducing a mixed classification-regression +architecture that improves the localization accuracy. Our method is evaluated +on commonly benchmark indoor and outdoor datasets and has been shown to exceed +both multi-scene and state-of-the-art single-scene absolute pose regressors. + +
+
+ comment: Accepted to IEEE Transactions on Pattern Analysis and Machine + Intelligence (TPAMI). arXiv admin note: substantial text overlap with + arXiv:2103.11468 +
+
+
+
+
+ + ☆ Understanding Hessian Alignment for Domain Generalization ICCV 2023 + + +
+ Out-of-distribution (OOD) generalization is a critical ability for deep +learning models in many real-world scenarios including healthcare and +autonomous vehicles. Recently, different techniques have been proposed to +improve OOD generalization. Among these methods, gradient-based regularizers +have shown promising performance compared with other competitors. Despite this +success, our understanding of the role of Hessian and gradient alignment in +domain generalization is still limited. To address this shortcoming, we analyze +the role of the classifier's head Hessian matrix and gradient in domain +generalization using recent OOD theory of transferability. Theoretically, we +show that spectral norm between the classifier's head Hessian matrices across +domains is an upper bound of the transfer measure, a notion of distance between +target and source domains. Furthermore, we analyze all the attributes that get +aligned when we encourage similarity between Hessians and gradients. Our +analysis explains the success of many regularizers like CORAL, IRM, V-REx, +Fish, IGA, and Fishr as they regularize part of the classifier's head Hessian +and/or gradient. Finally, we propose two simple yet effective methods to match +the classifier's head Hessians and gradients in an efficient way, based on the +Hessian Gradient Product (HGP) and Hutchinson's method (Hutchinson), and +without directly calculating Hessians. We validate the OOD generalization +ability of proposed methods in different scenarios, including transferability, +severe correlation shift, label shift and diversity shift. Our results show +that Hessian alignment methods achieve promising performance on various OOD +benchmarks. The code is available at +\url{https://github.com/huawei-noah/Federated-Learning/tree/main/HessianAlignment}. + +
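+ The key computational trick, matching Hessian information without ever forming a Hessian, can be sketched with double backprop (the tiny quadratic model below is a placeholder for the classifier head, not the paper's training setup):
+
+ # Hessian-vector products via torch.autograd, plus a Hutchinson-style probe.
+ import torch
+
+ torch.manual_seed(0)
+ w = torch.randn(10, requires_grad=True)           # classifier-head parameters
+ x, y = torch.randn(32, 10), torch.randn(32)
+ loss = ((x @ w - y) ** 2).mean()
+
+ grad = torch.autograd.grad(loss, w, create_graph=True)[0]
+
+ def hvp(vec):
+     # d/dw (grad . vec) = H @ vec, computed without materializing H.
+     return torch.autograd.grad(grad @ vec, w, retain_graph=True)[0]
+
+ hgp = hvp(grad.detach())                          # Hessian-gradient product term
+ # Hutchinson-style estimate of a Hessian functional with Rademacher probes.
+ probes = [torch.randint(0, 2, (10,)).float() * 2 - 1 for _ in range(8)]
+ trace_est = torch.stack([v @ hvp(v) for v in probes]).mean()
+ print(hgp.norm().item(), trace_est.item())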
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ WS-SfMLearner: Self-supervised Monocular Depth and Ego-motion Estimation + on Surgical Videos with Unknown Camera Parameters + + +
+ Depth estimation in surgical video plays a crucial role in many image-guided +surgery procedures. However, it is difficult and time consuming to create depth +map ground truth datasets in surgical videos due in part to inconsistent +brightness and noise in the surgical scene. Therefore, building an accurate and +robust self-supervised depth and camera ego-motion estimation system is gaining +more attention from the computer vision community. Although several +self-supervision methods alleviate the need for ground truth depth maps and +poses, they still need known camera intrinsic parameters, which are often +missing or not recorded. Moreover, the camera intrinsic prediction methods in +existing works depend heavily on the quality of datasets. In this work, we +aimed to build a self-supervised depth and ego-motion estimation system which +can predict not only accurate depth maps and camera pose, but also camera +intrinsic parameters. We proposed a cost-volume-based supervision manner to +give the system auxiliary supervision for camera parameters prediction. The +experimental results showed that the proposed method improved the accuracy of +estimated camera parameters, ego-motion, and depth estimation. + +
+
+
+
+
+ + ☆ SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene + Reconstruction by Neural Radiance Field (NeRF) + + +
+ The accurate reconstruction of surgical scenes from surgical videos is +critical for various applications, including intraoperative navigation and +image-guided robotic surgery automation. However, previous approaches, mainly +relying on depth estimation, have limited effectiveness in reconstructing +surgical scenes with moving surgical tools. To address this limitation and +provide accurate 3D position prediction for surgical tools in all frames, we +propose a novel approach called SAMSNeRF that combines Segment Anything Model +(SAM) and Neural Radiance Field (NeRF) techniques. Our approach generates +accurate segmentation masks of surgical tools using SAM, which guides the +refinement of the dynamic surgical scene reconstruction by NeRF. Our +experimental results on public endoscopy surgical videos demonstrate that our +approach successfully reconstructs high-fidelity dynamic surgical scenes and +accurately reflects the spatial information of surgical tools. Our proposed +approach can significantly enhance surgical navigation and automation by +providing surgeons with accurate 3D position information of surgical tools +during surgery. The source code will be released soon. + 
+
+
+
+
+ + ☆ 3ET: Efficient Event-based Eye Tracking using a Change-Based ConvLSTM + Network + + +
+ This paper presents a sparse Change-Based Convolutional Long Short-Term +Memory (CB-ConvLSTM) model for event-based eye tracking, key for +next-generation wearable healthcare technology such as AR/VR headsets. We +leverage the benefits of retina-inspired event cameras, namely their +low-latency response and sparse output event stream, over traditional +frame-based cameras. Our CB-ConvLSTM architecture efficiently extracts +spatio-temporal features for pupil tracking from the event stream, +outperforming conventional CNN structures. Utilizing a delta-encoded recurrent +path enhancing activation sparsity, CB-ConvLSTM reduces arithmetic operations +by approximately 4.7$\times$ without losing accuracy when tested on a +\texttt{v2e}-generated event dataset of labeled pupils. This increase in +efficiency makes it ideal for real-time eye tracking in resource-constrained +devices. The project code and dataset are openly available at +\url{https://github.com/qinche106/cb-convlstm-eyetracking}. + +
+
+ comment: To be published at the 2023 IEEE Biomedical Circuits and Systems + (BioCAS) Conference +
+
+
+
+
+ + ☆ Weakly Supervised Face and Whole Body Recognition in Turbulent + Environments + + +
+ Face and person recognition have recently achieved remarkable success under +challenging scenarios, such as off-pose and cross-spectrum matching. However, +long-range recognition systems are often hindered by atmospheric turbulence, +leading to spatially and temporally varying distortions in the image. Current +solutions rely on generative models to reconstruct a turbulent-free image, but +often preserve photo-realism instead of discriminative features that are +essential for recognition. This can be attributed to the lack of large-scale +datasets of turbulent and pristine paired images, necessary for optimal +reconstruction. To address this issue, we propose a new weakly supervised +framework that employs a parameter-efficient self-attention module to generate +domain agnostic representations, aligning turbulent and pristine images into a +common subspace. Additionally, we introduce a new tilt map estimator that +predicts geometric distortions observed in turbulent images. This estimate is +used to re-rank gallery matches, resulting in up to 13.86\% improvement in +rank-1 accuracy. Our method does not require synthesizing turbulent-free images +or ground-truth paired images, and requires significantly fewer annotated +samples, enabling more practical and rapid utility of increasingly large +datasets. We analyze our framework using two datasets -- Long-Range Face +Identification Dataset (LRFID) and BRIAR Government Collection 1 (BGC1) -- +achieving enhanced discriminability under varying turbulence and standoff +distance. + +
+
+ comment: IJCB 2023 +
+
+
+
+
+ + ☆ Efficient Controllable Multi-Task Architectures ICCV 2023 + + +
+ We aim to train a multi-task model such that users can adjust the desired +compute budget and relative importance of task performances after deployment, +without retraining. This enables optimizing performance for dynamically varying +user needs, without heavy computational overhead to train and save models for +various scenarios. To this end, we propose a multi-task model consisting of a +shared encoder and task-specific decoders where both encoder and decoder +channel widths are slimmable. Our key idea is to control the task importance by +varying the capacities of task-specific decoders, while controlling the total +computational cost by jointly adjusting the encoder capacity. This improves +overall accuracy by allowing a stronger encoder for a given budget, increases +control over computational cost, and delivers high-quality slimmed +sub-architectures based on user's constraints. Our training strategy involves a +novel 'Configuration-Invariant Knowledge Distillation' loss that enforces +backbone representations to be invariant under different runtime width +configurations to enhance accuracy. Further, we present a simple but effective +search algorithm that translates user constraints to runtime width +configurations of both the shared encoder and task decoders, for sampling the +sub-architectures. The key rule for the search algorithm is to provide a larger +computational budget to the higher preferred task decoder, while searching a +shared encoder configuration that enhances the overall MTL performance. Various +experiments on three multi-task benchmarks (PASCALContext, NYUDv2, and +CIFAR100-MTL) with diverse backbone architectures demonstrate the advantage of +our approach. For example, our method shows a higher controllability by ~33.5% +in the NYUD-v2 dataset over prior methods, while incurring much less compute +cost. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape + + +
+ Accurately estimating the 3D pose and shape is an essential step towards +understanding animal behavior, and can potentially benefit many downstream +applications, such as wildlife conservation. However, research in this area is +held back by the lack of a comprehensive and diverse dataset with high-quality +3D pose and shape annotations. In this paper, we propose Animal3D, the first +comprehensive dataset for mammal animal 3D pose and shape estimation. Animal3D +consists of 3379 images collected from 40 mammal species, high-quality +annotations of 26 keypoints, and importantly the pose and shape parameters of +the SMAL model. All annotations were labeled and checked manually in a +multi-stage process to ensure highest quality results. Based on the Animal3D +dataset, we benchmark representative shape and pose estimation models at: (1) +supervised learning from only the Animal3D data, (2) synthetic to real transfer +from synthetically generated images, and (3) fine-tuning human pose and shape +estimation models. Our experimental results demonstrate that predicting the 3D +shape and pose of animals across species remains a very challenging task, +despite significant advances in human pose estimation. Our results further +demonstrate that synthetic pre-training is a viable strategy to boost the model +performance. Overall, Animal3D opens new directions for facilitating future +research in animal 3D pose and shape estimation, and is publicly available. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ (Un)fair Exposure in Deep Face Rankings at a Distance + + +
+ Law enforcement regularly faces the challenge of ranking suspects from their +facial images. Deep face models aid this process but frequently introduce +biases that disproportionately affect certain demographic segments. While bias +investigation is common in domains like job candidate ranking, the field of +forensic face rankings remains underexplored. In this paper, we propose a novel +experimental framework, encompassing six state-of-the-art face encoders and two +public data sets, designed to scrutinize the extent to which demographic groups +suffer from biases in exposure in the context of forensic face rankings. +Through comprehensive experiments that cover both re-identification and +identification tasks, we show that exposure biases within this domain are far +from being countered, demanding attention towards establishing ad-hoc policies +and corrective measures. The source code is available at +https://github.com/atzoriandrea/ijcb2023-unfair-face-rankings + +
+
+ comment: Accepted as a full paper at IJCB 2023 Special Session "Long-Range + Biometrics Challenges": 2023 International Joint Conference on Biometrics +
+
+
+
+
+ + ☆ Efficient Benchmarking (of Language Models) + + +
+ The increasing versatility of language models (LMs) has given rise to a new +class of benchmarks that comprehensively assess a broad range of capabilities. +Such benchmarks are associated with massive computational costs, reaching +thousands of GPU hours per model. However, the efficiency aspect of these +evaluation efforts has received little discussion in the literature. In this work, +we present the problem of Efficient Benchmarking, namely intelligently reducing +the computation costs of LM evaluation without compromising reliability. Using +the HELM benchmark as a test case, we investigate how different benchmark design +choices affect the computation-reliability tradeoff. We propose to evaluate the +reliability of such decisions by using a new measure, Decision Impact on +Reliability (DIoR for short). We find, for example, that the current leader on HELM +may change by merely removing a low-ranked model from the benchmark, and observe +that a handful of examples suffice to obtain the correct benchmark ranking. +Conversely, a slightly different choice of HELM scenarios can change the ranking widely. +Based on our findings, we outline a set of concrete recommendations for more +efficient benchmark design and utilization practices, leading to dramatic cost +savings with minimal loss of benchmark reliability, often reducing computation +by 100x or more. + 
+
+
+
+
+ + ♻ ☆ Deep Learning Approaches on Image Captioning: A Review + + +
+ Image captioning is a research area of immense importance, aiming to generate +natural language descriptions for visual content in the form of still images. +The advent of deep learning and more recently vision-language pre-training +techniques has revolutionized the field, leading to more sophisticated methods +and improved performance. In this survey paper, we provide a structured review +of deep learning methods in image captioning by presenting a comprehensive +taxonomy and discussing each method category in detail. Additionally, we +examine the datasets commonly employed in image captioning research, as well as +the evaluation metrics used to assess the performance of different captioning +models. We address the challenges faced in this field by emphasizing issues +such as object hallucination, missing context, illumination conditions, +contextual understanding, and referring expressions. We rank different deep +learning methods' performance according to widely used evaluation metrics, +giving insight into the current state of the art. Furthermore, we identify +several potential future directions for research in this area, which include +tackling the information misalignment problem between image and text +modalities, mitigating dataset bias, incorporating vision-language pre-training +methods to enhance caption generation, and developing improved evaluation tools +to accurately measure the quality of image captions. + +
+
+ comment: 41 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Phenotype-preserving metric design for high-content image reconstruction + by generative inpainting + + +
+ In the past decades, automated high-content microscopy demonstrated its +ability to deliver large quantities of image-based data powering the +versatility of phenotypic drug screening and systems biology applications. +However, as the sizes of image-based datasets grew, it became infeasible for +humans to control, avoid and overcome the presence of imaging and sample +preparation artefacts in the images. While novel techniques like machine +learning and deep learning may address these shortcomings through generative +image inpainting, when applied to sensitive research data this may come at the +cost of undesired image manipulation. Undesired manipulation may be caused by +phenomena such as neural hallucinations, to which some artificial neural +networks are prone. To address this, here we evaluate the state-of-the-art +inpainting methods for image restoration in a high-content fluorescence +microscopy dataset of cultured cells with labelled nuclei. We show that +architectures like DeepFill V2 and Edge Connect can faithfully restore +microscopy images upon fine-tuning with relatively little data. Our results +demonstrate that the area of the region to be restored is of higher importance +than shape. Furthermore, to control for the quality of restoration, we propose +a novel phenotype-preserving metric design strategy. In this strategy, the size +and count of the restored biological phenotypes like cell nuclei are quantified +to penalise undesirable manipulation. We argue that the design principles of +our approach may also generalise to other applications. + +
+
+ comment: 8 pages, 3 figures, conference proceedings +
+
+
+
+
+ + ♻ ☆ Skin Lesion Correspondence Localization in Total Body Photography MICCAI-2023 + + +
+ Longitudinal tracking of skin lesions - finding correspondence, changes in +morphology, and texture - is beneficial to the early detection of melanoma. +However, it has not been well investigated in the context of full-body imaging. +We propose a novel framework combining geometric and texture information to +localize skin lesion correspondence from a source scan to a target scan in +total body photography (TBP). Body landmarks or sparse correspondence are first +created on the source and target 3D textured meshes. Every vertex on each of +the meshes is then mapped to a feature vector characterizing the geodesic +distances to the landmarks on that mesh. Then, for each lesion of interest +(LOI) on the source, its corresponding location on the target is first coarsely +estimated using the geometric information encoded in the feature vectors and +then refined using the texture information. We evaluated the framework +quantitatively on both a public and a private dataset, for which our success +rates (at 10 mm criterion) are comparable to the only reported longitudinal +study. As full-body 3D capture becomes more prevalent and has higher quality, +we expect the proposed method to constitute a valuable step in the longitudinal +tracking of skin lesions. + +
+
+ comment: MICCAI-2023 +
+
+
+
+
+ + ♻ ☆ Type-to-Track: Retrieve Any Object via Prompt-based Tracking + + +
+ One of the recent trends in vision problems is to use natural language +captions to describe the objects of interest. This approach can overcome some +limitations of traditional methods that rely on bounding boxes or category +annotations. This paper introduces a novel paradigm for Multiple Object +Tracking called Type-to-Track, which allows users to track objects in videos by +typing natural language descriptions. We present a new dataset for this +Grounded Multiple Object Tracking task, called GroOT, that contains videos with +various types of objects and their corresponding textual captions describing +their appearance and action in detail. Additionally, we introduce two new +evaluation protocols and formulate evaluation metrics specifically for this +task. We develop a new efficient method that models a transformer-based +eMbed-ENcoDE-extRact framework (MENDER) using the third-order tensor +decomposition. The experiments in five scenarios show that our MENDER approach +outperforms another two-stage design in terms of accuracy and efficiency, with up to +14.7% higher accuracy and a 4$\times$ speedup. + 
+
+ comment: 23 pages, 9 tables, 8 figures +
+
+
+
+
+ + ♻ ☆ Two Approaches to Supervised Image Segmentation + + +
+ Though performed almost effortlessly by humans, segmenting 2D gray-scale or +color images into respective regions of interest (e.g.~background, objects, or +portions of objects) constitutes one of the greatest challenges in science and +technology as a consequence of several effects including dimensionality +reduction(3D to 2D), noise, reflections, shades, and occlusions, among many +other possibilities. While a large number of interesting related approaches +have been suggested along the last decades, it was mainly thanks to the recent +development of deep learning that more effective and general solutions have +been obtained, currently constituting the basic comparison reference for this +type of operation. Also developed recently, a multiset-based methodology has +been described that is capable of encouraging image segmentation performance +combining spatial accuracy, stability, and robustness while requiring little +computational resources (hardware and/or training and recognition time). The +interesting features of the multiset neurons methodology mostly follow from the +enhanced selectivity and sensitivity, as well as good robustness to data +perturbations and outliers, allowed by the coincidence similarity index on +which the multiset approach to supervised image segmentation is founded. After +describing the deep learning and multiset neurons approaches, the present work +develops comparison experiments between them which are primarily aimed at +illustrating their respective main interesting features when applied to the +adopted specific type of data and parameter configurations. While the deep +learning approach confirmed its potential for performing image segmentation, +the alternative multiset methodology allowed for enhanced accuracy while +requiring little computational resources. + +
+
+ comment: 38 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ H4VDM: H.264 Video Device Matching + + +
+ Methods that can determine if two given video sequences are captured by the +same device (e.g., mobile telephone or digital camera) can be used in many +forensics tasks. In this paper we refer to this as "video device matching". In +open-set video forensics scenarios it is easier to determine if two video +sequences were captured with the same device than identifying the specific +device. In this paper, we propose a technique for open-set video device +matching. Given two H.264 compressed video sequences, our method can determine +if they are captured by the same device, even if our method has never +encountered the device in training. We denote our proposed technique as H.264 +Video Device Matching (H4VDM). H4VDM uses H.264 compression information +extracted from video sequences to make decisions. It is more robust against +artifacts that alter camera sensor fingerprints, and it can be used to analyze +relatively small fragments of the H.264 sequence. We trained and tested our +method on a publicly available video forensics dataset consisting of 35 +devices, where our proposed method demonstrated good performance. + +
+
+
+
+
+ + ♻ ☆ An Instance Segmentation Dataset of Yeast Cells in Microstructures + + +
+ Extracting single-cell information from microscopy data requires accurate +instance-wise segmentations. Obtaining pixel-wise segmentations from microscopy +imagery remains a challenging task, especially with the added complexity of +microstructured environments. This paper presents a novel dataset for +segmenting yeast cells in microstructures. We offer pixel-wise instance +segmentation labels for both cells and trap microstructures. In total, we +release 493 densely annotated microscopy images. To facilitate a unified +comparison between novel segmentation algorithms, we propose a standardized +evaluation strategy for our dataset. The aim of the dataset and evaluation +strategy is to facilitate the development of new cell segmentation approaches. +The dataset is publicly available at +https://christophreich1996.github.io/yeast_in_microstructures_dataset/ . + +
+
+ comment: IEEE EMBC 2023 (in press), Christoph Reich and Tim Prangemeier - both + authors contributed equally +
+
+
+
+
+ + ♻ ☆ Fairness in Image Search: A Study of Occupational Stereotyping in Image + Retrieval and its Debiasing + + +
+ Multi-modal search engines have experienced significant growth and widespread +use in recent years, making them the second most common internet use. While +search engine systems offer a range of services, the image search field has +recently become a focal point in the information retrieval community, as the +adage goes, "a picture is worth a thousand words". Although popular search +engines like Google excel at image search accuracy and agility, there is an +ongoing debate over whether their search results can be biased in terms of +gender, language, demographics, socio-cultural aspects, and stereotypes. This +potential for bias can have a significant impact on individuals' perceptions +and influence their perspectives. +  In this paper, we present our study on bias and fairness in web search, with +a focus on keyword-based image search. We first discuss several kinds of biases +that exist in search systems and why it is important to mitigate them. We +narrow down our study to assessing and mitigating occupational stereotypes in +image search, which is a prevalent fairness issue in image retrieval. For the +assessment of stereotypes, we take gender as an indicator. We explore various +open-source and proprietary APIs for gender identification from images. With +these, we examine the extent of gender bias in top-ranked image search results +obtained for several occupational keywords. To mitigate the bias, we then +propose a fairness-aware re-ranking algorithm that optimizes (a) relevance of +the search result with the keyword and (b) fairness w.r.t. the genders identified. +We experiment on 100 top-ranked images obtained for 10 occupational keywords +and consider random re-ranking and re-ranking based on relevance as baselines. +Our experimental results show that the fairness-aware re-ranking algorithm +produces rankings with better fairness scores and competitive relevance scores +than the baselines. + 
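+ A toy greedy version of a fairness-aware re-ranking that trades relevance against exposure balance (the utility rule and the lambda weight are illustrative stand-ins, not the paper's optimization):
+
+ def fair_rerank(items, lam=0.5):
+     """items: list of (item_id, relevance, gender); returns a re-ranked list."""
+     remaining, ranked, counts = list(items), [], {}
+     while remaining:
+         def utility(it):
+             _, rel, g = it
+             share = counts.get(g, 0) / max(len(ranked), 1)   # current exposure of this gender
+             return rel - lam * share                          # penalize over-exposed genders
+         best = max(remaining, key=utility)
+         remaining.remove(best)
+         ranked.append(best)
+         counts[best[2]] = counts.get(best[2], 0) + 1
+     return ranked
+
+ items = [("a", 0.9, "m"), ("b", 0.85, "m"), ("c", 0.8, "f"), ("d", 0.7, "m"), ("e", 0.6, "f")]
+ print([i[0] for i in fair_rerank(items)])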
+
+ comment: 20 Pages, Work uses Proprietary Search Systems from the year 2021 +
+
+
+
+
+ + ♻ ☆ FedSIS: Federated Split Learning with Intermediate Representation + Sampling for Privacy-preserving Generalized Face Presentation Attack + Detection + + +
+ Lack of generalization to unseen domains/attacks is the Achilles heel of most +face presentation attack detection (FacePAD) algorithms. Existing attempts to +enhance the generalizability of FacePAD solutions assume that data from +multiple source domains are available with a single entity to enable +centralized training. In practice, data from different source domains may be +collected by diverse entities, who are often unable to share their data due to +legal and privacy constraints. While collaborative learning paradigms such as +federated learning (FL) can overcome this problem, standard FL methods are +ill-suited for domain generalization because they struggle to surmount the twin +challenges of handling non-iid client data distributions during training and +generalizing to unseen domains during inference. In this work, a novel +framework called Federated Split learning with Intermediate representation +Sampling (FedSIS) is introduced for privacy-preserving domain generalization. +In FedSIS, a hybrid Vision Transformer (ViT) architecture is learned using a +combination of FL and split learning to achieve robustness against statistical +heterogeneity in the client data distributions without any sharing of raw data +(thereby preserving privacy). To further improve generalization to unseen +domains, a novel feature augmentation strategy called intermediate +representation sampling is employed, and discriminative information from +intermediate blocks of a ViT is distilled using a shared adapter network. The +FedSIS approach has been evaluated on two well-known benchmarks for +cross-domain FacePAD to demonstrate that it is possible to achieve +state-of-the-art generalization performance without data sharing. Code: +https://github.com/Naiftt/FedSIS + +
+
+ comment: Accepted to the IEEE International Joint Conference on Biometrics + (IJCB), 2023 +
+
+
+
+
+ + ♻ ☆ Exploiting Inter-Sample Affinity for Knowability-Aware Universal Domain + Adaptation + + +
+ Universal domain adaptation (UniDA) aims to transfer the knowledge of common +classes from the source domain to the target domain without any prior knowledge +on the label set, which requires distinguishing in the target domain the +unknown samples from the known ones. Recent methods usually focused on +categorizing a target sample into one of the source classes rather than +distinguishing known and unknown samples, which ignores the inter-sample +affinity between known and unknown samples and may lead to suboptimal +performance. Aiming at this issue, we propose a novel UDA framework where such +inter-sample affinity is exploited. Specifically, we introduce a +knowability-based labeling scheme which can be divided into two steps: 1) +Knowability-guided detection of known and unknown samples based on the +intrinsic structure of the neighborhoods of samples, where we leverage the +first singular vectors of the affinity matrices to obtain the knowability of +every target sample. 2) Label refinement based on neighborhood consistency to +relabel the target samples, where we refine the labels of each target sample +based on its neighborhood consistency of predictions. Then, auxiliary losses +based on the two steps are used to reduce the inter-sample affinity between the +unknown and the known target samples. Finally, experiments on four public +datasets demonstrate that our method significantly outperforms existing +state-of-the-art methods. + +
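+ The first step can be illustrated with a small numpy sketch that scores samples by the leading singular vector of an affinity matrix (for brevity a single global affinity matrix and a median threshold are used here, whereas the paper works on per-sample neighborhood affinities):
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ feats = rng.normal(size=(200, 64))
+ feats /= np.linalg.norm(feats, axis=1, keepdims=True)
+
+ affinity = np.clip(feats @ feats.T, 0, None)         # cosine affinities, negatives clipped
+ u, s, vt = np.linalg.svd(affinity)
+ knowability = np.abs(u[:, 0])                        # leading singular vector as score
+ known_mask = knowability > np.median(knowability)    # e.g., split at the median
+ print(knowability[:5], known_mask.sum())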
+
+
+
+
+ + ♻ ☆ Improving automatic endoscopic stone recognition using a multi-view + fusion approach enhanced with two-step transfer learning ICCV 2023 + + +
+ This contribution presents a deep-learning method for extracting and fusing +image information acquired from different viewpoints, with the aim to produce +more discriminant object features for the identification of the type of kidney +stones seen in endoscopic images. The model was further improved with a +two-step transfer learning approach and by attention blocks to refine the +learned feature maps. Deep feature fusion strategies improved the results of +single view extraction backbone models by more than 6% in terms of accuracy of +the kidney stones classification. + +
+
+ comment: This paper has been accepted at the LatinX in Computer Vision (LXCV) + Research workshop at ICCV 2023 (Paris, France) +
+
+
+
+
+ + ♻ ☆ Constrained Probabilistic Mask Learning for Task-specific Undersampled + MRI Reconstruction WACV 2024 + + +
+ Undersampling is a common method in Magnetic Resonance Imaging (MRI) to +subsample the number of data points in k-space, reducing acquisition times at +the cost of decreased image quality. A popular approach is to employ +undersampling patterns following various strategies, e.g., variable density +sampling or radial trajectories. In this work, we propose a method that +directly learns the undersampling masks from data points, thereby also +providing task- and domain-specific patterns. To solve the resulting discrete +optimization problem, we propose a general optimization routine called ProM: A +fully probabilistic, differentiable, versatile, and model-free framework for +mask optimization that enforces acceleration factors through a convex +constraint. Analyzing knee, brain, and cardiac MRI datasets with our method, we +discover that different anatomic regions reveal distinct optimal undersampling +masks, demonstrating the benefits of using custom masks, tailored for a +downstream task. For example, ProM can create undersampling masks that maximize +performance in downstream tasks like segmentation with networks trained on +fully-sampled MRIs. Even with extreme acceleration factors, ProM yields +reasonable performance while being more versatile than existing methods, paving +the way for data-driven all-purpose mask generation. + +
+
+ comment: accepted at WACV 2024 +
+
+
+
+
+ + ♻ ☆ Evading Watermark based Detection of AI-Generated Content CCS + + +
+ A generative AI model can generate extremely realistic-looking content, +posing growing challenges to the authenticity of information. To address the +challenges, watermarking has been leveraged to detect AI-generated content. +Specifically, a watermark is embedded into AI-generated content before it is +released. Content is detected as AI-generated if a similar watermark can be +decoded from it. In this work, we perform a systematic study on the robustness +of such watermark-based AI-generated content detection. We focus on +AI-generated images. Our work shows that an attacker can post-process a +watermarked image by adding a small, human-imperceptible perturbation to it, +such that the post-processed image evades detection while maintaining its +visual quality. We show the effectiveness of our attack both theoretically and +empirically. Moreover, to evade detection, our adversarial post-processing +method adds much smaller perturbations to AI-generated images and thus better +maintains their visual quality than existing popular post-processing methods +such as JPEG compression, Gaussian blur, and Brightness/Contrast. Our work +shows the insufficiency of existing watermark-based detection of AI-generated +content, highlighting the urgent need for new methods. Our code is publicly +available: https://github.com/zhengyuan-jiang/WEvade. + 
+
+ comment: To appear in ACM Conference on Computer and Communications Security + (CCS), 2023 +
+
+
+
+
+ + ♻ ☆ What Can Simple Arithmetic Operations Do for Temporal Modeling? ICCV 2023 + + +
+ Temporal modeling plays a crucial role in understanding video content. To
+tackle this problem, previous studies have built complicated temporal relations
+across the time sequence, enabled by increasingly powerful computing devices.
+In this work, we explore the potential of four simple arithmetic operations for
+temporal modeling. Specifically, we first capture auxiliary temporal cues by
+computing addition, subtraction, multiplication, and division between pairs of
+extracted frame features. Then, we extract corresponding features from these
+cues to enrich the original, temporally agnostic features. We term this simple
+pipeline the Arithmetic Temporal Module (ATM), which operates on the stem of a
+visual backbone in a plug-and-play style. We conduct comprehensive ablation
+studies on the instantiation of ATMs and demonstrate that this module provides
+powerful temporal modeling capability at a low computational cost. Moreover,
+the ATM is compatible with both CNN- and ViT-based architectures. Our results
+show that ATM achieves superior performance on several popular video
+benchmarks. Specifically, on Something-Something V1, V2 and Kinetics-400, we
+reach top-1 accuracies of 65.6%, 74.6%, and 89.4%, respectively. The code is
+available at https://github.com/whwu95/ATM.
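A minimal sketch of the four-operation idea, assuming frame-level features of shape (batch, time, dim) taken from the backbone stem. The projection size, the epsilon guarding the division, and the residual wiring are placeholders rather than the published ATM configuration.

```python
import torch
import torch.nn as nn

class ArithmeticTemporalModule(nn.Module):
    """Sketch: derive auxiliary temporal cues from adjacent frame features via
    +, -, *, / and project them back onto the feature dimension."""

    def __init__(self, dim: int = 768):
        super().__init__()
        self.proj = nn.Linear(4 * dim, dim)

    def forward(self, feats: torch.Tensor) -> torch.Tensor:
        # feats: (batch, time, dim) frame-level features
        prev, curr = feats[:, :-1], feats[:, 1:]
        cues = torch.cat([
            curr + prev,
            curr - prev,
            curr * prev,
            curr / (prev + 1e-6),          # small epsilon avoids division by zero
        ], dim=-1)
        temporal = self.proj(cues)                            # (batch, time-1, dim)
        temporal = nn.functional.pad(temporal, (0, 0, 1, 0))  # realign with original length
        return feats + temporal                               # residual, plug-and-play style
```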
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ High-Fidelity Eye Animatable Neural Radiance Fields for Human Face BMVC 2023 + + +
+ Face rendering using neural radiance fields (NeRF) is a rapidly developing +research area in computer vision. While recent methods primarily focus on +controlling facial attributes such as identity and expression, they often +overlook the crucial aspect of modeling eyeball rotation, which holds +importance for various downstream tasks. In this paper, we aim to learn a face +NeRF model that is sensitive to eye movements from multi-view images. We +address two key challenges in eye-aware face NeRF learning: how to effectively +capture eyeball rotation for training and how to construct a manifold for +representing eyeball rotation. To accomplish this, we first fit FLAME, a +well-established parametric face model, to the multi-view images considering +multi-view consistency. Subsequently, we introduce a new Dynamic Eye-aware NeRF +(DeNeRF). DeNeRF transforms 3D points from different views into a canonical +space to learn a unified face NeRF model. We design an eye deformation field +for the transformation, including rigid transformation, e.g., eyeball rotation, +and non-rigid transformation. Through experiments conducted on the ETH-XGaze +dataset, we demonstrate that our model is capable of generating high-fidelity +images with accurate eyeball rotation and non-rigid periocular deformation, +even under novel viewing angles. Furthermore, we show that utilizing the +rendered images can effectively enhance gaze estimation performance. + +
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ♻ ☆ Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout + Analysis + + +
+ Understanding digital documents is like solving a puzzle, especially +historical ones. Document Layout Analysis (DLA) helps with this puzzle by +dividing documents into sections like paragraphs, images, and tables. This is +crucial for machines to read and understand these documents. In the DL Sprint +2.0 competition, we worked on understanding Bangla documents. We used a dataset +called BaDLAD with lots of examples. We trained a special model called Mask +R-CNN to help with this understanding. We made this model better by +step-by-step hyperparameter tuning, and we achieved a good dice score of 0.889. +However, not everything went perfectly. We tried using a model trained for +English documents, but it didn't fit well with Bangla. This showed us that each +language has its own challenges. Our solution for the DL Sprint 2.0 is publicly +available at https://www.kaggle.com/competitions/dlsprint2/discussion/432201 +along with notebooks, weights, and inference notebook. + +
+
+
 comment: Contest paper, Contest: DL Sprint 2.0 (Link:
 https://www.kaggle.com/competitions/dlsprint2), Solution link:
 https://www.kaggle.com/competitions/dlsprint2/discussion/432201
+
+
+
+
+ + ♻ ☆ DynaMITe: Dynamic Query Bootstrapping for Multi-object Interactive + Segmentation Transformer ICCV 2023 + + +
+ Most state-of-the-art instance segmentation methods rely on large amounts of +pixel-precise ground-truth annotations for training, which are expensive to +create. Interactive segmentation networks help generate such annotations based +on an image and the corresponding user interactions such as clicks. Existing +methods for this task can only process a single instance at a time and each +user interaction requires a full forward pass through the entire deep network. +We introduce a more efficient approach, called DynaMITe, in which we represent +user interactions as spatio-temporal queries to a Transformer decoder with a +potential to segment multiple object instances in a single iteration. Our +architecture also alleviates any need to re-compute image features during +refinement, and requires fewer interactions for segmenting multiple instances +in a single image when compared to other methods. DynaMITe achieves +state-of-the-art results on multiple existing interactive segmentation +benchmarks, and also on the new multi-instance benchmark that we propose in +this paper. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Human Preference Score: Better Aligning Text-to-Image Models with Human + Preference ICCV 2023 + + +
+ Recent years have witnessed a rapid growth of deep generative models, with +text-to-image models gaining significant attention from the public. However, +existing models often generate images that do not align well with human +preferences, such as awkward combinations of limbs and facial expressions. To +address this issue, we collect a dataset of human choices on generated images +from the Stable Foundation Discord channel. Our experiments demonstrate that +current evaluation metrics for generative models do not correlate well with +human choices. Thus, we train a human preference classifier with the collected +dataset and derive a Human Preference Score (HPS) based on the classifier. +Using HPS, we propose a simple yet effective method to adapt Stable Diffusion +to better align with human preferences. Our experiments show that HPS +outperforms CLIP in predicting human choices and has good generalization +capability toward images generated from other models. By tuning Stable +Diffusion with the guidance of HPS, the adapted model is able to generate +images that are more preferred by human users. The project page is available +here: https://tgxs002.github.io/align_sd_web/ . + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ The Unreasonable Effectiveness of Large Language-Vision Models for + Source-free Video Domain Adaptation ICCV2023 + + +
+ The Source-Free Video Unsupervised Domain Adaptation (SFVUDA) task consists of
+adapting an action recognition model, trained on a labelled source dataset, to
+an unlabelled target dataset, without accessing the actual source data.
+Previous approaches have attempted to address SFVUDA by leveraging
+self-supervision (e.g., enforcing temporal consistency) derived from the target
+data itself. In this work, we take an orthogonal approach by exploiting
+"web-supervision" from Large Language-Vision Models (LLVMs), driven by the
+rationale that LLVMs contain a rich world prior that is surprisingly robust to
+domain shift. We showcase the unreasonable effectiveness of integrating LLVMs
+for SFVUDA by devising an intuitive and parameter-efficient method, which we
+name Domain Adaptation with Large Language-Vision models (DALL-V), that
+distills the world prior and complementary source model information into a
+student network tailored for the target. Despite its simplicity, DALL-V
+achieves a significant improvement over state-of-the-art SFVUDA methods.
+
+ comment: Accepted at ICCV2023, 14 pages, 7 figures, code is available at + https://github.com/giaczara/dallv +
+
+
+
+
+ + ♻ ☆ GMD: Controllable Human Motion Synthesis via Guided Diffusion Models ICCV23 + + +
+ Denoising diffusion models have shown great promise in human motion synthesis +conditioned on natural language descriptions. However, integrating spatial +constraints, such as pre-defined motion trajectories and obstacles, remains a +challenge despite being essential for bridging the gap between isolated human +motion and its surrounding environment. To address this issue, we propose +Guided Motion Diffusion (GMD), a method that incorporates spatial constraints +into the motion generation process. Specifically, we propose an effective +feature projection scheme that manipulates motion representation to enhance the +coherency between spatial information and local poses. Together with a new +imputation formulation, the generated motion can reliably conform to spatial +constraints such as global motion trajectories. Furthermore, given sparse +spatial constraints (e.g. sparse keyframes), we introduce a new dense guidance +approach to turn a sparse signal, which is susceptible to being ignored during +the reverse steps, into denser signals to guide the generated motion to the +given constraints. Our extensive experiments justify the development of GMD, +which achieves a significant improvement over state-of-the-art methods in +text-based motion generation while allowing control of the synthesized motions +with spatial constraints. + +
+
+ comment: ICCV23. Project page: https://korrawe.github.io/gmd-project/ +
+
+
+
+
+ + ♻ ☆ EA-LSS: Edge-aware Lift-splat-shot Framework for 3D BEV Object Detection + + +
+ In recent years, great progress has been made in Lift-Splat-Shot-based
+(LSS-based) 3D object detection methods. However, inaccurate depth estimation
+remains an important constraint on the accuracy of camera-only and multi-modal
+3D object detection models, especially in regions where the depth changes
+significantly (i.e., the ``depth jump'' problem). In this paper, we propose a
+novel Edge-aware Lift-splat-shot (EA-LSS) framework. Specifically, an
+edge-aware depth fusion (EADF) module is proposed to alleviate the ``depth
+jump'' problem, and a fine-grained depth (FGD) module further enforces refined
+supervision on depth. Our EA-LSS framework is compatible with any LSS-based 3D
+object detection model, and effectively boosts performance with a negligible
+increase in inference time. Experiments on the nuScenes benchmark demonstrate
+that EA-LSS is effective in both camera-only and multi-modal models. It is
+worth mentioning that EA-LSS achieved state-of-the-art performance on the
+nuScenes test benchmark with mAP and NDS of 76.5% and 77.6%, respectively.
+
+
+
+
+ + ♻ ☆ Semantic Parsing of Colonoscopy Videos with Multi-Label Temporal + Networks + + +
+ Following the successful debut of polyp detection and characterization, more
+advanced automation tools are being developed for colonoscopy. The new
+automation tasks, such as quality metrics or report generation, require an
+understanding of the procedure flow, which includes activities, events,
+anatomical landmarks, etc. In this work, we present a method for automatic
+semantic parsing of colonoscopy videos. The method uses a novel DL multi-label
+temporal segmentation model trained in supervised and unsupervised regimes. We
+evaluate the accuracy of the method on a test set of over 300 annotated
+colonoscopy videos, and use ablation studies to explore the relative importance
+of the method's components.
+
+
+
+
+ + ♻ ☆ Robustness of SAM: Segment Anything Under Corruptions and Beyond + + +
+ Segment anything model (SAM), as the name suggests, is claimed to be capable +of cutting out any object and demonstrates impressive zero-shot transfer +performance with the guidance of a prompt. However, there is currently a lack +of comprehensive evaluation regarding its robustness under various corruptions. +Understanding SAM's robustness across different corruption scenarios is crucial +for its real-world deployment. Prior works show that SAM is biased towards +texture (style) rather than shape, motivated by which we start by investigating +SAM's robustness against style transfer, which is synthetic corruption. +Following the interpretation of the corruption's effect as style change, we +proceed to conduct a comprehensive evaluation of the SAM for its robustness +against 15 types of common corruption. These corruptions mainly fall into +categories such as digital, noise, weather, and blur. Within each of these +corruption categories, we explore 5 severity levels to simulate real-world +corruption scenarios. Beyond the corruptions, we further assess its robustness +regarding local occlusion and local adversarial patch attacks in images. To the +best of our knowledge, our work is the first of its kind to evaluate the +robustness of SAM under style change, local occlusion, and local adversarial +patch attacks. Considering that patch attacks visible to human eyes are easily +detectable, we also assess SAM's robustness against adversarial perturbations +that are imperceptible to human eyes. Overall, this work provides a +comprehensive empirical study on SAM's robustness, evaluating its performance +under various corruptions and extending the assessment to critical aspects like +local occlusion, local patch attacks, and imperceptible adversarial +perturbations, which yields valuable insights into SAM's practical +applicability and effectiveness in addressing real-world challenges. + +
+
+ comment: The first work evaluates the robustness of SAM under various + corruptions such as style transfer, local occlusion, and adversarial patch + attack +
+
+
+
+
+ + ♻ ☆ Implicit Neural Representation for Cooperative Low-light Image + Enhancement + + +
+ The following three factors restrict the application of existing low-light +image enhancement methods: unpredictable brightness degradation and noise, +inherent gap between metric-favorable and visual-friendly versions, and the +limited paired training data. To address these limitations, we propose an +implicit Neural Representation method for Cooperative low-light image +enhancement, dubbed NeRCo. It robustly recovers perceptual-friendly results in +an unsupervised manner. Concretely, NeRCo unifies the diverse degradation +factors of real-world scenes with a controllable fitting function, leading to +better robustness. In addition, for the output results, we introduce +semantic-orientated supervision with priors from the pre-trained +vision-language model. Instead of merely following reference images, it +encourages results to meet subjective expectations, finding more +visual-friendly solutions. Further, to ease the reliance on paired data and +reduce solution space, we develop a dual-closed-loop constrained enhancement +module. It is trained cooperatively with other affiliated modules in a +self-supervised manner. Finally, extensive experiments demonstrate the +robustness and superior effectiveness of our proposed NeRCo. Our code is +available at https://github.com/Ysz2022/NeRCo. + +
+
+
+
+
+ + ♻ ☆ Probable Domain Generalization via Quantile Risk Minimization NeurIPS 2022 + + +
+ Domain generalization (DG) seeks predictors which perform well on unseen test +distributions by leveraging data drawn from multiple related training +distributions or domains. To achieve this, DG is commonly formulated as an +average- or worst-case problem over the set of possible domains. However, +predictors that perform well on average lack robustness while predictors that +perform well in the worst case tend to be overly-conservative. To address this, +we propose a new probabilistic framework for DG where the goal is to learn +predictors that perform well with high probability. Our key idea is that +distribution shifts seen during training should inform us of probable shifts at +test time, which we realize by explicitly relating training and test domains as +draws from the same underlying meta-distribution. To achieve probable DG, we +propose a new optimization problem called Quantile Risk Minimization (QRM). By +minimizing the $\alpha$-quantile of predictor's risk distribution over domains, +QRM seeks predictors that perform well with probability $\alpha$. To solve QRM +in practice, we propose the Empirical QRM (EQRM) algorithm and provide: (i) a +generalization bound for EQRM; and (ii) the conditions under which EQRM +recovers the causal predictor as $\alpha \to 1$. In our experiments, we +introduce a more holistic quantile-focused evaluation protocol for DG and +demonstrate that EQRM outperforms state-of-the-art baselines on datasets from +WILDS and DomainBed. + +
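The quantile-risk objective sketched in the abstract can be made concrete with a few lines: compute one empirical risk per training domain and optimize their alpha-quantile instead of the mean or the maximum. The EQRM algorithm in the paper fits a smoothed estimate of the risk distribution; `torch.quantile` below is a simplified stand-in, and the alpha value is illustrative.

```python
import torch

def quantile_risk(per_domain_losses: list, alpha: float = 0.75) -> torch.Tensor:
    """Sketch: alpha-quantile of the per-domain risk distribution.
    alpha -> 1 approaches worst-case DG; alpha = 0.5 is close to the median risk."""
    risks = torch.stack([losses.mean() for losses in per_domain_losses])  # one risk per domain
    return torch.quantile(risks, alpha)

# usage sketch inside a training step with a shared predictor:
# domain_losses = [criterion(model(x_d), y_d) for (x_d, y_d) in domain_batches]
# quantile_risk(domain_losses, alpha=0.75).backward()
```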
+
+ comment: NeurIPS 2022 camera-ready (+ minor corrections) +
+
+
+
+
+ + ♻ ☆ CROSSFIRE: Camera Relocalization On Self-Supervised Features from an + Implicit Representation ICCV 2023 + + +
+ Beyond novel view synthesis, Neural Radiance Fields are useful for
+applications that interact with the real world. In this paper, we use them as
+an implicit map of a given scene and propose a camera relocalization algorithm
+tailored for this representation. The proposed method enables the precise
+position of a device to be computed in real time from a single RGB camera
+during navigation. In contrast with previous work, we do not rely on pose
+regression or photometric alignment, but rather use dense local features
+obtained through volumetric rendering that are specialized on the scene with a
+self-supervised objective. As a result, our algorithm is more accurate than
+competitors, able to operate in dynamic outdoor environments with changing
+lighting conditions, and can be readily integrated into any volumetric neural
+renderer.
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Understanding Silent Failures in Medical Image Classification MICCAI 23 + + +
+ To ensure the reliable use of classification systems in medical applications, +it is crucial to prevent silent failures. This can be achieved by either +designing classifiers that are robust enough to avoid failures in the first +place, or by detecting remaining failures using confidence scoring functions +(CSFs). A predominant source of failures in image classification is +distribution shifts between training data and deployment data. To understand +the current state of silent failure prevention in medical imaging, we conduct +the first comprehensive analysis comparing various CSFs in four biomedical +tasks and a diverse range of distribution shifts. Based on the result that none +of the benchmarked CSFs can reliably prevent silent failures, we conclude that +a deeper understanding of the root causes of failures in the data is required. +To facilitate this, we introduce SF-Visuals, an interactive analysis tool that +uses latent space clustering to visualize shifts and failures. On the basis of +various examples, we demonstrate how this tool can help researchers gain +insight into the requirements for safe application of classification systems in +the medical domain. The open-source benchmark and tool are at: +https://github.com/IML-DKFZ/sf-visuals. + +
+
+ comment: Accepted at MICCAI 23 +
+
+
+
+
+ + ♻ ☆ Panoptic Mapping with Fruit Completion and Pose Estimation for + Horticultural Robots IROS 2023 + + +
+ Monitoring plants and fruits at high resolution plays a key role in the future
+of agriculture. Accurate 3D information can pave the way to a diverse range of
+robotic applications in agriculture, ranging from autonomous harvesting to
+precise yield estimation. Obtaining such 3D information is non-trivial as
+agricultural environments are often repetitive and cluttered, and one has to
+account for the partial observability of fruit and plants. In this paper, we
+address the problem of jointly estimating complete 3D shapes of fruit and their
+pose in a 3D multi-resolution map built by a mobile robot. To this end, we
+propose an online multi-resolution panoptic mapping system where regions of
+interest are represented with a higher resolution. We exploit data to learn a
+general fruit shape representation that we use at inference time together with
+an occlusion-aware differentiable rendering pipeline to complete partial fruit
+observations and estimate the 7 DoF pose of each fruit in the map. The
+experiments presented in this paper, evaluated both in a controlled environment
+and in a commercial greenhouse, show that our novel algorithm yields higher
+completion and pose estimation accuracy than existing methods, with an
+improvement of 41% in completion accuracy and 52% in pose estimation accuracy,
+while keeping a low inference time of 0.6s on average. Code is available at:
+https://github.com/PRBonn/HortiMapping.
+
+ comment: 8 pages, IROS 2023 +
+
+
+
+
+ + ♻ ☆ Classification Committee for Active Deep Object Detection + + +
+ In object detection, the cost of labeling is very high because one must not
+only confirm the categories of multiple objects in an image but also accurately
+determine the bounding box of each object. Thus, integrating active learning
+into object detection is of considerable practical significance. In this paper,
+we propose a classification-committee method for active deep object detection
+that introduces a discrepancy mechanism among multiple classifiers for sample
+selection when training object detectors. The model contains a main detector
+and a classification committee. The main detector is the target object detector
+trained on a labeled pool composed of the selected informative images. The role
+of the classification committee is to select the most informative images
+according to their uncertainty values from the viewpoint of classification,
+which is expected to focus more on the discrepancy and representativeness of
+instances. Specifically, the committee computes the uncertainty of a given
+instance within an image by measuring its discrepancy across the committee
+members, which are pre-trained via the proposed Maximum Classifiers Discrepancy
+Group Loss (MCDGL). The most informative images are then determined by
+selecting the ones with many high-uncertainty instances. In addition, to
+mitigate the impact of interference instances, we design a Focus on Positive
+Instances Loss (FPIL) that gives the committee the ability to automatically
+focus on representative instances as well as precisely encode their
+discrepancies. Experiments are conducted on the Pascal VOC and COCO datasets
+with several popular object detectors, and the results show that our method
+outperforms state-of-the-art active learning methods, which verifies its
+effectiveness.
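A hedged sketch of the committee-disagreement idea above: score each detected instance by how much the committee classifiers disagree about its class, then count high-uncertainty instances per image to rank images for labeling. The disagreement measure (variance of predicted distributions) and the threshold are illustrative; the MCDGL and FPIL training losses from the paper are not reproduced here.

```python
import torch

def count_uncertain_instances(committee_logits: list, threshold: float = 0.05) -> int:
    """committee_logits: one (num_instances, num_classes) logit tensor per
    committee classifier for the detections in a single image."""
    probs = torch.stack([logits.softmax(dim=-1) for logits in committee_logits])
    # disagreement: variance of the predicted class distributions across members
    discrepancy = probs.var(dim=0).mean(dim=-1)       # (num_instances,)
    return int((discrepancy > threshold).sum())

# usage sketch: query the images with the most high-uncertainty instances
# scores = {img_id: count_uncertain_instances(logits_per_member[img_id]) for img_id in pool}
# to_label = sorted(scores, key=scores.get, reverse=True)[:budget]
```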
+
+
+
+
+ + ♻ ☆ Adapting Pre-trained Language Models to Vision-Language Tasks via + Dynamic Visual Prompting + + +
+ Pre-trained language models (PLMs) have played an increasingly important role
+in multimedia research. In terms of vision-language (VL) tasks, they often
+serve as a language encoder and still require an additional fusion network for
+VL reasoning, resulting in excessive memory overhead. In this paper, we focus
+on exploring PLMs as a stand-alone model for VL reasoning tasks. Inspired by
+the recently popular prompt tuning, we first prove that the processed visual
+features can also be projected onto the semantic space of PLMs and act as
+prompt tokens to bridge the gap between single- and multi-modal learning.
+However, this solution exhibits obvious redundancy in visual information and
+model inference, and the placement of prompt tokens also greatly affects the
+final performance. Based on these observations, we further propose a novel
+transfer learning approach for PLMs, termed Dynamic Visual Prompting (DVP).
+Concretely, DVP first deploys a cross-attention module to obtain text-related
+and compact visual prompt tokens, thereby greatly reducing the input length of
+PLMs. To obtain the optimal placement, we also equip DVP with a
+reinforcement-learning based search algorithm, which can automatically merge
+DVP with PLMs for different VL tasks via a very short search process. In
+addition, we also combine DVP with the recently popular adapter approach to
+keep most parameters of PLMs intact when adapting to VL tasks, helping PLMs
+achieve a quick shift between single- and multi-modal tasks. We apply DVP to
+two representative PLMs, namely BERT and T5, and conduct extensive experiments
+on a set of VL reasoning benchmarks including VQA 2.0, GQA, and SNLI-VE. The
+experimental results not only show the advantage of DVP in efficiency and
+performance, but also confirm its superiority in adapting pre-trained language
+models to VL tasks.
+
+
+
+
+ + ♻ ☆ Improving Adversarial Robustness of Masked Autoencoders via Test-time + Frequency-domain Prompting ICCV 2023 + + +
+ In this paper, we investigate the adversarial robustness of vision
+transformers that are equipped with BERT pretraining (e.g., BEiT, MAE). A
+surprising observation is that MAE has significantly worse adversarial
+robustness than other BERT pretraining methods. This observation drives us to
+rethink the basic differences between these BERT pretraining methods and how
+these differences affect robustness against adversarial perturbations. Our
+empirical analysis reveals that the adversarial robustness of BERT pretraining
+is highly related to the reconstruction target, i.e., predicting the raw pixels
+of masked image patches degrades the adversarial robustness of the model more
+than predicting the semantic context, since it guides the model to concentrate
+more on medium-/high-frequency components of images. Based on our analysis, we
+provide a simple yet effective way to boost the adversarial robustness of MAE.
+The basic idea is to use dataset-extracted domain knowledge to occupy the
+medium-/high-frequency components of images, thus narrowing the optimization
+space of adversarial perturbations. Specifically, we group the distribution of
+pretraining data and optimize a set of cluster-specific visual prompts in the
+frequency domain. These prompts are incorporated into input images through
+prototype-based prompt selection at test time. Extensive evaluation shows that
+our method clearly boosts MAE's adversarial robustness while maintaining its
+clean performance on ImageNet-1k classification. Our code is available at:
+https://github.com/shikiw/RobustMAE.
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged + Object Detection + + +
+ Camouflaged object detection (COD), aiming to segment camouflaged objects
+which exhibit patterns similar to the background, is a challenging task. Most
+existing works are dedicated to establishing specialized modules to identify
+camouflaged objects with complete and fine details, while the boundary cannot
+be well located due to the lack of object-related semantics. In this paper, we
+propose a novel ``pre-train, adapt and detect" paradigm to detect camouflaged
+objects. By introducing a large pre-trained model, abundant knowledge learned
+from massive multi-modal data can be directly transferred to COD. A lightweight
+parallel adapter is inserted to adjust the features to suit the downstream COD
+task. Extensive experiments on four challenging benchmark datasets demonstrate
+that our method outperforms existing state-of-the-art COD models by large
+margins. Moreover, we design a multi-task learning scheme for tuning the
+adapter to exploit the shareable knowledge across different semantic classes.
+Comprehensive experimental results show that the generalization ability of our
+model can be substantially improved with multi-task adapter initialization on
+source tasks and multi-task adaptation on target tasks.
+
+
+
+
+ + ♻ ☆ Improved YOLOv8 Detection Algorithm in Security Inspection Image + + +
+ Security inspection is the first line of defense for ensuring the safety of
+people's lives and property, and intelligent security inspection is an
+inevitable trend in the future development of the security inspection industry.
+To address the problems of overlapping objects, false detection of contraband,
+and missed detections in X-ray image detection, we propose CSS-YOLO, an
+improved X-ray contraband detection algorithm based on YOLOv8s.
+
+
 comment: 23 pages, 23 figures
+
+
+
+
+ + ♻ ☆ Event-based Camera Simulation using Monte Carlo Path Tracing with + Adaptive Denoising + + +
+ This paper presents an algorithm to obtain an event-based video from noisy +frames given by physics-based Monte Carlo path tracing over a synthetic 3D +scene. Given the nature of dynamic vision sensor (DVS), rendering event-based +video can be viewed as a process of detecting the changes from noisy brightness +values. We extend a denoising method based on a weighted local regression (WLR) +to detect the brightness changes rather than applying denoising to every pixel. +Specifically, we derive a threshold to determine the likelihood of event +occurrence and reduce the number of times to perform the regression. Our method +is robust to noisy video frames obtained from a few path-traced samples. +Despite its efficiency, our method performs comparably to or even better than +an approach that exhaustively denoises every frame. + +
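For context, the sketch below shows the standard DVS change-detection model that event-based simulation builds on: an event fires when the log intensity at a pixel moves past a contrast threshold since the last event at that pixel. The paper's contribution, detecting these changes robustly from noisy path-traced frames via weighted local regression, is not reproduced here; the threshold value is illustrative.

```python
import numpy as np

def frames_to_events(frames: np.ndarray, timestamps: np.ndarray, threshold: float = 0.2):
    """Minimal DVS-style event generation from a stack of frames (T, H, W)."""
    log_frames = np.log(frames.astype(np.float64) + 1e-6)
    reference = log_frames[0].copy()          # last log intensity at which each pixel fired
    events = []                               # (t, y, x, polarity) tuples
    for t, frame in zip(timestamps[1:], log_frames[1:]):
        diff = frame - reference
        ys, xs = np.where(np.abs(diff) >= threshold)
        for y, x in zip(ys, xs):
            events.append((t, y, x, int(np.sign(diff[y, x]))))
            reference[y, x] = frame[y, x]     # reset the reference only at fired pixels
    return events
```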
+
+ comment: 8 pages, 6 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Enhancing Modality-Agnostic Representations via Meta-Learning for Brain + Tumor Segmentation ICCV 2023 + + +
+ In medical vision, different imaging modalities provide complementary +information. However, in practice, not all modalities may be available during +inference or even training. Previous approaches, e.g., knowledge distillation +or image synthesis, often assume the availability of full modalities for all +patients during training; this is unrealistic and impractical due to the +variability in data collection across sites. We propose a novel approach to +learn enhanced modality-agnostic representations by employing a meta-learning +strategy in training, even when only limited full modality samples are +available. Meta-learning enhances partial modality representations to full +modality representations by meta-training on partial modality data and +meta-testing on limited full modality samples. Additionally, we co-supervise +this feature enrichment by introducing an auxiliary adversarial learning +branch. More specifically, a missing modality detector is used as a +discriminator to mimic the full modality setting. Our segmentation framework +significantly outperforms state-of-the-art brain tumor segmentation techniques +in missing modality scenarios. + +
+
+ comment: Accepted in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Boosting Adversarial Transferability by Block Shuffle and Rotation + + +
+ Adversarial examples mislead deep neural networks with imperceptible +perturbations and have brought significant threats to deep learning. An +important aspect is their transferability, which refers to their ability to +deceive other models, thus enabling attacks in the black-box setting. Though +various methods have been proposed to boost transferability, the performance +still falls short compared with white-box attacks. In this work, we observe +that existing input transformation based attacks, one of the mainstream +transfer-based attacks, result in different attention heatmaps on various +models, which might limit the transferability. We also find that breaking the +intrinsic relation of the image can disrupt the attention heatmap of the +original image. Based on this finding, we propose a novel input transformation +based attack called block shuffle and rotation (BSR). Specifically, BSR splits +the input image into several blocks, then randomly shuffles and rotates these +blocks to construct a set of new images for gradient calculation. Empirical +evaluations on the ImageNet dataset demonstrate that BSR could achieve +significantly better transferability than the existing input transformation +based methods under single-model and ensemble-model settings. Combining BSR +with the current input transformation method can further improve the +transferability, which significantly outperforms the state-of-the-art methods. + +
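A hedged sketch of the block shuffle and rotation transformation described above, assuming a square input tensor of shape (C, H, W); the grid size, the number of transformed copies, and the gradient-averaging usage in the trailing comment are illustrative defaults rather than the paper's exact settings.

```python
import torch

def block_shuffle_rotate(image: torch.Tensor, blocks: int = 2) -> torch.Tensor:
    """Sketch of a BSR-style input transformation: split a (C, H, W) image into
    a grid of blocks, randomly shuffle and rotate the blocks, then reassemble."""
    c, h, w = image.shape
    bh, bw = h // blocks, w // blocks
    patches = [image[:, i * bh:(i + 1) * bh, j * bw:(j + 1) * bw]
               for i in range(blocks) for j in range(blocks)]
    order = torch.randperm(len(patches)).tolist()
    rotated = [torch.rot90(patches[idx], k=int(torch.randint(0, 4, (1,))), dims=(1, 2))
               for idx in order]
    rows = [torch.cat(rotated[i * blocks:(i + 1) * blocks], dim=2) for i in range(blocks)]
    return torch.cat(rows, dim=1)

# usage sketch: average gradients over several transformed copies of the input
# grad = torch.stack([torch.autograd.grad(loss_fn(model(block_shuffle_rotate(x))), x)[0]
#                     for _ in range(20)]).mean(dim=0)
```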
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information +from multiple views of a shared context. Recent works intuitively apply +contrastive learning to different views in a pairwise manner, which is still +scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works study the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the perspective of information theory and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided hierarchical Progressive +Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the +distribution between views to reduce view-specific noise. In the set-tier, IPMC +constructs self-adjusted contrasting pools, which are adaptively modified by a +view filter. Lastly, in the instance-tier, we adopt a designed unified loss to +learn representations and reduce the gradient interference. Theoretically and +empirically, we demonstrate the superiority of IPMC over state-of-the-art +methods. + +
+
+
 comment: This paper has been accepted by the journal Neural Networks (Elsevier)
 in 2023. A revised manuscript of arXiv:2109.02344
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning captures comprehensive information from +multiple views of a shared context. Recent works intuitively apply contrastive +learning (CL) to learn representations, regarded as a pairwise manner, which is +still scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; and evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works research the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the information theoretical perspective and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided heuristic Progressive Multi-view +Coding (IPMC). In the distribution-tier, IPMC aligns the distribution between +views to reduce view-specific noise. In the set-tier, IPMC builds self-adjusted +pools for contrasting, which utilizes a view filter to adaptively modify the +pools. Lastly, in the instance-tier, we adopt a designed unified loss to learn +discriminative representations and reduce the gradient interference. +Theoretically and empirically, we demonstrate the superiority of IPMC over +state-of-the-art methods. + +
+
+
 comment: We have uploaded a new version of this paper as arXiv:2308.10522, so
 we have to withdraw this paper
+
+
+
+
+ + ♻ ☆ FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly + + +
+ While text-3D editing has made significant strides in leveraging score +distillation sampling, emerging approaches still fall short in delivering +separable, precise and consistent outcomes that are vital to content creation. +In response, we introduce FocalDreamer, a framework that merges base shape with +editable parts according to text prompts for fine-grained editing within +desired regions. Specifically, equipped with geometry union and dual-path +rendering, FocalDreamer assembles independent 3D parts into a complete object, +tailored for convenient instance reuse and part-wise control. We propose +geometric focal loss and style consistency regularization, which encourage +focal fusion and congruent overall appearance. Furthermore, FocalDreamer +generates high-fidelity geometry and PBR textures which are compatible with +widely-used graphics engines. Extensive experiments have highlighted the +superior editing capabilities of FocalDreamer in both quantitative and +qualitative evaluations. + +
+
+ comment: Project website: https://focaldreamer.github.io +
+
+
+
+
+ + ♻ ☆ WanJuan: A Comprehensive Multimodal Dataset for Advancing English and + Chinese Large Models + + +
+ The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the +development of large models, leading to the creation of numerous impressive +large language models(LLMs) and multimodal large language models (MLLMs). These +cutting-edge models owe their remarkable performance to high-quality data. +However, the details of the training data used in leading paradigms are often +kept confidential. This lack of transparency, coupled with the scarcity of +open-source data, impedes further developments within the community. As a +response, this paper presents "Wan Juan", a large-scale multimodal dataset +composed of both Chinese and English data, collected from a wide range of web +sources. The dataset incorporates text, image-text, and video modalities, with +a total volume exceeding 2TB. It was utilized in the training of InternLM, a +model that demonstrated significant advantages in multi-dimensional evaluations +when compared to models of a similar scale. All data can be accessed at +https://opendatalab.org.cn/WanJuan1.0. + +
+
+ comment: Technical Report +
+
+
+
+
+
 ♻ ☆ WMFormer++: Nested Transformer for Visible Watermark Removal via Implicit
 Joint Learning
+ Watermarking serves as a widely adopted approach to safeguard media +copyright. In parallel, the research focus has extended to watermark removal +techniques, offering an adversarial means to enhance watermark robustness and +foster advancements in the watermarking field. Existing watermark removal +methods mainly rely on UNet with task-specific decoder branches--one for +watermark localization and the other for background image restoration. However, +watermark localization and background restoration are not isolated tasks; +precise watermark localization inherently implies regions necessitating +restoration, and the background restoration process contributes to more +accurate watermark localization. To holistically integrate information from +both branches, we introduce an implicit joint learning paradigm. This empowers +the network to autonomously navigate the flow of information between implicit +branches through a gate mechanism. Furthermore, we employ cross-channel +attention to facilitate local detail restoration and holistic structural +comprehension, while harnessing nested structures to integrate multi-scale +information. Extensive experiments are conducted on various challenging +benchmarks to validate the effectiveness of our proposed method. The results +demonstrate our approach's remarkable superiority, surpassing existing +state-of-the-art methods by a large margin. + +
+
+
+
+
+ + ♻ ☆ DatasetEquity: Are All Samples Created Equal? In The Quest For Equity + Within Datasets ICCV 2023 + + +
+ Data imbalance is a well-known issue in the field of machine learning, +attributable to the cost of data collection, the difficulty of labeling, and +the geographical distribution of the data. In computer vision, bias in data +distribution caused by image appearance remains highly unexplored. Compared to +categorical distributions using class labels, image appearance reveals complex +relationships between objects beyond what class labels provide. Clustering deep +perceptual features extracted from raw pixels gives a richer representation of +the data. This paper presents a novel method for addressing data imbalance in +machine learning. The method computes sample likelihoods based on image +appearance using deep perceptual embeddings and clustering. It then uses these +likelihoods to weigh samples differently during training with a proposed +$\textbf{Generalized Focal Loss}$ function. This loss can be easily integrated +with deep learning algorithms. Experiments validate the method's effectiveness +across autonomous driving vision datasets including KITTI and nuScenes. The +loss function improves state-of-the-art 3D object detection methods, achieving +over $200\%$ AP gains on under-represented classes (Cyclist) in the KITTI +dataset. The results demonstrate the method is generalizable, complements +existing techniques, and is particularly beneficial for smaller datasets and +rare classes. Code is available at: +https://github.com/towardsautonomy/DatasetEquity + +
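The sketch below illustrates one way to turn the appearance-based likelihood idea into sample weights: cluster deep perceptual embeddings, treat the relative size of a sample's cluster as its likelihood, and down-weight common samples with a focal-style term. The clustering choice, the gamma exponent, and the weight formula are assumptions; the paper couples such weights with its Generalized Focal Loss inside the detection objective.

```python
import numpy as np
from sklearn.cluster import KMeans

def appearance_based_weights(embeddings: np.ndarray, n_clusters: int = 50,
                             gamma: float = 1.0) -> np.ndarray:
    """embeddings: (num_samples, dim) deep perceptual features of raw images."""
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(embeddings)
    counts = np.bincount(labels, minlength=n_clusters)
    likelihood = counts[labels] / len(labels)     # empirical sample likelihood from cluster size
    return (1.0 - likelihood) ** gamma            # rare-appearance samples get larger weights

# usage sketch: multiply each sample's detection loss by its weight during training
```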
+
+ comment: ICCV 2023 Workshop +
+
+
+
+
+ + ♻ ☆ bbOCR: An Open-source Multi-domain OCR Pipeline for Bengali Documents + + +
+ Despite the existence of numerous Optical Character Recognition (OCR) tools, +the lack of comprehensive open-source systems hampers the progress of document +digitization in various low-resource languages, including Bengali. Low-resource +languages, especially those with an alphasyllabary writing system, suffer from +the lack of large-scale datasets for various document OCR components such as +word-level OCR, document layout extraction, and distortion correction; which +are available as individual modules in high-resource languages. In this paper, +we introduce Bengali$.$AI-BRACU-OCR (bbOCR): an open-source scalable document +OCR system that can reconstruct Bengali documents into a structured searchable +digitized format that leverages a novel Bengali text recognition model and two +novel synthetic datasets. We present extensive component-level and system-level +evaluation: both use a novel diversified evaluation dataset and comprehensive +evaluation metrics. Our extensive evaluation suggests that our proposed +solution is preferable over the current state-of-the-art Bengali OCR systems. +The source codes and datasets are available here: +https://bengaliai.github.io/bbocr. + +
+
+
+
+
+ + ♻ ☆ Embedded Feature Similarity Optimization with Specific Parameter + Initialization for 2D/3D Medical Image Registration + + +
+ We present a novel deep learning-based framework, Embedded Feature Similarity
+Optimization with Specific Parameter Initialization (SOPI), for 2D/3D medical
+image registration, a most challenging problem due to difficulties such as
+dimensional mismatch, heavy computational load, and the lack of a gold-standard
+evaluation. The framework we design includes a parameter specification module
+to efficiently choose the initialization pose parameters and a fine-registration
+module to align the images. The proposed framework takes multi-scale feature
+extraction into consideration using a novel composite connection encoder with
+special training techniques. We compare the method with both learning-based and
+optimization-based methods on an in-house CT/X-ray dataset as well as simulated
+data to further evaluate performance. Our experiments demonstrate that the
+proposed method improves registration performance and thereby outperforms
+existing methods in terms of accuracy and running time. We also show the
+potential of the proposed method as an initial pose estimator. The code is
+available at https://github.com/m1nhengChen/SOPI
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Unsupervised Anomaly Detection in Medical Images with a Memory-augmented + Multi-level Cross-attentional Masked Autoencoder MICCAI + + +
+ Unsupervised anomaly detection (UAD) aims to find anomalous images by
+optimising a detector using a training set that contains only normal images.
+UAD approaches can be based on reconstruction methods, self-supervised
+approaches, and ImageNet pre-trained models. Reconstruction methods, which
+detect anomalies from image reconstruction errors, are advantageous because
+they do not rely on the design of problem-specific pretext tasks needed by
+self-supervised approaches, nor on the unreliable translation of models
+pre-trained on non-medical datasets. However, reconstruction methods may fail
+because they can have low reconstruction errors even for anomalous images. In
+this paper, we introduce a new reconstruction-based UAD approach that addresses
+this low-reconstruction-error issue for anomalous images. Our UAD approach, the
+memory-augmented multi-level cross-attentional masked autoencoder (MemMC-MAE),
+is a transformer-based approach, consisting of a novel memory-augmented
+self-attention operator for the encoder and a new multi-level cross-attention
+operator for the decoder. MemMC-MAE masks large parts of the input image during
+its reconstruction, reducing the risk that it will produce low reconstruction
+errors because anomalies are likely to be masked and cannot be reconstructed.
+However, when the anomaly is not masked, the normal patterns stored in the
+encoder's memory combined with the decoder's multi-level cross-attention will
+constrain the accurate reconstruction of the anomaly. We show that our method
+achieves SOTA anomaly detection and localisation on colonoscopy, pneumonia, and
+COVID-19 chest X-ray datasets.
+
+ comment: Accepted to MICCAI MLMI2023 +
+
+
+
+
+ + ♻ ☆ Robust Detection, Association, and Localization of Vehicle Lights: A + Context-Based Cascaded CNN Approach and Evaluations + + +
+ Vehicle light detection, association, and localization are required for +important downstream safe autonomous driving tasks, such as predicting a +vehicle's light state to determine if the vehicle is making a lane change or +turning. Currently, many vehicle light detectors use single-stage detectors +which predict bounding boxes to identify a vehicle light, in a manner decoupled +from vehicle instances. In this paper, we present a method for detecting a +vehicle light given an upstream vehicle detection and approximation of a +visible light's center. Our method predicts four approximate corners associated +with each vehicle light. We experiment with CNN architectures, data +augmentation, and contextual preprocessing methods designed to reduce +surrounding-vehicle confusion. We achieve an average distance error from the +ground truth corner of 4.77 pixels, about 16.33% of the size of the vehicle +light on average. We train and evaluate our model on the LISA Lights Dataset, +allowing us to thoroughly evaluate our vehicle light corner detection model on +a large variety of vehicle light shapes and lighting conditions. We propose +that this model can be integrated into a pipeline with vehicle detection and +vehicle light center detection to make a fully-formed vehicle light detection +network, valuable to identifying trajectory-informative signals in driving +scenes. + +
+
+
+
+
+ + ♻ ☆ Perceptual Grouping in Contrastive Vision-Language Models ICCV 2023 + + +
+ Recent advances in zero-shot image recognition suggest that vision-language +models learn generic visual representations with a high degree of semantic +information that may be arbitrarily probed with natural language phrases. +Understanding an image, however, is not just about understanding what content +resides within an image, but importantly, where that content resides. In this +work we examine how well vision-language models are able to understand where +objects reside within an image and group together visually related parts of the +imagery. We demonstrate how contemporary vision and language representation +learning models based on contrastive losses and large web-based data capture +limited object localization information. We propose a minimal set of +modifications that results in models that uniquely learn both semantic and +spatial information. We measure this performance in terms of zero-shot image +recognition, unsupervised bottom-up and top-down semantic segmentations, as +well as robustness analyses. We find that the resulting model achieves +state-of-the-art results in terms of unsupervised segmentation, and demonstrate +that the learned representations are uniquely robust to spurious correlations +in datasets designed to probe the causal behavior of vision models. + +
+
+ comment: Accepted and presented at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Geo6D: Geometric Constraints Learning for 6D Pose Estimation + + +
+ Numerous 6D pose estimation methods have been proposed that employ end-to-end
+regression to directly estimate the target pose parameters. Since the visible
+features of objects are implicitly influenced by their poses, the network
+allows inferring the pose by analyzing the differences in features in the
+visible region. However, due to the unpredictable and unrestricted range of
+pose variations, the implicitly learned visible feature-pose constraints are
+insufficiently covered by the training samples, making the network vulnerable
+to unseen object poses. To tackle these challenges, we propose a novel
+geometric constraints learning approach called Geo6D for direct regression 6D
+pose estimation methods. It introduces a pose transformation formula expressed
+in a relative offset representation, which is leveraged as geometric
+constraints to reconstruct the input and output targets of the network. These
+reconstructed data enable the network to estimate the pose based on explicit
+geometric constraints, and the relative offset representation mitigates the
+issue of the pose distribution gap. Extensive experimental results show that
+when equipped with Geo6D, direct 6D methods achieve state-of-the-art
+performance on multiple datasets and demonstrate significant effectiveness,
+even with only 10% of the data.
+
+
+
+
+ + ♻ ☆ Scene-Aware Feature Matching ICCV 2023 + + +
+ Current feature matching methods focus on point-level matching, pursuing
+better representation learning of individual features, but lacking further
+understanding of the scene. This results in significant performance degradation
+when handling challenging scenes such as scenes with large viewpoint and
+illumination changes. To tackle this problem, we propose a novel model named
+SAM, which applies attentional grouping to guide Scene-Aware feature Matching.
+SAM handles multi-level features, i.e., image tokens and group tokens, with
+attention layers, and groups the image tokens with the proposed token grouping
+module. Our model can be trained with ground-truth matches only and produces
+reasonable grouping results. With the scene-aware grouping guidance, SAM is not
+only more accurate and robust but also more interpretable than conventional
+feature matching models. Extensive experiments on various applications,
+including homography estimation, pose estimation, and image matching,
+demonstrate that our model achieves state-of-the-art performance.
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Relightify: Relightable 3D Faces from a Single Image via Diffusion + Models ICCV 2023 + + +
+ Following the remarkable success of diffusion models on image generation, +recent works have also demonstrated their impressive ability to address a +number of inverse problems in an unsupervised way, by properly constraining the +sampling process based on a conditioning input. Motivated by this, in this +paper, we present the first approach to use diffusion models as a prior for +highly accurate 3D facial BRDF reconstruction from a single image. We start by +leveraging a high-quality UV dataset of facial reflectance (diffuse and +specular albedo and normals), which we render under varying illumination +settings to simulate natural RGB textures and, then, train an unconditional +diffusion model on concatenated pairs of rendered textures and reflectance +components. At test time, we fit a 3D morphable model to the given image and +unwrap the face in a partial UV texture. By sampling from the diffusion model, +while retaining the observed texture part intact, the model inpaints not only +the self-occluded areas but also the unknown reflectance components, in a +single sequence of denoising steps. In contrast to existing methods, we +directly acquire the observed texture from the input image, thus, resulting in +more faithful and consistent reflectance estimation. Through a series of +qualitative and quantitative comparisons, we demonstrate superior performance +in both texture completion as well as reflectance reconstruction tasks. + +
+
+ comment: ICCV 2023, 15 pages, 14 figures. Project page: + https://foivospar.github.io/Relightify/ +
+
+
+
+
+ + ♻ ☆ EAVL: Explicitly Align Vision and Language for Referring Image + Segmentation + + +
+ Referring image segmentation aims to segment an object mentioned in natural
+language from an image. A main challenge is language-related localization,
+which means locating the object with the relevant language. Previous approaches
+mainly focus on the fusion of vision and language features without fully
+addressing language-related localization. In previous approaches, fused
+vision-language features are directly fed into a decoder and pass through a
+convolution with a fixed kernel to obtain the result, which follows a similar
+pattern to traditional image segmentation. This approach does not explicitly
+align language and vision features in the segmentation stage, resulting in
+suboptimal language-related localization. Different from previous methods, we
+propose Explicitly Align the Vision and Language for Referring Image
+Segmentation (EAVL). Instead of using a fixed convolution kernel, we propose an
+Aligner which explicitly aligns the vision and language features in the
+segmentation stage. Specifically, a series of unfixed convolution kernels is
+generated based on the input l, and these kernels are then used to explicitly
+align the vision and language features. To achieve this, we generate multiple
+queries that represent different emphases of the language expression. These
+queries are transformed into a series of query-based convolution kernels. Then,
+we utilize these kernels to perform convolutions in the segmentation stage and
+obtain a series of segmentation masks. The final result is obtained through the
+aggregation of all masks. Our method not only fuses vision and language
+features effectively but also exploits their potential in the segmentation
+stage. Most importantly, we explicitly align language features of different
+emphases with the image features to achieve language-related localization. Our
+method surpasses previous state-of-the-art methods on RefCOCO, RefCOCO+, and
+G-Ref by large margins.
+
+ comment: 10 pages, 4 figures. arXiv admin note: text overlap with + arXiv:2305.14969 +
+
+
+
+
+ + ♻ ☆ DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion ICCV 2023 + + +
+ Multi-modality image fusion aims to combine different modalities to produce +fused images that retain the complementary features of each modality, such as +functional highlights and texture details. To leverage strong generative priors +and address challenges such as unstable training and lack of interpretability +for GAN-based generative methods, we propose a novel fusion algorithm based on +the denoising diffusion probabilistic model (DDPM). The fusion task is +formulated as a conditional generation problem under the DDPM sampling +framework, which is further divided into an unconditional generation subproblem +and a maximum likelihood subproblem. The latter is modeled in a hierarchical +Bayesian manner with latent variables and inferred by the +expectation-maximization (EM) algorithm. By integrating the inference solution +into the diffusion sampling iteration, our method can generate high-quality +fused images with natural image generative priors and cross-modality +information from source images. Note that all we required is an unconditional +pre-trained generative model, and no fine-tuning is needed. Our extensive +experiments indicate that our approach yields promising fusion results in +infrared-visible image fusion and medical image fusion. The code is available +at \url{https://github.com/Zhaozixiang1228/MMIF-DDFM}. + +
+
+ comment: Accepted by ICCV 2023 (Oral) +
+
+
+
+
+ + ♻ ☆ SAFE: Sensitivity-Aware Features for Out-of-Distribution Object + Detection + + +
+ We address the problem of out-of-distribution (OOD) detection for the task of +object detection. We show that residual convolutional layers with batch +normalisation produce Sensitivity-Aware FEatures (SAFE) that are consistently +powerful for distinguishing in-distribution from out-of-distribution +detections. We extract SAFE vectors for every detected object, and train a +multilayer perceptron on the surrogate task of distinguishing adversarially +perturbed from clean in-distribution examples. This circumvents the need for +realistic OOD training data, computationally expensive generative models, or +retraining of the base object detector. SAFE outperforms the state-of-the-art +OOD object detectors on multiple benchmarks by large margins, e.g. reducing the +FPR95 by an absolute 30.6% from 48.3% to 17.7% on the OpenImages dataset. + +
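A minimal sketch of the surrogate training step described above: a small MLP is trained to separate clean feature vectors from perturbed ones, and its output is then used as an OOD score. The random placeholder features and the simple additive perturbation below are assumptions standing in for SAFE vectors extracted from clean and adversarially perturbed detections.

```python
import torch
import torch.nn as nn

# Placeholder data standing in for SAFE feature vectors of detected objects:
# clean in-distribution features vs. features from adversarially perturbed inputs.
clean = torch.randn(1024, 256)
perturbed = clean + 0.3 * torch.randn_like(clean)   # stand-in for an adversarial perturbation

X = torch.cat([clean, perturbed])
y = torch.cat([torch.zeros(len(clean)), torch.ones(len(perturbed))])

mlp = nn.Sequential(nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, 1))
opt = torch.optim.Adam(mlp.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# Surrogate task: distinguish perturbed from clean feature vectors.
for _ in range(100):
    opt.zero_grad()
    loss = loss_fn(mlp(X).squeeze(-1), y)
    loss.backward()
    opt.step()

# At test time, the sigmoid output serves as a per-detection OOD score.
scores = torch.sigmoid(mlp(torch.randn(5, 256))).squeeze(-1)
print(scores)
```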
+
+
+
+
+ + ♻ ☆ MIMT: Multi-Illuminant Color Constancy via Multi-Task Local Surface and + Light Color Learning + + +
+ The assumption of a uniform light color distribution is no longer applicable +in scenes that have multiple light colors. Most color constancy methods are +designed to deal with a single light color, and thus are erroneous when applied +to multiple light colors. The spatial variability in multiple light colors +causes the color constancy problem to be more challenging and requires the +extraction of local surface/light information. Motivated by this, we introduce +a multi-task learning method to discount multiple light colors in a single +input image. To have better cues of the local surface/light colors under +multiple light color conditions, we design a novel multi-task learning +framework. Our framework includes auxiliary tasks of achromatic-pixel detection +and surface-color similarity prediction, providing better cues for local light +and surface colors, respectively. Moreover, to ensure that our model maintains +the constancy of surface colors regardless of the variations of light colors, a +novel local surface color feature preservation scheme is developed. We +demonstrate that our model achieves 47.1% improvement (from 4.69 mean angular +error to 2.48) compared to a state-of-the-art multi-illuminant color constancy +method on a multi-illuminant dataset (LSMI). + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Concept Evolution in Deep Learning Training: A Unified Interpretation + Framework and Discoveries CIKM'23 + + +
+ We present ConceptEvo, a unified interpretation framework for deep neural +networks (DNNs) that reveals the inception and evolution of learned concepts +during training. Our work addresses a critical gap in DNN interpretation +research, as existing methods primarily focus on post-training interpretation. +ConceptEvo introduces two novel technical contributions: (1) an algorithm that +generates a unified semantic space, enabling side-by-side comparison of +different models during training, and (2) an algorithm that discovers and +quantifies important concept evolutions for class predictions. Through a +large-scale human evaluation and quantitative experiments, we demonstrate that +ConceptEvo successfully identifies concept evolutions across different models, +which are not only comprehensible to humans but also crucial for class +predictions. ConceptEvo is applicable to both modern DNN architectures, such as +ConvNeXt, and classic DNNs, such as VGGs and InceptionV3. + +
+
+ comment: Accepted at CIKM'23 +
+
+
+
+
+ + ♻ ☆ Discriminative Class Tokens for Text-to-Image Diffusion Models ICCV 2023 + + +
+ Recent advances in text-to-image diffusion models have enabled the generation +of diverse and high-quality images. While impressive, the images often fall +short of depicting subtle details and are susceptible to errors due to +ambiguity in the input text. One way of alleviating these issues is to train +diffusion models on class-labeled datasets. This approach has two +disadvantages: (i) supervised datasets are generally small compared to +large-scale scraped text-image datasets on which text-to-image models are +trained, affecting the quality and diversity of the generated images, or (ii) +the input is a hard-coded label, as opposed to free-form text, limiting the +control over the generated images. + In this work, we propose a non-invasive fine-tuning technique that +capitalizes on the expressive potential of free-form text while achieving high +accuracy through discriminative signals from a pretrained classifier. This is +done by iteratively modifying the embedding of an added input token of a +text-to-image diffusion model, by steering generated images toward a given +target class according to a classifier. Our method is fast compared to prior +fine-tuning methods and does not require a collection of in-class images or +retraining of a noise-tolerant classifier. We evaluate our method extensively, +showing that the generated images are: (i) more accurate and of higher quality +than standard diffusion models, (ii) can be used to augment training data in a +low-resource setting, and (iii) reveal information about the data used to train +the guiding classifier. The code is available at +\url{https://github.com/idansc/discriminative_class_tokens}. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Dive into Deep Learning + + +
+ This open-source book represents our attempt to make deep learning +approachable, teaching readers the concepts, the context, and the code. The +entire book is drafted in Jupyter notebooks, seamlessly integrating exposition +figures, math, and interactive examples with self-contained code. Our goal is +to offer a resource that could (i) be freely available for everyone; (ii) offer +sufficient technical depth to provide a starting point on the path to actually +becoming an applied machine learning scientist; (iii) include runnable code, +showing readers how to solve problems in practice; (iv) allow for rapid +updates, both by us and also by the community at large; (v) be complemented by +a forum for interactive discussion of technical details and to answer +questions. + +
+
+ comment: (HTML) https://D2L.ai (GitHub) https://github.com/d2l-ai/d2l-en/ +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Multi-event Video-Text Retrieval ICCV2023 + + +
+ Video-Text Retrieval (VTR) is a crucial multi-modal task in an era of massive +video-text data on the Internet. A plethora of work characterized by using a +two-stream Vision-Language model architecture that learns a joint +representation of video-text pairs has become a prominent approach for the VTR +task. However, these models operate under the assumption of bijective +video-text correspondences and neglect a more practical scenario where video +content usually encompasses multiple events, while texts like user queries or +webpage metadata tend to be specific and correspond to single events. This +establishes a gap between the previous training objective and real-world +applications, leading to the potential performance degradation of earlier +models during inference. In this study, we introduce the Multi-event Video-Text +Retrieval (MeVTR) task, addressing scenarios in which each video contains +multiple different events, as a niche scenario of the conventional Video-Text +Retrieval Task. We present a simple model, Me-Retriever, which incorporates key +event video representation and a new MeVTR loss for the MeVTR task. +Comprehensive experiments show that this straightforward framework outperforms +other models in the Video-to-Text and Text-to-Video tasks, effectively +establishing a robust baseline for the MeVTR task. We believe this work serves +as a strong foundation for future studies. Code is available at +https://github.com/gengyuanmax/MeVTR. + +
+
+ comment: accepted to ICCV2023 +
+
+
+
+
+ + ☆ L^2R: Lifelong Learning for First-stage Retrieval with + Backward-Compatible Representations CIKM2023 + + +
+ First-stage retrieval is a critical task that aims to retrieve relevant +document candidates from a large-scale collection. While existing retrieval +models have achieved impressive performance, they are mostly studied on static +data sets, ignoring that in the real-world, the data on the Web is continuously +growing with potential distribution drift. Consequently, retrievers trained on +static old data may not suit new-coming data well and inevitably produce +sub-optimal results. In this work, we study lifelong learning for first-stage +retrieval, especially focusing on the setting where the emerging documents are +unlabeled since relevance annotation is expensive and may not keep up with data +emergence. Under this setting, we aim to develop model updating with two goals: +(1) to effectively adapt to the evolving distribution with the unlabeled +new-coming data, and (2) to avoid re-inferring all embeddings of old documents +to efficiently update the index each time the model is updated. + We first formalize the task and then propose a novel Lifelong Learning method +for the first-stage Retrieval, namely L^2R. L^2R adopts the typical memory +mechanism for lifelong learning, and incorporates two crucial components: (1) +selecting diverse support negatives for model training and memory updating for +effective model adaptation, and (2) a ranking alignment objective to ensure the +backward-compatibility of representations to save the cost of index rebuilding +without hurting the model performance. For evaluation, we construct two new +benchmarks from LoTTE and Multi-CPR datasets to simulate the document +distribution drift in realistic retrieval scenarios. Extensive experiments show +that L^2R significantly outperforms competitive lifelong learning baselines. + +
+
+ comment: accepted by CIKM2023 +
+
+
+
+
+ + ☆ Pre-training with Aspect-Content Text Mutual Prediction for Multi-Aspect + Dense Retrieval + + +
+ Grounded on pre-trained language models (PLMs), dense retrieval has been +studied extensively on plain text. In contrast, there has been little research +on retrieving data with multiple aspects using dense models. In the scenarios +such as product search, the aspect information plays an essential role in +relevance matching, e.g., category: Electronics, Computers, and Pet Supplies. A +common way of leveraging aspect information for multi-aspect retrieval is to +introduce an auxiliary classification objective, i.e., using item contents to +predict the annotated value IDs of item aspects. However, by learning the value +embeddings from scratch, this approach may not capture the various semantic +similarities between the values sufficiently. To address this limitation, we +leverage the aspect information as text strings rather than class IDs during +pre-training so that their semantic similarities can be naturally captured in +the PLMs. To facilitate effective retrieval with the aspect strings, we propose +mutual prediction objectives between the text of the item aspect and content. +In this way, our model makes more sufficient use of aspect information than +conducting undifferentiated masked language modeling (MLM) on the concatenated +text of aspects and content. Extensive experiments on two real-world datasets +(product and mini-program search) show that our approach can outperform +competitive baselines both treating aspect values as classes and conducting the +same MLM for aspect and content strings. Code and related dataset will be +available at the URL \footnote{https://github.com/sunxiaojie99/ATTEMPT}. + +
+
+ comment: accepted by cikm2023 +
+
+
+
+
+ + ☆ On the Opportunities and Challenges of Offline Reinforcement Learning + for Recommender Systems + + +
+ Reinforcement learning serves as a potent tool for modeling dynamic user +interests within recommender systems, garnering increasing research attention +of late. However, a significant drawback persists: its poor data efficiency, +stemming from its interactive nature. The training of reinforcement +learning-based recommender systems demands expensive online interactions to +amass adequate trajectories, essential for agents to learn user preferences. +This inefficiency renders reinforcement learning-based recommender systems a +formidable undertaking, necessitating the exploration of potential solutions. +Recent strides in offline reinforcement learning present a new perspective. +Offline reinforcement learning empowers agents to glean insights from offline +datasets and deploy learned policies in online settings. Given that recommender +systems possess extensive offline datasets, the framework of offline +reinforcement learning aligns seamlessly. Despite being a burgeoning field, +works centered on recommender systems utilizing offline reinforcement learning +remain limited. This survey aims to introduce and delve into offline +reinforcement learning within recommender systems, offering an inclusive review +of existing literature in this domain. Furthermore, we strive to underscore +prevalent challenges, opportunities, and future pathways, poised to propel +research in this evolving field. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Test Time Embedding Normalization for Popularity Bias Mitigation CIKM 2023 + + +
+ Popularity bias is a widespread problem in the field of recommender systems, +where popular items tend to dominate recommendation results. In this work, we +propose 'Test Time Embedding Normalization' as a simple yet effective strategy +for mitigating popularity bias, which surpasses the performance of the previous +mitigation approaches by a significant margin. Our approach utilizes the +normalized item embedding during the inference stage to control the influence +of embedding magnitude, which is highly correlated with item popularity. +Through extensive experiments, we show that our method combined with the +sampled softmax loss effectively reduces popularity bias compared to previous +approaches for bias mitigation. We further investigate the relationship between +user and item embeddings and find that the angular similarity between +embeddings distinguishes preferable and non-preferable items regardless of +their popularity. The analysis explains the mechanism behind the success of our +approach in eliminating the impact of popularity bias. Our code is available at +https://github.com/ml-postech/TTEN. + +
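The core operation is simple enough to sketch: L2-normalize item embeddings at inference time before scoring, so the embedding magnitude (which correlates with popularity) no longer dominates the ranking. Shapes and dot-product scoring below are illustrative assumptions; the authors' repository is authoritative.

```python
import torch
import torch.nn.functional as F

def recommend(user_emb, item_embs, k=10, normalize_items=True):
    """Score items for one user; optionally L2-normalize item embeddings at test time."""
    if normalize_items:
        item_embs = F.normalize(item_embs, dim=-1)  # removes magnitude, which tracks popularity
    scores = item_embs @ user_emb
    return torch.topk(scores, k).indices

user = torch.randn(64)
items = torch.randn(1000, 64)
print(recommend(user, items, k=5))
```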
+
+ comment: 5 pages, CIKM 2023 +
+
+
+
+
+ + ☆ MISSRec: Pre-training and Transferring Multi-modal Interest-aware + Sequence Representation for Recommendation ACM MM 2023 + + +
+ The goal of sequential recommendation (SR) is to predict a user's potential +interested items based on her/his historical interaction sequences. Most +existing sequential recommenders are developed based on ID features, which, +despite their widespread use, often underperform with sparse IDs and struggle +with the cold-start problem. Besides, inconsistent ID mappings hinder the +model's transferability, isolating similar recommendation domains that could +have been co-optimized. This paper aims to address these issues by exploring +the potential of multi-modal information in learning robust and generalizable +sequence representations. We propose MISSRec, a multi-modal pre-training and +transfer learning framework for SR. On the user side, we design a +Transformer-based encoder-decoder model, where the contextual encoder learns to +capture the sequence-level multi-modal synergy while a novel interest-aware +decoder is developed to grasp item-modality-interest relations for better +sequence representation. On the candidate item side, we adopt a dynamic fusion +module to produce user-adaptive item representation, providing more precise +matching between users and items. We pre-train the model with contrastive +learning objectives and fine-tune it in an efficient manner. Extensive +experiments demonstrate the effectiveness and flexibility of MISSRec, promising +an practical solution for real-world recommendation scenarios. + +
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ☆ Towards Validating Long-Term User Feedbacks in Interactive + Recommendation Systems SIGIR'22 + + +
+ Interactive Recommender Systems (IRSs) have attracted a lot of attention, due +to their ability to model interactive processes between users and recommender +systems. Numerous approaches have adopted Reinforcement Learning (RL) +algorithms, as these can directly maximize users' cumulative rewards. In IRS, +researchers commonly utilize publicly available review datasets to compare and +evaluate algorithms. However, user feedback provided in public datasets merely +includes instant responses (e.g., a rating), with no inclusion of delayed +responses (e.g., the dwell time and the lifetime value). Thus, the question +remains whether these review datasets are an appropriate choice to evaluate the +long-term effects of the IRS. In this work, we revisited experiments on IRS +with review datasets and compared RL-based models with a simple reward model +that greedily recommends the item with the highest one-step reward. Following +extensive analysis, we reveal three main findings: First, a simple greedy +reward model consistently outperforms RL-based models in maximizing cumulative +rewards. Second, applying higher weighting to long-term rewards leads to a +degradation of recommendation performance. Third, user feedback has only marginal +long-term effects on the benchmark datasets. Based on our findings, we conclude +that a dataset has to be carefully verified and that a simple greedy baseline +should be included for a proper evaluation of RL-based IRS approaches. + +
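The greedy baseline referred to above reduces to fitting a one-step reward predictor and always recommending the argmax item. The sketch below is one plausible instantiation; the random features, Ridge regressor, and reward signal are placeholders rather than the paper's setup.

```python
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
n_items, dim = 200, 16
item_feats = rng.normal(size=(n_items, dim))

# Logged interactions: (user feature, item id, observed one-step reward such as a rating).
user_feats = rng.normal(size=(5000, dim))
item_ids = rng.integers(0, n_items, size=5000)
rewards = (user_feats * item_feats[item_ids]).sum(axis=1) + rng.normal(scale=0.1, size=5000)

# Fit a one-step reward model on (user, item) pairs.
X = np.hstack([user_feats, item_feats[item_ids]])
model = Ridge().fit(X, rewards)

def greedy_recommend(user_feat):
    """Recommend the item with the highest predicted immediate reward."""
    X_all = np.hstack([np.tile(user_feat, (n_items, 1)), item_feats])
    return int(np.argmax(model.predict(X_all)))

print(greedy_recommend(rng.normal(size=dim)))
```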
+
+ comment: Accepted to SIGIR'22 +
+
+
+
+
+ + ☆ ReLLa: Retrieval-enhanced Large Language Models for Lifelong Sequential + Behavior Comprehension in Recommendation + + +
+ With large language models (LLMs) achieving remarkable breakthroughs in +natural language processing (NLP) domains, LLM-enhanced recommender systems +have received much attention and have been actively explored currently. In this +paper, we focus on adapting and empowering a pure large language model for +zero-shot and few-shot recommendation tasks. First and foremost, we identify +and formulate the lifelong sequential behavior incomprehension problem for LLMs +in recommendation domains, i.e., LLMs fail to extract useful information from a +textual context of long user behavior sequence, even if the length of context +is far from reaching the context limitation of LLMs. To address such an issue +and improve the recommendation performance of LLMs, we propose a novel +framework, namely Retrieval-enhanced Large Language models (ReLLa) for +recommendation tasks in both zero-shot and few-shot settings. For zero-shot +recommendation, we perform semantic user behavior retrieval (SUBR) to improve +the data quality of testing samples, which greatly reduces the difficulty for +LLMs to extract the essential knowledge from user behavior sequences. As for +few-shot recommendation, we further design retrieval-enhanced instruction +tuning (ReiT) by adopting SUBR as a data augmentation technique for training +samples. Specifically, we develop a mixed training dataset consisting of both +the original data samples and their retrieval-enhanced counterparts. We conduct +extensive experiments on a real-world public dataset (i.e., MovieLens-1M) to +demonstrate the superiority of ReLLa compared with existing baseline models, as +well as its capability for lifelong sequential behavior comprehension. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ How Expressive are Graph Neural Networks in Recommendation? CIKM + + +
+ Graph Neural Networks (GNNs) have demonstrated superior performance on +various graph learning tasks, including recommendation, where they leverage +user-item collaborative filtering signals in graphs. However, theoretical +formulations of their capability are scarce, despite their empirical +effectiveness in state-of-the-art recommender models. Recently, research has +explored the expressiveness of GNNs in general, demonstrating that message +passing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that +GNNs combined with random node initialization are universal. Nevertheless, the +concept of "expressiveness" for GNNs remains vaguely defined. Most existing +works adopt the graph isomorphism test as the metric of expressiveness, but +this graph-level task may not effectively assess a model's ability in +recommendation, where the objective is to distinguish nodes of different +closeness. In this paper, we provide a comprehensive theoretical analysis of +the expressiveness of GNNs in recommendation, considering three levels of +expressiveness metrics: graph isomorphism (graph-level), node automorphism +(node-level), and topological closeness (link-level). We propose the +topological closeness metric to evaluate GNNs' ability to capture the +structural distance between nodes, which aligns closely with the objective of +recommendation. To validate the effectiveness of this new metric in evaluating +recommendation performance, we introduce a learning-less GNN algorithm that is +optimal on the new metric and can be optimal on the node-level metric with +suitable modification. We conduct extensive experiments comparing the proposed +algorithm against various types of state-of-the-art GNN models to explore the +explainability of the new metric in the recommendation task. For +reproducibility, implementation codes are available at +https://github.com/HKUDS/GTE. + +
+
+ comment: 32nd ACM International Conference on Information and Knowledge + Management (CIKM) 2023 +
+
+
+
+
+ + ☆ Anonymity at Risk? Assessing Re-Identification Capabilities of Large + Language Models + + +
+ Anonymity of both natural and legal persons in court rulings is a critical +aspect of privacy protection in the European Union and Switzerland. With the +advent of LLMs, concerns about large-scale re-identification of anonymized +persons are growing. In accordance with the Federal Supreme Court of +Switzerland, we explore the potential of LLMs to re-identify individuals in +court rulings by constructing a proof-of-concept using actual legal data from +the Swiss federal supreme court. Following the initial experiment, we +constructed an anonymized Wikipedia dataset as a more rigorous testing ground +to further investigate the findings. With the introduction and application of +the new task of re-identifying people in texts, we also introduce new metrics +to measure performance. We systematically analyze the factors that influence +successful re-identifications, identifying model size, input length, and +instruction tuning among the most critical determinants. Despite high +re-identification rates on Wikipedia, even the best LLMs struggled with court +decisions. The complexity is attributed to the lack of test datasets, the +necessity for substantial training resources, and data sparsity in the +information used for re-identification. In conclusion, this study demonstrates +that re-identification using LLMs may not be feasible for now, but as the +proof-of-concept on Wikipedia showed, it might become possible in the future. +We hope that our system can help enhance the confidence in the security of +anonymized decisions, thus leading to the courts being more confident to +publish decisions. + +
+
+
+
+
+ + ☆ CLIP Multi-modal Hashing: A new baseline CLIPMH ICASSP2024 + + +
+ The multi-modal hashing method is widely used in multimedia retrieval. It can +fuse multi-source data to generate binary hash codes. However, the current +multi-modal methods have the problem of low retrieval accuracy. The reason is +that the individual backbone networks have limited feature expression +capabilities and are not jointly pre-trained on large-scale unsupervised +multi-modal data. To solve this problem, we propose a new baseline CLIP +Multi-modal Hashing (CLIPMH) method. It uses the CLIP model to extract text and +image features, and then fuses them to generate hash codes. CLIP improves the +expressiveness of each modality's features. In this way, it can greatly improve the +retrieval performance of multi-modal hashing methods. In comparison to +state-of-the-art unsupervised and supervised multi-modal hashing methods, +experiments reveal that the proposed CLIPMH can significantly enhance +performance (a maximum increase of 8.38%). CLIP also has great advantages over +the text and visual backbone networks commonly used before. + +
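A hedged sketch of the general recipe described above: take text and image features from a pretrained CLIP encoder, fuse them, and binarize the result into a hash code. The random-projection stand-ins for the CLIP encoders, the concat-plus-linear fusion, and the sign binarization are all illustrative assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

# Hypothetical stand-ins for pretrained CLIP text/image encoders; random projections
# are used here only to keep the sketch self-contained.
clip_image_encoder = nn.Linear(2048, 512)
clip_text_encoder = nn.Linear(300, 512)

class ClipFusionHash(nn.Module):
    def __init__(self, dim=512, n_bits=64):
        super().__init__()
        self.fuse = nn.Linear(2 * dim, n_bits)   # illustrative fusion: concatenation + linear

    def forward(self, img_feat, txt_feat):
        h = self.fuse(torch.cat([img_feat, txt_feat], dim=-1))
        return torch.sign(h)                      # binarize into a hash code

hasher = ClipFusionHash()
img = clip_image_encoder(torch.randn(4, 2048))
txt = clip_text_encoder(torch.randn(4, 300))
print(hasher(img, txt).shape)  # torch.Size([4, 64])
```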
+
+ comment: submit to ICASSP2024 +
+
+
+
+
+ + ☆ Knowledge Graph Prompting for Multi-Document Question Answering + + +
+ The 'pre-train, prompt, predict' paradigm of large language models (LLMs) has +achieved remarkable success in open-domain question answering (OD-QA). However, +few works explore this paradigm in the scenario of multi-document question +answering (MD-QA), a task demanding a thorough understanding of the logical +associations among the contents and structures of different documents. To fill +this crucial gap, we propose a Knowledge Graph Prompting (KGP) method to +formulate the right context in prompting LLMs for MD-QA, which consists of a +graph construction module and a graph traversal module. For graph construction, +we create a knowledge graph (KG) over multiple documents with nodes symbolizing +passages or document structures (e.g., pages/tables), and edges denoting the +semantic/lexical similarity between passages or intra-document structural +relations. For graph traversal, we design an LM-guided graph traverser that +navigates across nodes and gathers supporting passages assisting LLMs in MD-QA. +The constructed graph serves as the global ruler that regulates the +transitional space among passages and reduces retrieval latency. Concurrently, +the LM-guided traverser acts as a local navigator that gathers pertinent +context to progressively approach the question and guarantee retrieval quality. +Extensive experiments underscore the efficacy of KGP for MD-QA, signifying the +potential of leveraging graphs in enhancing the prompt design for LLMs. Our +code is at https://github.com/YuWVandy/KG-LLM-MDQA. + +
+
+
+
+
+ + ☆ Invariant representation learning for sequential recommendation + + +
+ Sequential recommendation involves automatically recommending the next item +to users based on their historical item sequence. While most prior research +employs RNN or transformer methods to glean information from the item +sequence, generating probabilities for each user-item pair and recommending the +top items, these approaches often overlook the challenge posed by spurious +relationships. This paper specifically addresses these spurious relations. We +introduce a novel sequential recommendation framework named Irl4Rec. This +framework harnesses invariant learning and employs a new objective that factors +in the relationship between spurious variables and adjustment variables during +model training. This approach aids in identifying spurious relations. +Comparative analyses reveal that our framework outperforms three typical +methods, underscoring the effectiveness of our model. Moreover, an ablation +study further demonstrates the critical role our model plays in detecting +spurious relations. + +
+
+
+
+
+ + ♻ ☆ Fairness in Image Search: A Study of Occupational Stereotyping in Image + Retrieval and its Debiasing + + +
+ Multi-modal search engines have experienced significant growth and widespread +use in recent years, making them the second most common internet use. While +search engine systems offer a range of services, the image search field has +recently become a focal point in the information retrieval community, as the +adage goes, "a picture is worth a thousand words". Although popular search +engines like Google excel at image search accuracy and agility, there is an +ongoing debate over whether their search results can be biased in terms of +gender, language, demographics, socio-cultural aspects, and stereotypes. This +potential for bias can have a significant impact on individuals' perceptions +and influence their perspectives. + In this paper, we present our study on bias and fairness in web search, with +a focus on keyword-based image search. We first discuss several kinds of biases +that exist in search systems and why it is important to mitigate them. We +narrow down our study to assessing and mitigating occupational stereotypes in +image search, which is a prevalent fairness issue in image retrieval. For the +assessment of stereotypes, we take gender as an indicator. We explore various +open-source and proprietary APIs for gender identification from images. With +these, we examine the extent of gender bias in top-ranked image search results +obtained for several occupational keywords. To mitigate the bias, we then +propose a fairness-aware re-ranking algorithm that optimizes (a) relevance of +the search result with the keyword and (b) fairness w.r.t. the genders identified. +We experiment on 100 top-ranked images obtained for 10 occupational keywords +and consider random re-ranking and re-ranking based on relevance as baselines. +Our experimental results show that the fairness-aware re-ranking algorithm +produces rankings with better fairness scores than the baselines while +achieving competitive relevance scores. + +
+
+ comment: 20 Pages, Work uses Proprietary Search Systems from the year 2021 +
+
+
+
+
+ + ♻ ☆ Taken by Surprise: Contrast effect for Similarity Scores + + +
+ Accurately evaluating the similarity of object vector embeddings is of +critical importance for natural language processing, information retrieval and +classification tasks. Popular similarity scores (e.g cosine similarity) are +based on pairs of embedding vectors and disregard the distribution of the +ensemble from which objects are drawn. Human perception of object similarity +significantly depends on the context in which the objects appear. In this work +we propose the $\textit{surprise score}$, an ensemble-normalized similarity +metric that encapsulates the contrast effect of human perception and +significantly improves the classification performance on zero- and few-shot +document classification tasks. This score quantifies the surprise to find a +given similarity between two elements relative to the pairwise ensemble +similarities. We evaluate this metric on zero/few shot classification and +clustering tasks and typically find 10-15 % better performance compared to raw +cosine similarity. Our code is available at +https://github.com/MeetElise/surprise-similarity. + +
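One plausible reading of an "ensemble-normalized" similarity is to score how unusual a pairwise cosine similarity is relative to the distribution of similarities within the whole ensemble. The z-score normalization below is only an illustrative assumption, not the paper's exact definition; the authors' repository is the authoritative reference.

```python
import numpy as np

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

def surprise_like_score(x, y, ensemble):
    """How unusual is sim(x, y) relative to pairwise similarities within the ensemble?
    Illustrative normalization only; see the authors' code for the exact definition."""
    sims = np.array([cosine(ensemble[i], ensemble[j])
                     for i in range(len(ensemble)) for j in range(i + 1, len(ensemble))])
    return (cosine(x, y) - sims.mean()) / (sims.std() + 1e-8)

rng = np.random.default_rng(0)
ensemble = rng.normal(size=(50, 32))
print(surprise_like_score(ensemble[0], ensemble[1], ensemble))
```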
+
+ comment: 9 pages, 2 figures and 4 tables +
+
+
+
+
+
+
+
+ + Machine Learning 143 + +
+
+
+ + ☆ Semantic Multi-Resolution Communications + + +
+ Deep learning based joint source-channel coding (JSCC) has demonstrated +significant advancements in data reconstruction compared to separate +source-channel coding (SSCC). This superiority arises from the suboptimality of +SSCC when dealing with finite block-length data. Moreover, SSCC falls short in +reconstructing data in a multi-user and/or multi-resolution fashion, as it only +tries to satisfy the worst channel and/or the highest quality data. To overcome +these limitations, we propose a novel deep learning multi-resolution JSCC +framework inspired by the concept of multi-task learning (MTL). This proposed +framework excels at encoding data for different resolutions through +hierarchical layers and effectively decodes it by leveraging both current and +past layers of encoded data. Moreover, this framework holds great potential for +semantic communication, where the objective extends beyond data reconstruction +to preserving specific semantic attributes throughout the communication +process. These semantic features could be crucial elements such as class +labels, essential for classification tasks, or other key attributes that +require preservation. Within this framework, each level of encoded data can be +carefully designed to retain specific data semantics. As a result, the +precision of a semantic classifier can be progressively enhanced across +successive layers, emphasizing the preservation of targeted semantics +throughout the encoding and decoding stages. We conduct experiments on MNIST +and CIFAR10 dataset. The experiment with both datasets illustrates that our +proposed method is capable of surpassing the SSCC method in reconstructing data +with different resolutions, enabling the extraction of semantic features with +heightened confidence in successive layers. This capability is particularly +advantageous for prioritizing and preserving more crucial semantic features +within the datasets. + +
+
+
+
+
+ + ☆ Tryage: Real-time, intelligent Routing of User Prompts to Large Language + Model + + +
+ The introduction of the transformer architecture and the self-attention +mechanism has led to an explosive production of language models trained on +specific downstream tasks and data domains. With over 200,000 models in the +Hugging Face ecosystem, users grapple with selecting and optimizing models to +suit multifaceted workflows and data domains while addressing computational, +security, and recency concerns. There is an urgent need for machine learning +frameworks that can eliminate the burden of model selection and customization +and unleash the incredible power of the vast emerging model library for end +users. Here, we propose a context-aware routing system, Tryage, that leverages +a language model router for optimal selection of expert models from a model +library based on analysis of individual input prompts. Inspired by the thalamic +router in the brain, Tryage employs a perceptive router to predict downstream +model performance on prompts and, then, makes a routing decision using an +objective function that integrates performance predictions with user goals and +constraints that are incorporated through flags (e.g., model size, model +recency). Tryage allows users to explore a Pareto front and automatically +trade off between task accuracy and secondary goals including minimization of +model size, recency, security, verbosity, and readability. Across heterogeneous +data sets that include code, text, clinical data, and patents, the Tryage +framework surpasses Gorilla and GPT3.5 turbo in dynamic model selection, +identifying the optimal model with an accuracy of 50.9%, compared to 23.6% by +GPT 3.5 Turbo and 10.8% by Gorilla. Conceptually, Tryage demonstrates how +routing models can be applied to program and control the behavior of +multi-model LLM systems to maximize efficient use of the expanding and evolving +language model ecosystem. + +
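The routing decision described above reduces to an argmax over a weighted objective that combines a per-model performance prediction with user-weighted penalties on attributes such as model size and recency. In the sketch below, the performance predictor is a hypothetical stand-in; the model cards, weights, and penalty form are illustrative assumptions.

```python
from dataclasses import dataclass

@dataclass
class ModelCard:
    name: str
    size_gb: float
    days_since_release: int

def predict_performance(prompt: str, model: ModelCard) -> float:
    """Hypothetical stand-in for a perceptive router's per-model performance prediction."""
    return 0.5 + 0.1 * (model.size_gb ** 0.5)   # toy heuristic, not a trained router

def route(prompt, models, w_size=0.02, w_recency=0.001):
    """Pick the model maximizing predicted accuracy minus user-weighted penalties."""
    def objective(m):
        return predict_performance(prompt, m) - w_size * m.size_gb - w_recency * m.days_since_release
    return max(models, key=objective)

library = [ModelCard("small-code-llm", 3, 30), ModelCard("large-general-llm", 70, 400)]
print(route("Write a Python function to parse JSON.", library).name)
```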
+
+
+
+
+ + ☆ Low Tensor Rank Learning of Neural Dynamics + + +
+ Learning relies on coordinated synaptic changes in recurrently connected +populations of neurons. Therefore, understanding the collective evolution of +synaptic connectivity over learning is a key challenge in neuroscience and +machine learning. In particular, recent work has shown that the weight matrices +of task-trained RNNs are typically low rank, but how this low rank structure +unfolds over learning is unknown. To address this, we investigate the rank of +the 3-tensor formed by the weight matrices throughout learning. By fitting RNNs +of varying rank to large-scale neural recordings during a motor learning task, +we find that the inferred weights are low-tensor-rank and therefore evolve over +a fixed low-dimensional subspace throughout the entire course of learning. We +next validate the observation of low-tensor-rank learning on an RNN trained to +solve the same task by performing a low-tensor-rank decomposition directly on +the ground truth weights, and by showing that the method we applied to the data +faithfully recovers this low rank structure. Finally, we present a set of +mathematical results bounding the matrix and tensor ranks of gradient descent +learning dynamics which show that low-tensor-rank weights emerge naturally in +RNNs trained to solve low-dimensional tasks. Taken together, our findings +provide novel constraints on the evolution of population connectivity over +learning in both biological and artificial neural networks, and enable reverse +engineering of learning-induced changes in recurrent network dynamics from +large-scale neural recordings. + +
+
+ comment: The last two authors contributed equally +
+
+
+
+
+ + ☆ Multi-event Video-Text Retrieval ICCV2023 + + +
+ Video-Text Retrieval (VTR) is a crucial multi-modal task in an era of massive +video-text data on the Internet. A plethora of work characterized by using a +two-stream Vision-Language model architecture that learns a joint +representation of video-text pairs has become a prominent approach for the VTR +task. However, these models operate under the assumption of bijective +video-text correspondences and neglect a more practical scenario where video +content usually encompasses multiple events, while texts like user queries or +webpage metadata tend to be specific and correspond to single events. This +establishes a gap between the previous training objective and real-world +applications, leading to the potential performance degradation of earlier +models during inference. In this study, we introduce the Multi-event Video-Text +Retrieval (MeVTR) task, addressing scenarios in which each video contains +multiple different events, as a niche scenario of the conventional Video-Text +Retrieval Task. We present a simple model, Me-Retriever, which incorporates key +event video representation and a new MeVTR loss for the MeVTR task. +Comprehensive experiments show that this straightforward framework outperforms +other models in the Video-to-Text and Text-to-Video tasks, effectively +establishing a robust baseline for the MeVTR task. We believe this work serves +as a strong foundation for future studies. Code is available at +https://github.com/gengyuanmax/MeVTR. + +
+
+ comment: accepted to ICCV2023 +
+
+
+
+
+ + ☆ A free from local minima algorithm for training regressive MLP neural + networks + + +
+ In this article, an innovative method for training regressive MLP networks is +presented, which is not subject to local minima. The Error-Back-Propagation +algorithm, proposed by Rumelhart-Hinton-Williams, has had the merit of +favouring the development of machine learning techniques, which have permeated +every branch of research and technology since the mid-1980s. This extraordinary +success is largely due to the black-box approach, but this same factor was also +seen as a limitation, as soon as more challenging problems were approached. One of +the most critical aspects of the training algorithms was that of local minima +of the loss function, typically the mean squared error of the output on the +training set. In fact, as the most popular training algorithms are driven by +the derivatives of the loss function, there is no way to evaluate whether a +reached minimum is local or global. The algorithm presented in this paper +avoids the problem of local minima, as the training is based on the properties +of the distribution of the training set, or rather on its image internal to the +neural network. The performance of the algorithm is shown for a well-known +benchmark. + +
+
+ comment: 9 pages, 4 figures, theoretical work +
+
+
+
+
+ + ☆ ReLiCADA -- Reservoir Computing using Linear Cellular Automata Design + Algorithm + + +
+ In this paper, we present a novel algorithm to optimize the design of +Reservoir Computing using Cellular Automata models for time series +applications. Besides selecting the models' hyperparameters, the proposed +algorithm particularly solves the open problem of linear Cellular Automaton +rule selection. The selection method pre-selects only a few promising candidate +rules out of an exponentially growing rule space. When applied to relevant +benchmark datasets, the selected rules achieve low errors, with the best rules +being among the top 5% of the overall rule space. The algorithm was developed +based on mathematical analysis of linear Cellular Automaton properties and is +backed by almost one million experiments, adding up to a computational runtime +of nearly one year. Comparisons to other state-of-the-art time series models +show that the proposed Reservoir Computing using Cellular Automata models have +lower computational complexity and, at the same time, achieve lower errors. Hence, +our approach reduces the time needed for training and hyperparameter +optimization by up to several orders of magnitude. + +
+
+ comment: 19 pages, 14 figures +
+
+
+
+
+ + ☆ EM for Mixture of Linear Regression with Clustered Data + + +
+ Modern data-driven and distributed learning frameworks deal with diverse +massive data generated by clients spread across heterogeneous environments. +Indeed, data heterogeneity is a major bottleneck in scaling up many distributed +learning paradigms. In many settings, however, heterogeneous data may be +generated in clusters with shared structures, as is the case in several +applications such as federated learning where a common latent variable governs +the distribution of all the samples generated by a client. It is therefore +natural to ask how the underlying clustered structures in distributed data can +be exploited to improve learning schemes. In this paper, we tackle this +question in the special case of estimating $d$-dimensional parameters of a +two-component mixture of linear regressions problem where each of $m$ nodes +generates $n$ samples with a shared latent variable. We employ the well-known +Expectation-Maximization (EM) method to estimate the maximum likelihood +parameters from $m$ batches of dependent samples, each containing $n$ +measurements. Discarding the clustered structure in the mixture model, EM is +known to require $O(\log(mn/d))$ iterations to reach the statistical accuracy +of $O(\sqrt{d/(mn)})$. In contrast, we show that if initialized properly, EM on +the structured data requires only $O(1)$ iterations to reach the same +statistical accuracy, as long as $m$ grows as $e^{o(n)}$. Our analysis +establishes and combines novel asymptotic optimization and generalization +guarantees for population and empirical EM with dependent samples, which may be +of independent interest. + +
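For orientation, here is vanilla EM for a symmetric two-component mixture of linear regressions, without the clustered batch structure that the paper analyzes. Equal mixing weights, components $\pm\beta$, known noise variance, and the random initialization are simplifying assumptions made only for this sketch.

```python
import numpy as np

rng = np.random.default_rng(0)
d, n, sigma = 5, 2000, 0.5
beta_true = rng.normal(size=d)

# Symmetric two-component mixture: y = s * (x @ beta) + noise, with s uniform in {-1, +1}.
X = rng.normal(size=(n, d))
s = rng.choice([-1.0, 1.0], size=n)
y = s * (X @ beta_true) + sigma * rng.normal(size=n)

beta = 0.1 * rng.normal(size=d)   # crude initialization; the paper studies proper initialization
for _ in range(50):
    # E-step: posterior probability that each sample comes from the +beta component,
    # written via tanh for numerical stability (sigmoid(2*y*x'beta/sigma^2)).
    w = 0.5 * (1.0 + np.tanh(y * (X @ beta) / sigma**2))
    # M-step: weighted least squares, which reduces to a sign-reweighted normal equation.
    beta = np.linalg.solve(X.T @ X, X.T @ ((2.0 * w - 1.0) * y))

# Error up to the inherent sign ambiguity of the symmetric mixture.
print(min(np.linalg.norm(beta - beta_true), np.linalg.norm(beta + beta_true)))
```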
+
+
+
+
+ + ☆ TrackFlow: Multi-Object Tracking with Normalizing Flows ICCV 2023 + + +
+ The field of multi-object tracking has recently seen a renewed interest in +the good old schema of tracking-by-detection, as its simplicity and strong +priors spare it from the complex design and painful babysitting of +tracking-by-attention approaches. In view of this, we aim at extending +tracking-by-detection to multi-modal settings, where a comprehensive cost has +to be computed from heterogeneous information e.g., 2D motion cues, visual +appearance, and pose estimates. More precisely, we follow a case study where a +rough estimate of 3D information is also available and must be merged with +other traditional metrics (e.g., the IoU). To achieve that, recent approaches +resort to either simple rules or complex heuristics to balance the contribution +of each cost. However, i) they require careful tuning of tailored +hyperparameters on a hold-out set, and ii) they imply these costs to be +independent, which does not hold in reality. We address these issues by +building upon an elegant probabilistic formulation, which considers the cost of +a candidate association as the negative log-likelihood yielded by a deep +density estimator, trained to model the conditional joint probability +distribution of correct associations. Our experiments, conducted on both +simulated and real benchmarks, show that our approach consistently enhances the +performance of several tracking-by-detection algorithms. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Mode Combinability: Exploring Convex Combinations of Permutation Aligned + Models + + +
+ We explore element-wise convex combinations of two permutation-aligned neural +network parameter vectors $\Theta_A$ and $\Theta_B$ of size $d$. We conduct +extensive experiments by examining various distributions of such model +combinations parametrized by elements of the hypercube $[0,1]^{d}$ and its +vicinity. Our findings reveal that broad regions of the hypercube form surfaces +of low loss values, indicating that the notion of linear mode connectivity +extends to a more general phenomenon which we call mode combinability. We also +make several novel observations regarding linear mode connectivity and model +re-basin. We demonstrate a transitivity property: two models re-based to a +common third model are also linear mode connected, and a robustness property: +even with significant perturbations of the neuron matchings the resulting +combinations continue to form a working model. Moreover, we analyze the +functional and weight similarity of model combinations and show that such +combinations are non-vacuous in the sense that there are significant functional +differences between the resulting models. + +
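The objects being studied, element-wise convex combinations of two permutation-aligned parameter vectors, are easy to write down. The sketch below assumes the alignment has already been performed and uses small MLPs as stand-ins for the aligned models; `lam` can be a scalar or a per-parameter weighting.

```python
import torch

def combine_state_dicts(sd_a, sd_b, lam):
    """Element-wise convex combination of two (already permutation-aligned) models.
    `lam` may be a scalar in [0, 1] or a dict of per-parameter tensors with entries in [0, 1]."""
    out = {}
    for k in sd_a:
        l = lam[k] if isinstance(lam, dict) else lam
        out[k] = (1.0 - l) * sd_a[k] + l * sd_b[k]
    return out

# Toy example with two small MLPs standing in for aligned parameter vectors Theta_A and Theta_B.
a = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
b = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 2))
mid = combine_state_dicts(a.state_dict(), b.state_dict(), 0.5)
a.load_state_dict(mid)   # evaluate this combined model to probe the loss surface
```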
+
+
+
+
+ + ☆ Can Authorship Representation Learning Capture Stylistic Features? ACL 2023 + + +
+ Automatically disentangling an author's style from the content of their +writing is a longstanding and possibly insurmountable problem in computational +linguistics. At the same time, the availability of large text corpora furnished +with author labels has recently enabled learning authorship representations in +a purely data-driven manner for authorship attribution, a task that ostensibly +depends to a greater extent on encoding writing style than encoding content. +However, success on this surrogate task does not ensure that such +representations capture writing style since authorship could also be correlated +with other latent variables, such as topic. In an effort to better understand +the nature of the information these representations convey, and specifically to +validate the hypothesis that they chiefly encode writing style, we +systematically probe these representations through a series of targeted +experiments. The results of these experiments suggest that representations +learned for the surrogate authorship prediction task are indeed sensitive to +writing style. As a consequence, authorship representations may be expected to +be robust to certain kinds of data shift, such as topic drift over time. +Additionally, our findings may open the door to downstream applications that +require stylistic representations, such as style transfer. + +
+
+ comment: appearing at TACL 2023 +
+
+
+
+
+ + ☆ Large Language Models Sensitivity to The Order of Options in + Multiple-Choice Questions + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +various NLP tasks. However, previous works have shown that these models are +sensitive to prompt wording, as well as to few-shot demonstrations and their order, +posing challenges to fair assessment of these models. As these models become +more powerful, it becomes imperative to understand and address these +limitations. In this paper, we focus on LLMs' robustness on the task of +multiple-choice questions -- a commonly adopted task for studying the reasoning and +fact-retrieving capabilities of LLMs. Investigating the sensitivity of LLMs +towards the order of options in multiple-choice questions, we demonstrate a +considerable performance gap of approximately 13% to 75% in LLMs on different +benchmarks, when answer options are reordered, even when using demonstrations +in a few-shot setting. Through a detailed analysis, we conjecture that this +sensitivity arises when LLMs are uncertain about the prediction between the +top-2/3 choices, and that specific option placements may then favor a certain +prediction among those top choices, a question-dependent effect caused by positional bias. +We also identify patterns in top-2 choices that amplify or mitigate the model's +bias toward option placement. We found that for amplifying bias, the optimal +strategy involves positioning the top two choices as the first and last +options. Conversely, to mitigate bias, we recommend placing these choices among +the adjacent options. To validate our conjecture, we conduct various +experiments and adopt two approaches to calibrate LLMs' predictions, leading to +improvements of up to 8 percentage points across different models and benchmarks. + +
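Measuring this kind of sensitivity amounts to evaluating the same question under different option orderings. In the sketch below, `query_llm` is a hypothetical stand-in for whatever model API is used, and the four-option, letter-label setup is an illustrative assumption.

```python
from itertools import permutations

def query_llm(prompt: str) -> str:
    """Hypothetical stand-in for a call to an LLM; returns the chosen option letter."""
    return "A"  # placeholder answer

def order_sensitivity(question, options, correct):
    """Fraction of option orderings for which the model answers correctly."""
    letters = "ABCD"
    hits = 0
    orders = list(permutations(options))
    for perm in orders:
        prompt = question + "\n" + "\n".join(f"{l}. {o}" for l, o in zip(letters, perm))
        answer = query_llm(prompt)
        chosen = perm[letters.index(answer)]
        hits += (chosen == correct)
    return hits / len(orders)

print(order_sensitivity("2+2=?", ["3", "4", "5", "22"], "4"))
```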
+
+
+
+
+ + ☆ Expecting The Unexpected: Towards Broad Out-Of-Distribution Detection + + +
+ Improving the reliability of deployed machine learning systems often involves +developing methods to detect out-of-distribution (OOD) inputs. However, +existing research often narrowly focuses on samples from classes that are +absent from the training set, neglecting other types of plausible distribution +shifts. This limitation reduces the applicability of these methods in +real-world scenarios, where systems encounter a wide variety of anomalous +inputs. In this study, we categorize five distinct types of distribution shifts +and critically evaluate the performance of recent OOD detection methods on each +of them. We publicly release our benchmark under the name BROAD (Benchmarking +Resilience Over Anomaly Diversity). Our findings reveal that while these +methods excel in detecting unknown classes, their performance is inconsistent +when encountering other types of distribution shifts. In other words, they only +reliably detect unexpected inputs that they have been specifically designed to +expect. As a first step toward broad OOD detection, we learn a generative model +of existing detection scores with a Gaussian mixture. By doing so, we present +an ensemble approach that offers a more consistent and comprehensive solution +for broad OOD detection, demonstrating superior performance compared to +existing methods. Our code to download BROAD and reproduce our experiments is +publicly available. + +
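The ensemble idea sketched below fits a Gaussian mixture over the vector of scores produced by several existing detectors on in-distribution data, then flags inputs whose score vector has low likelihood. The synthetic scores, the number of mixture components, and the negative log-likelihood readout are illustrative assumptions, not the benchmark's exact configuration.

```python
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)

# Each row is the vector of scores produced by several existing OOD detectors
# (e.g., max-softmax, energy, Mahalanobis) on one in-distribution input.
id_scores = rng.normal(loc=0.0, scale=1.0, size=(5000, 3))

gmm = GaussianMixture(n_components=4, random_state=0).fit(id_scores)

def broad_ood_score(score_vectors):
    """Lower log-likelihood under the mixture => more anomalous under some distribution shift."""
    return -gmm.score_samples(score_vectors)

shifted = rng.normal(loc=3.0, scale=1.5, size=(10, 3))   # toy 'shifted' inputs
print(broad_ood_score(shifted).mean(), broad_ood_score(id_scores[:10]).mean())
```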
+
+
+
+
+ + ☆ Revisiting column-generation-based matheuristic for learning + classification trees + + +
+ Decision trees are highly interpretable models for solving classification +problems in machine learning (ML). The standard ML algorithms for training +decision trees are fast but generate suboptimal trees in terms of accuracy. +Other discrete optimization models in the literature address the optimality +problem but only work well on relatively small datasets. \cite{firat2020column} +proposed a column-generation-based heuristic approach for learning decision +trees. This approach improves scalability and can work with large datasets. In +this paper, we describe improvements to this column generation approach. First, +we modify the subproblem model to significantly reduce the number of +subproblems in multiclass classification instances. Next, we show that the +data-dependent constraints in the master problem are implied, and use them as +cutting planes. Furthermore, we describe a separation model to generate data +points for which the linear programming relaxation solution violates their +corresponding constraints. We conclude by presenting computational results that +show that these modifications result in better scalability. + +
+
+ comment: Submitted to Computers and Operations Research journal +
+
+
+
+
+ + ☆ Internal Cross-layer Gradients for Extending Homogeneity to + Heterogeneity in Federated Learning + + +
+ Federated learning (FL) inevitably confronts the challenge of system +heterogeneity in practical scenarios. To enhance the capabilities of most +model-homogeneous FL methods in handling system heterogeneity, we propose a +training scheme that can extend their capabilities to cope with this challenge. +In this paper, we commence our study with a detailed exploration of homogeneous +and heterogeneous FL settings and discover three key observations: (1) a +positive correlation between client performance and layer similarities, (2) +higher similarities in the shallow layers in contrast to the deep layers, and +(3) smoother gradient distributions indicate higher layer +similarities. Building upon these observations, we propose InCo Aggregation, +which leverages internal cross-layer gradients, a mixture of gradients from +shallow and deep layers within a server model, to augment the similarity in the +deep layers without requiring additional communication between clients. +Furthermore, our method can be tailored to accommodate model-homogeneous FL +methods such as FedAvg, FedProx, FedNova, Scaffold, and MOON, to expand their +capabilities to handle system heterogeneity. Copious experimental results +validate the effectiveness of InCo Aggregation, spotlighting internal +cross-layer gradients as a promising avenue to enhance the performance in +heterogeneous FL. + +
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ A Survey on Self-Supervised Representation Learning + + +
+ Learning meaningful representations is at the heart of many tasks in the +field of modern machine learning. Recently, a lot of methods were introduced +that allow learning of image representations without supervision. These +representations can then be used in downstream tasks like classification or +object detection. The quality of these representations is close to supervised +learning, while no labeled images are needed. This survey paper provides a +comprehensive review of these methods in a unified notation, points out +similarities and differences of these methods, and proposes a taxonomy which +sets these methods in relation to each other. Furthermore, our survey +summarizes the most-recent experimental results reported in the literature in +form of a meta-study. Our survey is intended as a starting point for +researchers and practitioners who want to dive into the field of representation +learning. + +
+
+
+
+
+ + ☆ Masked Momentum Contrastive Learning for Zero-shot Semantic + Understanding + + +
+ Self-supervised pretraining (SSP) has emerged as a popular technique in +machine learning, enabling the extraction of meaningful feature representations +without labelled data. In the realm of computer vision, pretrained vision +transformers (ViTs) have played a pivotal role in advancing transfer learning. +Nonetheless, the escalating cost of finetuning these large models has posed a +challenge due to the explosion of model size. This study endeavours to evaluate +the effectiveness of pure self-supervised learning (SSL) techniques in computer +vision tasks, obviating the need for finetuning, with the intention of +emulating human-like capabilities in generalisation and recognition of unseen +objects. To this end, we propose an evaluation protocol for zero-shot +segmentation based on a prompting patch. Given a point on the target object as +a prompt, the algorithm calculates the similarity map between the selected +patch and other patches, and based on that, a simple thresholding is applied to segment +the target. Another evaluation is intra-object and inter-object similarity to +gauge the discriminatory ability of SSP ViTs. Insights from zero-shot segmentation +from prompting and discriminatory abilities of SSP led to the design of a +simple SSP approach, termed MMC. This approach combines Masked image +modelling for encouraging similarity of local features, Momentum based +self-distillation for transferring semantics from global to local features, and +global Contrast for promoting semantics of global features, to enhance +discriminative representations of SSP ViTs. Consequently, our proposed method +significantly reduces the overlap of intra-object and inter-object +similarities, thereby facilitating effective object segmentation within an +image. Our experiments reveal that MMC delivers top-tier results in zero-shot +semantic segmentation across various datasets. + +
+
+
+
+
+ + ☆ Exploration of Rashomon Set Assists Explanations for Medical Data + + +
+ The machine learning modeling process conventionally culminates in selecting +a single model that maximizes a selected performance metric. However, this +approach leads to abandoning a more profound analysis of slightly inferior +models. Particularly in medical and healthcare studies, where the objective +extends beyond predictions to valuable insight generation, relying solely on +performance metrics can result in misleading or incomplete conclusions. This +problem is particularly pertinent when dealing with a set of models with +performance close to the maximum, known as the $\textit{Rashomon set}$. Such a set +can be large and may contain models describing the data in different ways, +which calls for comprehensive analysis. This paper introduces a novel process +to explore Rashomon set models, extending the conventional modeling approach. +The cornerstone is the identification of the most different models within the +Rashomon set, facilitated by the introduced $\texttt{Rashomon_DETECT}$ +algorithm. This algorithm compares profiles illustrating prediction +dependencies on variable values generated by eXplainable Artificial +Intelligence (XAI) techniques. To quantify differences in variable effects +among models, we introduce the Profile Disparity Index (PDI) based on measures +from functional data analysis. To illustrate the effectiveness of our approach, +we showcase its application in predicting survival among hemophagocytic +lymphohistiocytosis (HLH) patients - a foundational case study. Additionally, +we benchmark our approach on other medical data sets, demonstrating its +versatility and utility in various contexts. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ TurboViT: Generating Fast Vision Transformers via Generative + Architecture Search + + +
+ Vision transformers have shown unprecedented levels of performance in tackling various visual perception tasks in recent years. However, the architectural and computational complexity of such network architectures has made them challenging to deploy in real-world applications with high-throughput, low-memory requirements. As such, there has been significant research recently on the design of efficient vision transformer architectures. In this study, we explore the generation of fast vision transformer architecture designs via generative architecture search (GAS) to achieve a strong balance between accuracy and architectural and computational efficiency. Through this generative architecture search process, we create TurboViT, a highly efficient hierarchical vision transformer architecture design that is generated around mask unit attention and Q-pooling design patterns. The resulting TurboViT architecture design achieves significantly lower architectural complexity (>2.47$\times$ smaller than FasterViT-0 while achieving the same accuracy) and computational complexity (>3.4$\times$ fewer FLOPs and 0.9% higher accuracy than MobileViT2-2.0) when compared to 10 other state-of-the-art efficient vision transformer network architecture designs within a similar range of accuracy on the ImageNet-1K dataset. Furthermore, TurboViT demonstrated strong inference latency and throughput in both low-latency and batch processing scenarios (>3.21$\times$ lower latency and >3.18$\times$ higher throughput compared to FasterViT-0 for the low-latency scenario). These promising results demonstrate the efficacy of leveraging generative architecture search for generating efficient transformer architecture designs for high-throughput scenarios.
+
+
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Designing an attack-defense game: how to increase robustness of + financial transaction models via a competition + + +
+ Given the escalating risks of malicious attacks in the finance sector and the consequential severe damage, a thorough understanding of adversarial strategies and robust defense mechanisms for machine learning models is critical. The threat becomes even more severe as banks increasingly adopt more accurate but potentially fragile neural networks. We aim to investigate the current state and dynamics of adversarial attacks and defenses for neural network models that use sequential financial data as the input.
+ To achieve this goal, we have designed a competition that allows realistic and detailed investigation of problems in modern financial transaction data. The participants compete directly against each other, so possible attacks and defenses are examined in close-to-real-life conditions. Our main contributions are an analysis of the competition dynamics that answers the questions of how important it is to conceal a model from malicious users, how long it takes to break it, and what techniques one should use to make it more robust, as well as the introduction of an additional way to attack models or increase their robustness.
+ Our analysis continues with a meta-study of the approaches used and their power, with numerical experiments and accompanying ablation studies. We show that the developed attacks and defenses outperform existing alternatives from the literature while being practical in terms of execution, proving the validity of the competition as a tool for uncovering vulnerabilities of machine learning models and mitigating them in various domains.
+
+
+
+
+
+
+ + ☆ Non-Redundant Combination of Hand-Crafted and Deep Learning Radiomics: + Application to the Early Detection of Pancreatic Cancer MICCAI 2023 + + +
+ We address the problem of learning Deep Learning Radiomics (DLR) that are not +redundant with Hand-Crafted Radiomics (HCR). To do so, we extract DLR features +using a VAE while enforcing their independence with HCR features by minimizing +their mutual information. The resulting DLR features can be combined with +hand-crafted ones and leveraged by a classifier to predict early markers of +cancer. We illustrate our method on four early markers of pancreatic cancer and +validate it on a large independent test set. Our results highlight the value of +combining non-redundant DLR and HCR features, as evidenced by an improvement in +the Area Under the Curve compared to baseline methods that do not address +redundancy or solely rely on HCR features. + +
+
+ comment: CaPTion workshop MICCAI 2023 +
+
+
+
+
+ + ☆ Targeted Data Augmentation for bias mitigation + + +
+ The development of fair and ethical AI systems requires careful consideration +of bias mitigation, an area often overlooked or ignored. In this study, we +introduce a novel and efficient approach for addressing biases called Targeted +Data Augmentation (TDA), which leverages classical data augmentation techniques +to tackle the pressing issue of bias in data and models. Unlike the laborious +task of removing biases, our method proposes to insert biases instead, +resulting in improved performance. To identify biases, we annotated two diverse +datasets: a dataset of clinical skin lesions and a dataset of male and female +faces. These bias annotations are published for the first time in this study, +providing a valuable resource for future research. Through Counterfactual Bias +Insertion, we discovered that biases associated with the frame, ruler, and +glasses had a significant impact on models. By randomly introducing biases +during training, we mitigated these biases and achieved a substantial decrease +in bias measures, ranging from two-fold to more than 50-fold, while maintaining +a negligible increase in the error rate. + +
+
+
+
+
+ + ☆ Interpretable Distribution-Invariant Fairness Measures for Continuous + Scores + + +
+ Measures of algorithmic fairness are usually discussed in the context of +binary decisions. We extend the approach to continuous scores. So far, +ROC-based measures have mainly been suggested for this purpose. Other existing +methods depend heavily on the distribution of scores, are unsuitable for +ranking tasks, or their effect sizes are not interpretable. Here, we propose a +distributionally invariant version of fairness measures for continuous scores +with a reasonable interpretation based on the Wasserstein distance. Our +measures are easily computable and well suited for quantifying and interpreting +the strength of group disparities as well as for comparing biases across +different models, datasets, or time points. We derive a link between the +different families of existing fairness measures for scores and show that the +proposed distributionally invariant fairness measures outperform ROC-based +fairness measures because they are more explicit and can quantify significant +biases that ROC-based fairness measures miss. Finally, we demonstrate their +effectiveness through experiments on the most commonly used fairness benchmark +datasets. + +
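+ A minimal sketch of a Wasserstein-distance-based disparity measure between the score distributions of two groups is shown below; the exact scaling and interpretation scheme used in the paper may differ, so treat this only as an illustration of the idea.
+
+import numpy as np
+from scipy.stats import wasserstein_distance
+
+def score_disparity(scores, groups):
+    # distance between the continuous score distributions of two protected groups
+    return wasserstein_distance(scores[groups == 0], scores[groups == 1])
+
+scores = np.random.rand(1000)                # model scores in [0, 1]
+groups = np.random.randint(0, 2, size=1000)  # binary protected attribute
+print(score_disparity(scores, groups))
+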
+
+
+
+
+ + ☆ How Much Temporal Long-Term Context is Needed for Action Segmentation? ICCV 2023 + + +
+ Modeling long-term context in videos is crucial for many fine-grained tasks +including temporal action segmentation. An interesting question that is still +open is how much long-term temporal context is needed for optimal performance. +While transformers can model the long-term context of a video, this becomes +computationally prohibitive for long videos. Recent works on temporal action +segmentation thus combine temporal convolutional networks with self-attentions +that are computed only for a local temporal window. While these approaches show +good results, their performance is limited by their inability to capture the +full context of a video. In this work, we try to answer how much long-term +temporal context is required for temporal action segmentation by introducing a +transformer-based model that leverages sparse attention to capture the full +context of a video. We compare our model with the current state of the art on +three datasets for temporal action segmentation, namely 50Salads, Breakfast, +and Assembly101. Our experiments show that modeling the full context of a video +is necessary to obtain the best performance for temporal action segmentation. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Machine learning assisted exploration for affine Deligne-Lusztig + varieties + + +
+ This paper presents a novel, interdisciplinary study that leverages a Machine Learning (ML) assisted framework to explore the geometry of affine Deligne-Lusztig varieties (ADLV). The primary objective is to investigate the nonemptiness pattern, dimension and enumeration of irreducible components of ADLV. Our proposed framework demonstrates a recursive pipeline of data generation, model training, pattern analysis, and human examination, presenting an intricate interplay between ML and pure mathematical research. Notably, our data-generation process is nuanced, emphasizing the selection of meaningful subsets and appropriate feature sets. We demonstrate that this framework has the potential to accelerate pure mathematical research, leading to the discovery of new conjectures and promising research directions that could otherwise take significant time to uncover. We rediscover the virtual dimension formula and provide a full mathematical proof of a newly identified problem concerning a certain lower bound of dimension. Furthermore, we extend an open invitation to the readers by providing the source code for computing ADLV and the ML models, promoting further explorations. This paper concludes by sharing valuable experiences and highlighting lessons learned from this collaboration.
+
+
+
+ comment: 36 pages +
+
+
+
+
+ + ☆ Careful at Estimation and Bold at Exploration + + +
+ Exploration strategies in continuous action space are often heuristic due to the infinite number of actions, and these kinds of methods cannot derive a general conclusion. In prior work, it has been shown that policy-based exploration is beneficial for continuous action space in deterministic policy reinforcement learning (DPRL). However, policy-based exploration in DPRL has two prominent issues: aimless exploration and policy divergence, and the policy gradient for exploration is only sometimes helpful due to inaccurate estimation. Based on the double-Q function framework, we introduce a novel exploration strategy to mitigate these issues, separate from the policy gradient. We first propose the greedy Q softmax update schema for the Q value update. The expected Q value is derived as a weighted sum of the conservative Q values over actions, where the weights come from the corresponding greedy Q values. Greedy Q takes the maximum value of the two Q functions, and conservative Q takes the minimum value of the two different Q functions. For practicality, this theoretical basis is then extended to allow us to combine action exploration with the Q value update, under the premise that we have a surrogate policy that behaves like this exploration policy. In practice, we construct such an exploration policy with a few sampled actions, and to meet the premise, we learn such a surrogate policy by minimizing the KL divergence between the target policy and the exploration policy constructed by the conservative Q. We evaluate our method on the MuJoCo benchmark and demonstrate superior performance compared to previous state-of-the-art methods across various environments, particularly in the most complex Humanoid environment.
+
+
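+ A toy numpy sketch of the greedy-Q-weighted target over a few sampled actions is shown below; the softmax temperature and the use of a softmax over the greedy Q values are illustrative assumptions about the update described above, not the authors' exact formulation.
+
+import numpy as np
+
+def greedy_softmax_target(q1_vals, q2_vals, tau=1.0):
+    # greedy Q = max of the two critics, conservative Q = min of the two critics
+    greedy_q = np.maximum(q1_vals, q2_vals)
+    conservative_q = np.minimum(q1_vals, q2_vals)
+    w = np.exp((greedy_q - greedy_q.max()) / tau)
+    w = w / w.sum()                            # weights derived from greedy Q values
+    return float(np.sum(w * conservative_q))   # weighted sum of conservative Q values
+
+# Q-values of three sampled actions under the two critics
+print(greedy_softmax_target(np.array([1.2, 0.7, 0.9]), np.array([1.0, 0.8, 1.1])))
+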
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ ProAgent: Building Proactive Cooperative AI with Large Language Models + + +
+ Building AIs with adaptive behaviors in human-AI cooperation stands as a +pivotal focus in AGI research. Current methods for developing cooperative +agents predominantly rely on learning-based methods, where policy +generalization heavily hinges on past interactions with specific teammates. +These approaches constrain the agent's capacity to recalibrate its strategy +when confronted with novel teammates. We propose \textbf{ProAgent}, a novel +framework that harnesses large language models (LLMs) to fashion a +\textit{pro}active \textit{agent} empowered with the ability to anticipate +teammates' forthcoming decisions and formulate enhanced plans for itself. +ProAgent excels at cooperative reasoning with the capacity to dynamically adapt +its behavior to enhance collaborative efforts with teammates. Moreover, the +ProAgent framework exhibits a high degree of modularity and interpretability, +facilitating seamless integration to address a wide array of coordination +scenarios. Experimental evaluations conducted within the framework of +\textit{Overcook-AI} unveil the remarkable performance superiority of ProAgent, +outperforming five methods based on self-play and population-based training in +cooperation with AI agents. Further, when cooperating with human proxy models, +its performance exhibits an average improvement exceeding 10\% compared to the +current state-of-the-art, COLE. The advancement was consistently observed +across diverse scenarios involving interactions with both AI agents of varying +characteristics and human counterparts. These findings inspire future research +for human-robot collaborations. For a hands-on demonstration, please visit +\url{https://pku-proagent.github.io}. + +
+
+
+
+
+ + ☆ Protect Federated Learning Against Backdoor Attacks via Data-Free + Trigger Generation + + +
+ As a distributed machine learning paradigm, Federated Learning (FL) enables large-scale clients to collaboratively train a model without sharing their raw data. However, due to the lack of data auditing for untrusted clients, FL is vulnerable to poisoning attacks, especially backdoor attacks. By using poisoned data for local training or directly changing the model parameters, attackers can easily inject backdoors into the model, which can cause the model to misclassify targeted patterns in images. To address these issues, we propose a novel data-free trigger-generation-based defense approach based on the two characteristics of backdoor attacks: i) triggers are learned faster than normal knowledge, and ii) trigger patterns have a greater effect on image classification than normal class patterns. Our approach generates the images with newly learned knowledge by identifying the differences between the old and new global models, and filters trigger images by evaluating the effect of these generated images. By using these trigger images, our approach eliminates poisoned models to ensure the updated global model is benign. Comprehensive experiments demonstrate that our approach can defend against almost all the existing types of backdoor attacks and outperform all seven state-of-the-art defense methods under both IID and non-IID scenarios. Notably, our approach can successfully defend against the backdoor attack even when 80\% of the clients are malicious.
+
+
+
+
+
+
+ + ☆ Uncertainty Estimation of Transformers' Predictions via Topological + Analysis of the Attention Matrices + + +
+ Determining the degree of confidence of a deep learning model in its prediction is an open problem in the field of natural language processing. Most of the classical methods for uncertainty estimation are quite weak for text classification models. We set the task of obtaining an uncertainty estimate for neural networks based on the Transformer architecture. A key feature of such models is the attention mechanism, which supports the information flow between the hidden representations of tokens in the neural network. We explore the formed relationships between internal representations using Topological Data Analysis methods and utilize them to predict the model's confidence. In this paper, we propose a method for uncertainty estimation based on the topological properties of the attention mechanism and compare it with classical methods. As a result, the proposed algorithm surpasses the existing methods in quality and opens up a new area of application of the attention mechanism, but requires the selection of topological features.
+
+
+
+
+
+
+ + ☆ Network Momentum across Asset Classes + + +
+ We investigate the concept of network momentum, a novel trading signal +derived from momentum spillover across assets. Initially observed within the +confines of pairwise economic and fundamental ties, such as the stock-bond +connection of the same company and stocks linked through supply-demand chains, +momentum spillover implies a propagation of momentum risk premium from one +asset to another. The similarity of momentum risk premium, exemplified by +co-movement patterns, has been spotted across multiple asset classes including +commodities, equities, bonds and currencies. However, studying the network +effect of momentum spillover across these classes has been challenging due to a +lack of readily available common characteristics or economic ties beyond the +company level. In this paper, we explore the interconnections of momentum +features across a diverse range of 64 continuous future contracts spanning +these four classes. We utilise a linear and interpretable graph learning model +with minimal assumptions to reveal the intricacies of the momentum spillover +network. By leveraging the learned networks, we construct a network momentum +strategy that exhibits a Sharpe ratio of 1.5 and an annual return of 22%, after +volatility scaling, from 2000 to 2022. This paper pioneers the examination of +momentum spillover across multiple asset classes using only pricing data, +presents a multi-asset investment strategy based on network momentum, and +underscores the effectiveness of this strategy through robust empirical +analysis. + +
+
+ comment: 27 pages +
+
+
+
+
+ + ☆ Improving Knot Prediction in Wood Logs with Longitudinal Feature + Propagation + + +
+ The quality of a wood log in the wood industry depends heavily on the presence of both outer and inner defects, including inner knots that are a result of the growth of tree branches. Today, locating the inner knots requires the use of expensive equipment such as X-ray scanners. In this paper, we address the task of predicting the location of inner defects from the outer shape of the logs. The dataset is built by extracting both the contours and the knots with X-ray measurements. We propose to solve this binary segmentation task by leveraging convolutional recurrent neural networks. Once the neural network is trained, inference can be performed from the outer shape measured with cheap devices such as laser profilers. We demonstrate the effectiveness of our approach on fir and spruce tree species and perform an ablation on the recurrence to demonstrate its importance.
+
+
+
+
+
+
+ + ☆ ShadowNet for Data-Centric Quantum System Learning + + +
+ Understanding the dynamics of large quantum systems is hindered by the curse +of dimensionality. Statistical learning offers new possibilities in this regime +by neural-network protocols and classical shadows, while both methods have +limitations: the former is plagued by the predictive uncertainty and the latter +lacks the generalization ability. Here we propose a data-centric learning +paradigm combining the strength of these two approaches to facilitate diverse +quantum system learning (QSL) tasks. Particularly, our paradigm utilizes +classical shadows along with other easily obtainable information of quantum +systems to create the training dataset, which is then learnt by neural networks +to unveil the underlying mapping rule of the explored QSL problem. Capitalizing +on the generalization power of neural networks, this paradigm can be trained +offline and excel at predicting previously unseen systems at the inference +stage, even with few state copies. Besides, it inherits the characteristic of +classical shadows, enabling memory-efficient storage and faithful prediction. +These features underscore the immense potential of the proposed data-centric +approach in discovering novel and large-scale quantum systems. For +concreteness, we present the instantiation of our paradigm in quantum state +tomography and direct fidelity estimation tasks and conduct numerical analysis +up to 60 qubits. Our work showcases the profound prospects of data-centric +artificial intelligence to advance QSL in a faithful and generalizable manner. + +
+
+
+
+
+ + ☆ Test Time Embedding Normalization for Popularity Bias Mitigation CIKM 2023 + + +
+ Popularity bias is a widespread problem in the field of recommender systems, where popular items tend to dominate recommendation results. In this work, we propose 'Test Time Embedding Normalization' as a simple yet effective strategy for mitigating popularity bias, which surpasses the performance of the previous mitigation approaches by a significant margin. Our approach utilizes the normalized item embedding during the inference stage to control the influence of embedding magnitude, which is highly correlated with item popularity. Through extensive experiments, we show that our method combined with the sampled softmax loss effectively reduces popularity bias compared to previous approaches for bias mitigation. We further investigate the relationship between user and item embeddings and find that the angular similarity between embeddings distinguishes preferable and non-preferable items regardless of their popularity. The analysis explains the mechanism behind the success of our approach in eliminating the impact of popularity bias. Our code is available at https://github.com/ml-postech/TTEN.
+
+
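+ The core idea is simple enough to show in a few lines: item embeddings are L2-normalised only at inference so that popularity-correlated magnitudes do not dominate the scores. The sketch below is an illustrative stand-in, not the released code at the repository above.
+
+import numpy as np
+
+def recommend(user_emb, item_embs, k=10, normalize_items=True):
+    items = item_embs
+    if normalize_items:  # test-time normalisation removes magnitude (popularity) effects
+        items = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True)
+    scores = items @ user_emb
+    return np.argsort(-scores)[:k]
+
+user = np.random.randn(64)
+items = np.random.randn(10000, 64)
+print(recommend(user, items, k=5))
+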
+
+ comment: 5 pages, CIKM 2023 +
+
+
+
+
+ + ☆ CNN based Cuneiform Sign Detection Learned from Annotated 3D Renderings + and Mapped Photographs with Illumination Augmentation ICCV23 + + +
+ Motivated by the challenges of the Digital Ancient Near Eastern Studies (DANES) community, we develop digital tools for processing cuneiform script, a 3D script imprinted into clay tablets that was used for more than three millennia and for at least eight major languages. It consists of thousands of characters that have changed over time and space. Photographs are the most common representations usable for machine learning, while ink drawings are prone to interpretation; 3D datasets, which are best suited, are only now becoming available. We created and used the HeiCuBeDa and MaiCuBeDa datasets, which consist of around 500 annotated tablets. For our novel OCR-like approach to mixed image data, we provide an additional mapping tool for transferring annotations between 3D renderings and photographs. Our sign localization uses a RepPoints detector to predict the locations of characters as bounding boxes. We use image data from GigaMesh's MSII (curvature, see https://gigamesh.eu) based rendering, Phong-shaded 3D models, and photographs as well as illumination augmentation. The results show that using rendered 3D images for sign detection performs better than other work on photographs. In addition, our approach gives reasonably good results for photographs only, while it is best used for mixed datasets. More importantly, the Phong renderings, and especially the MSII renderings, improve the results on photographs, which constitute the largest dataset on a global scale.
+
+
+
+ comment: This paper was accepted to ICCV23 and includes the DOI for an Open + Access Dataset with annotated cuneiform script +
+
+
+
+
+ + ☆ FoX: Formation-aware exploration in multi-agent reinforcement learning AAAI + + +
+ Recently, deep multi-agent reinforcement learning (MARL) has gained +significant popularity due to its success in various cooperative multi-agent +tasks. However, exploration still remains a challenging problem in MARL due to +the partial observability of the agents and the exploration space that can grow +exponentially as the number of agents increases. Firstly, in order to address +the scalability issue of the exploration space, we define a formation-based +equivalence relation on the exploration space and aim to reduce the search +space by exploring only meaningful states in different formations. Then, we +propose a novel formation-aware exploration (FoX) framework that encourages +partially observable agents to visit the states in diverse formations by +guiding them to be well aware of their current formation solely based on their +own observations. Numerical results show that the proposed FoX framework +significantly outperforms the state-of-the-art MARL algorithms on Google +Research Football (GRF) and sparse Starcraft II multi-agent challenge (SMAC) +tasks. + +
+
+ comment: 7 pages main, 5 pages appendix with reference. 10 figures, submitted + for AAAI +
+
+
+
+
+ + ☆ Quantum-Inspired Machine Learning: a Survey + + +
+ Quantum-inspired Machine Learning (QiML) is a burgeoning field, receiving +global attention from researchers for its potential to leverage principles of +quantum mechanics within classical computational frameworks. However, current +review literature often presents a superficial exploration of QiML, focusing +instead on the broader Quantum Machine Learning (QML) field. In response to +this gap, this survey provides an integrated and comprehensive examination of +QiML, exploring QiML's diverse research domains including tensor network +simulations, dequantized algorithms, and others, showcasing recent +advancements, practical applications, and illuminating potential future +research avenues. Further, a concrete definition of QiML is established by +analyzing various prior interpretations of the term and their inherent +ambiguities. As QiML continues to evolve, we anticipate a wealth of future +developments drawing from quantum mechanics, quantum computing, and classical +machine learning, enriching the field further. This survey serves as a guide +for researchers and practitioners alike, providing a holistic understanding of +QiML's current landscape and future directions. + +
+
+ comment: 56 pages, 13 figures, 8 tables +
+
+
+
+
+ + ☆ Robust Lagrangian and Adversarial Policy Gradient for Robust Constrained + Markov Decision Processes + + +
+ The robust constrained Markov decision process (RCMDP) is a recent +task-modelling framework for reinforcement learning that incorporates +behavioural constraints and that provides robustness to errors in the +transition dynamics model through the use of an uncertainty set. Simulating +RCMDPs requires computing the worst-case dynamics based on value estimates for +each state, an approach which has previously been used in the Robust +Constrained Policy Gradient (RCPG). Highlighting potential downsides of RCPG +such as not robustifying the full constrained objective and the lack of +incremental learning, this paper introduces two algorithms, called RCPG with +Robust Lagrangian and Adversarial RCPG. RCPG with Robust Lagrangian modifies +RCPG by taking the worst-case dynamics based on the Lagrangian rather than +either the value or the constraint. Adversarial RCPG also formulates the +worst-case dynamics based on the Lagrangian but learns this directly and +incrementally as an adversarial policy through gradient descent rather than +indirectly and abruptly through constrained optimisation on a sorted value +list. A theoretical analysis first derives the Lagrangian policy gradient for +the policy optimisation of both proposed algorithms and then the adversarial +policy gradient to learn the adversary for Adversarial RCPG. Empirical +experiments injecting perturbations in inventory management and safe navigation +tasks demonstrate the competitive performance of both algorithms compared to +traditional RCPG variants as well as non-robust and non-constrained ablations. +In particular, Adversarial RCPG ranks among the top two performing algorithms +on all tests. + +
+
+
+
+
+ + ☆ Efficient Last-iterate Convergence Algorithms in Solving Games + + +
+ No-regret algorithms are popular for learning Nash equilibrium (NE) in two-player zero-sum normal-form games (NFGs) and extensive-form games (EFGs). Many recent works consider no-regret algorithms with last-iterate convergence. Among them, the two most famous algorithms are Optimistic Gradient Descent Ascent (OGDA) and Optimistic Multiplicative Weight Update (OMWU). However, OGDA has high per-iteration complexity. OMWU exhibits a lower per-iteration complexity but poorer empirical performance, and its convergence holds only when NE is unique. Recent works propose a Reward Transformation (RT) framework for MWU, which removes the uniqueness condition and achieves competitive performance with OMWU. Unfortunately, RT-based algorithms perform worse than OGDA under the same number of iterations, and their convergence guarantee is based on the continuous-time feedback assumption, which does not hold in most scenarios. To address these issues, we provide a closer analysis of the RT framework, which holds for both continuous and discrete-time feedback. We demonstrate that the essence of the RT framework is to transform the problem of learning NE in the original game into a series of strongly convex-concave optimization problems (SCCPs). We show that the bottleneck of RT-based algorithms is the speed of solving SCCPs. To improve their empirical performance, we design a novel transformation method so that the SCCPs can be solved by Regret Matching+ (RM+), a no-regret algorithm with better empirical performance, resulting in Reward Transformation RM+ (RTRM+). RTRM+ enjoys last-iterate convergence under the discrete-time feedback setting. Using the counterfactual regret decomposition framework, we propose Reward Transformation CFR+ (RTCFR+) to extend RTRM+ to EFGs. Experimental results show that our algorithms significantly outperform existing last-iterate convergence algorithms and RM+ (CFR+).
+
+
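+ Since the construction above routes the transformed problems through Regret Matching+ (RM+), a minimal self-contained RM+ self-play loop for a two-player zero-sum matrix game is sketched below; it illustrates plain RM+ only, not the reward-transformation variant RTRM+ proposed in the paper.
+
+import numpy as np
+
+def rm_plus(payoff, iters=5000):
+    # Regret Matching+ self-play on a zero-sum matrix game (row player maximises)
+    n, m = payoff.shape
+    r1, r2 = np.zeros(n), np.zeros(m)      # cumulative regrets, clipped at zero
+    avg1, avg2 = np.zeros(n), np.zeros(m)  # average strategies
+    for _ in range(iters):
+        s1 = r1 / r1.sum() if r1.sum() > 0 else np.full(n, 1.0 / n)
+        s2 = r2 / r2.sum() if r2.sum() > 0 else np.full(m, 1.0 / m)
+        u1 = payoff @ s2            # row player's action values against s2
+        u2 = -(s1 @ payoff)         # column player's action values against s1
+        r1 = np.maximum(r1 + u1 - s1 @ u1, 0.0)   # RM+ clips negative regrets
+        r2 = np.maximum(r2 + u2 - s2 @ u2, 0.0)
+        avg1 += s1
+        avg2 += s2
+    return avg1 / iters, avg2 / iters
+
+# rock-paper-scissors: average strategies approach the uniform equilibrium
+rps = np.array([[0, -1, 1], [1, 0, -1], [-1, 1, 0]], dtype=float)
+print(rm_plus(rps))
+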
+
+
+
+
+ + ☆ A survey on bias in machine learning research + + +
+ Current research on bias in machine learning often focuses on fairness, while overlooking the roots or causes of bias. However, bias was originally defined as a "systematic error," often caused by humans at different stages of the research process. This article aims to bridge the gap with past literature on bias in research by providing a taxonomy of potential sources of bias and errors in data and models. The paper focuses on bias in machine learning pipelines and analyses over forty potential sources of bias in the machine learning (ML) pipeline, providing clear examples for each. By understanding the sources and consequences of bias in machine learning, better methods can be developed for detecting and mitigating it, leading to fairer, more transparent, and more accurate ML models.
+
+
+
+ comment: Submitted to journal. arXiv admin note: substantial text overlap with + arXiv:2308.09464 +
+
+
+
+
+ + ☆ Multi-Source Domain Adaptation for Cross-Domain Fault Diagnosis of + Chemical Processes + + +
+ Fault diagnosis is an essential component in process supervision. Indeed, it determines which kind of fault has occurred, given that it has been previously detected, allowing for appropriate intervention. Automatic fault diagnosis systems use machine learning for predicting the fault type from sensor readings. Nonetheless, these models are sensitive to changes in the data distributions, which may be caused by changes in the monitored process, such as changes in the mode of operation. This scenario is known as Cross-Domain Fault Diagnosis (CDFD). We provide an extensive comparison of single and multi-source unsupervised domain adaptation (SSDA and MSDA respectively) algorithms for CDFD. We study these methods in the context of the Tennessee Eastman Process, a widely used benchmark in the chemical industry. We show that using multiple domains during training has a positive effect, even when no adaptation is employed. As such, the MSDA baseline improves over the SSDA baseline classification accuracy by 23% on average. In addition, under the multiple-sources scenario, we improve the classification accuracy of the no-adaptation setting by 8.4% on average.
+
+
+
+ comment: 18 pages,15 figures +
+
+
+
+
+ + ☆ An Effective Transformer-based Contextual Model and Temporal Gate + Pooling for Speaker Identification + + +
+ Wav2vec2 has achieved success in applying Transformer architecture and self-supervised learning to speech recognition. Recently, these have come to be used not only for speech recognition but also across speech processing as a whole. This paper introduces an effective end-to-end speaker identification model that applies a Transformer-based contextual model. We explored the relationship between the parameters and the performance in order to discern the structure of an effective model. Furthermore, we propose a pooling method, Temporal Gate Pooling, with powerful learning ability for speaker identification. We applied a Conformer as the encoder and BEST-RQ for pre-training, and conducted an evaluation utilizing the speaker identification task of VoxCeleb1. The proposed method has achieved an accuracy of 85.9% with 28.5M parameters, demonstrating comparable precision to wav2vec2 with 317.7M parameters. Code is available at https://github.com/HarunoriKawano/speaker-identification-with-tgp.
+
+
+
+
+
+
+ + ☆ Minwise-Independent Permutations with Insertion and Deletion of Features + + +
+ In their seminal work, Broder \textit{et al.}~\citep{BroderCFM98} introduce the $\mathrm{minHash}$ algorithm that computes a low-dimensional sketch of high-dimensional binary data that closely approximates pairwise Jaccard similarity. Since its invention, $\mathrm{minHash}$ has been commonly used by practitioners in various big data applications. Further, in many real-life scenarios the data is dynamic and its feature set evolves over time. We consider the case when features are dynamically inserted and deleted in the dataset. We note that a naive solution to this problem is to repeatedly recompute $\mathrm{minHash}$ with respect to the updated dimension. However, this is an expensive task as it requires generating fresh random permutations. To the best of our knowledge, no systematic study of $\mathrm{minHash}$ is recorded in the context of dynamic insertion and deletion of features. In this work, we initiate this study and suggest algorithms that make the $\mathrm{minHash}$ sketches adaptable to the dynamic insertion and deletion of features. We provide a rigorous theoretical analysis of our algorithms and complement it with extensive experiments on several real-world datasets. Empirically, we observe a significant speed-up in the running time while simultaneously offering comparable performance with respect to running $\mathrm{minHash}$ from scratch. Our proposal is efficient, accurate, and easy to implement in practice.
+
+
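+ For readers unfamiliar with the basic primitive, a minimal minHash sketch over sets of feature ids is shown below; the universal-hash parameters are illustrative, and the snippet does not include the dynamic insertion/deletion machinery studied in the paper.
+
+import random
+
+class MinHash:
+    # classic minHash: one signature slot per hash function
+    def __init__(self, num_hashes=128, prime=(1 << 61) - 1, seed=0):
+        rng = random.Random(seed)
+        self.prime = prime
+        self.params = [(rng.randrange(1, prime), rng.randrange(prime))
+                       for _ in range(num_hashes)]
+
+    def signature(self, feature_ids):
+        return [min((a * f + b) % self.prime for f in feature_ids)
+                for a, b in self.params]
+
+def estimated_jaccard(sig_a, sig_b):
+    return sum(x == y for x, y in zip(sig_a, sig_b)) / len(sig_a)
+
+mh = MinHash()
+a, b = {1, 2, 3, 4, 5}, {3, 4, 5, 6, 7}
+print(estimated_jaccard(mh.signature(a), mh.signature(b)))   # roughly 3/7
+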
+
+
+
+
+ + ☆ Federated Learning on Patient Data for Privacy-Protecting Polycystic + Ovary Syndrome Treatment + + +
+ The field of women's endocrinology has trailed behind data-driven medical +solutions, largely due to concerns over the privacy of patient data. Valuable +datapoints about hormone levels or menstrual cycling could expose patients who +suffer from comorbidities or terminate a pregnancy, violating their privacy. We +explore the application of Federated Learning (FL) to predict the optimal drug +for patients with polycystic ovary syndrome (PCOS). PCOS is a serious hormonal +disorder impacting millions of women worldwide, yet it's poorly understood and +its research is stunted by a lack of patient data. We demonstrate that a +variety of FL approaches succeed on a synthetic PCOS patient dataset. Our +proposed FL models are a tool to access massive quantities of diverse data and +identify the most effective treatment option while providing PCOS patients with +privacy guarantees. + +
+
+
+
+
+ + ☆ Federated Learning in Big Model Era: Domain-Specific Multimodal Large + Models + + +
+ Multimodal data, which can comprehensively perceive and recognize the physical world, has become an essential path towards general artificial intelligence. However, multimodal large models trained on public datasets often underperform in specific industrial domains. This paper proposes a multimodal federated learning framework that enables multiple enterprises to utilize private domain data to collaboratively train large models for vertical domains, achieving intelligent services across scenarios. The authors discuss in depth the strategic transformation of federated learning in terms of intelligence foundation and objectives in the era of big models, as well as the new challenges faced in heterogeneous data, model aggregation, performance and cost trade-offs, data privacy, and incentive mechanisms. The paper elaborates a case study of leading enterprises contributing multimodal data and expert knowledge to city safety operation management, including distributed deployment and efficient coordination of the federated learning platform, technical innovations on data quality improvement based on large model capabilities and efficient joint fine-tuning approaches. Preliminary experiments show that enterprises can enhance and accumulate intelligent capabilities through multimodal model federated learning, thereby jointly creating a smart city model that provides high-quality intelligent services covering energy infrastructure safety, residential community security, and urban operation management. The established federated learning cooperation ecosystem is expected to further aggregate industry, academia, and research resources, realize large models in multiple vertical domains, and promote the large-scale industrial application of artificial intelligence and cutting-edge research on multimodal federated learning.
+
+
+
+
+
+
+ + ☆ Hamiltonian GAN + + +
+ A growing body of work leverages the Hamiltonian formalism as an inductive +bias for physically plausible neural network based video generation. The +structure of the Hamiltonian ensures conservation of a learned quantity (e.g., +energy) and imposes a phase-space interpretation on the low-dimensional +manifold underlying the input video. While this interpretation has the +potential to facilitate the integration of learned representations in +downstream tasks, existing methods are limited in their applicability as they +require a structural prior for the configuration space at design time. In this +work, we present a GAN-based video generation pipeline with a learned +configuration space map and Hamiltonian neural network motion model, to learn a +representation of the configuration space from data. We train our model with a +physics-inspired cyclic-coordinate loss function which encourages a minimal +representation of the configuration space and improves interpretability. We +demonstrate the efficacy and advantages of our approach on the Hamiltonian +Dynamics Suite Toy Physics dataset. + +
+
+
+
+
+ + ☆ A Simple Framework for Multi-mode Spatial-Temporal Data Modeling + + +
+ Spatial-temporal data modeling aims to mine the underlying spatial +relationships and temporal dependencies of objects in a system. However, most +existing methods focus on the modeling of spatial-temporal data in a single +mode, lacking the understanding of multiple modes. Though very few methods have +been presented to learn the multi-mode relationships recently, they are built +on complicated components with higher model complexities. In this paper, we +propose a simple framework for multi-mode spatial-temporal data modeling to +bring both effectiveness and efficiency together. Specifically, we design a +general cross-mode spatial relationships learning component to adaptively +establish connections between multiple modes and propagate information along +the learned connections. Moreover, we employ multi-layer perceptrons to capture +the temporal dependencies and channel correlations, which are conceptually and +technically succinct. Experiments on three real-world datasets show that our +model can consistently outperform the baselines with lower space and time +complexity, opening up a promising direction for modeling spatial-temporal +data. The generalizability of the cross-mode spatial relationships learning +module is also validated. + +
+
+
+
+
+ + ☆ SegRNN: Segment Recurrent Neural Network for Long-Term Time Series + Forecasting + + +
+ RNN-based methods have faced challenges in the Long-term Time Series Forecasting (LTSF) domain when dealing with excessively long look-back windows and forecast horizons. Consequently, the dominance in this domain has shifted towards Transformer, MLP, and CNN approaches. The substantial number of recurrent iterations is the fundamental reason behind the limitations of RNNs in LTSF. To address these issues, we propose two novel strategies to reduce the number of iterations in RNNs for LTSF tasks: Segment-wise Iterations and Parallel Multi-step Forecasting (PMF). The RNN that combines these strategies, namely SegRNN, significantly reduces the required recurrent iterations for LTSF, resulting in notable improvements in forecast accuracy and inference speed. Extensive experiments demonstrate that SegRNN not only outperforms SOTA Transformer-based models but also reduces runtime and memory usage by more than 78%. These achievements provide strong evidence that RNNs continue to excel in LTSF tasks and encourage further exploration of this domain with more RNN-based approaches. The source code is coming soon.
+
+
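+ A rough PyTorch sketch of the segment-wise idea follows: the look-back window is split into segments, each segment is embedded as a single token for the GRU, and all forecast steps are emitted in parallel from the final hidden state. Layer sizes and projection choices are illustrative, not the SegRNN architecture itself.
+
+import torch
+import torch.nn as nn
+
+class SegmentRNN(nn.Module):
+    # segment-wise GRU with parallel multi-step forecasting (illustrative only)
+    def __init__(self, seg_len=48, d_model=128, horizon=96):
+        super().__init__()
+        self.seg_len = seg_len
+        self.embed = nn.Linear(seg_len, d_model)   # one embedding per segment
+        self.rnn = nn.GRU(d_model, d_model, batch_first=True)
+        self.head = nn.Linear(d_model, horizon)    # all horizon steps at once
+
+    def forward(self, x):                 # x: (batch, look_back)
+        b, t = x.shape
+        segs = x.reshape(b, t // self.seg_len, self.seg_len)
+        h = self.embed(segs)              # (batch, num_segments, d_model)
+        _, last = self.rnn(h)             # final hidden state
+        return self.head(last[-1])        # (batch, horizon)
+
+model = SegmentRNN()
+y = model(torch.randn(8, 336))            # 336-step look-back -> 96-step forecast
+print(y.shape)
+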
+
+
+
+
+ + ☆ ConcatPlexer: Additional Dim1 Batching for Faster ViTs + + +
+ Transformers have demonstrated tremendous success not only in the natural language processing (NLP) domain but also in the field of computer vision, igniting various creative approaches and applications. Yet, the superior performance and modeling flexibility of transformers came with a severe increase in computation costs, and hence several works have proposed methods to reduce this burden. Inspired by a cost-cutting method originally proposed for language models, Data Multiplexing (DataMUX), we propose a novel approach for efficient visual recognition that employs additional dim1 batching (i.e., concatenation) that greatly improves the throughput with little compromise in the accuracy. We first introduce a naive adaptation of DataMux for vision models, Image Multiplexer, and devise novel components to overcome its weaknesses, rendering our final model, ConcatPlexer, at the sweet spot between inference speed and accuracy. The ConcatPlexer was trained on the ImageNet1K and CIFAR100 datasets and achieved 23.5% fewer GFLOPs than ViT-B/16, with 69.5% and 83.4% validation accuracy, respectively.
+
+
+
+
+
+
+ + ☆ Toward Generalizable Machine Learning Models in Speech, Language, and + Hearing Sciences: Power Analysis and Sample Size Estimation + + +
+ This study's first purpose is to provide quantitative evidence that would incentivize researchers to instead use the more robust method of nested cross-validation. The second purpose is to present methods and MATLAB codes for doing power analysis for ML-based analysis during the design of a study. Monte Carlo simulations were used to quantify the interactions between the employed cross-validation method, the discriminative power of features, the dimensionality of the feature space, and the dimensionality of the model. Four different cross-validations (single holdout, 10-fold, train-validation-test, and nested 10-fold) were compared based on the statistical power and statistical confidence of the ML models. Distributions of the null and alternative hypotheses were used to determine the minimum required sample size for obtaining a statistically significant outcome ($\alpha$=0.05, $1-\beta$=0.8). Statistical confidence of the model was defined as the probability of correct features being selected and hence being included in the final model. Our analysis showed that the model generated based on the single holdout method had very low statistical power and statistical confidence and that it significantly overestimated the accuracy. Conversely, the nested 10-fold cross-validation resulted in the highest statistical confidence and the highest statistical power, while providing an unbiased estimate of the accuracy. The required sample size with a single holdout could be 50% higher than what would be needed if nested cross-validation were used. Confidence in the model based on nested cross-validation was as much as four times higher than the confidence in the single holdout-based model. A computational model, MATLAB codes, and lookup tables are provided to assist researchers with estimating the sample size during the design of their future studies.
+
+
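+ The contrast between the two validation schemes can be reproduced in a few lines of scikit-learn; the classifier, parameter grid, and synthetic data below are placeholders rather than the paper's MATLAB protocol.
+
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
+from sklearn.svm import SVC
+
+X, y = make_classification(n_samples=200, n_features=30, n_informative=5, random_state=0)
+grid = {"C": [0.1, 1, 10]}
+
+# single holdout: tune and evaluate on one split (optimistically biased)
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
+holdout = GridSearchCV(SVC(), grid, cv=5).fit(X_tr, y_tr).score(X_te, y_te)
+
+# nested CV: inner loop tunes hyperparameters, outer loop estimates generalisation
+nested = cross_val_score(GridSearchCV(SVC(), grid, cv=5), X, y, cv=10)
+
+print(f"holdout accuracy: {holdout:.3f}")
+print(f"nested 10-fold accuracy: {nested.mean():.3f} +/- {nested.std():.3f}")
+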
+
+ comment: Under review at JSLHR +
+
+
+
+
+ + ☆ Automatic Task Parallelization of Dataflow Graphs in ML/DL models + + +
+ Several methods exist today to accelerate Machine Learning (ML) or Deep Learning (DL) model performance for training and inference. However, modern techniques based on various graph and operator parallelism methodologies rely on search space optimizations, which are costly in terms of power and hardware usage. Especially in the case of inference, when the batch size is 1 and execution is on CPUs or for power-constrained edge devices, current techniques can become costly, complicated or inapplicable. To ameliorate this, we present a Critical-Path-based Linear Clustering approach to exploit inherent parallel paths in ML dataflow graphs. Our task parallelization approach further optimizes the structure of graphs via cloning and prunes them via constant propagation and dead-code elimination. Contrary to other work, we generate readable and executable parallel Pytorch+Python code from input ML models in ONNX format via a new tool that we have built called {\bf Ramiel}. This allows us to benefit from other downstream acceleration techniques like intra-op parallelism and potentially pipeline parallelism. Our preliminary results on several ML graphs demonstrate up to 1.9$\times$ speedup over serial execution and outperform some of the current mechanisms in both compile and runtimes. Lastly, our methods are lightweight and fast enough so that they can be used effectively for power and resource-constrained devices, while still enabling downstream optimizations.
+
+
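+ As a small illustration of critical-path reasoning on an operator dataflow DAG, the networkx sketch below finds the weighted longest path and groups the remaining operators into levels that could run in parallel; the toy graph and costs are placeholders, not output of the Ramiel tool.
+
+import networkx as nx
+
+# toy operator dataflow graph with per-edge cost
+g = nx.DiGraph()
+g.add_weighted_edges_from([
+    ("input", "conv1", 5), ("input", "conv2", 3),
+    ("conv1", "add", 2), ("conv2", "add", 2), ("add", "output", 1),
+])
+
+critical = nx.dag_longest_path(g, weight="weight")   # operators that bound latency
+levels = list(nx.topological_generations(g))         # operators runnable in parallel
+
+print("critical path:", critical)
+print("parallel levels:", levels)
+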
+
+
+
+
+ + ☆ Diversity Measures: Domain-Independent Proxies for Failure in Language + Model Queries + + +
+ Error prediction in large language models often relies on domain-specific +information. In this paper, we present measures for quantification of error in +the response of a large language model based on the diversity of responses to a +given prompt - hence independent of the underlying application. We describe how +three such measures - based on entropy, Gini impurity, and centroid distance - +can be employed. We perform a suite of experiments on multiple datasets and +temperature settings to demonstrate that these measures strongly correlate with +the probability of failure. Additionally, we present empirical results +demonstrating how these measures can be applied to few-shot prompting, +chain-of-thought reasoning, and error detection. + +
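+ A minimal sketch of the entropy- and Gini-style diversity computation over repeated responses to one prompt is given below; it groups exact duplicates only, whereas clustering of near-duplicate responses and the centroid-distance measure are omitted.
+
+from collections import Counter
+import math
+
+def diversity_measures(responses):
+    # entropy and Gini impurity of the empirical response distribution
+    counts = Counter(responses)
+    n = len(responses)
+    probs = [c / n for c in counts.values()]
+    entropy = -sum(p * math.log(p) for p in probs)
+    gini = 1.0 - sum(p * p for p in probs)
+    return entropy, gini
+
+# five sampled answers to the same prompt; high spread suggests likely failure
+answers = ["Paris", "Paris", "Lyon", "Paris", "Marseille"]
+print(diversity_measures(answers))
+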
+
+
+
+
+ + ☆ A three in one bottom-up framework for simultaneous semantic + segmentation, instance segmentation and classification of multi-organ nuclei + in digital cancer histology + + +
+ Simultaneous segmentation and classification of nuclei in digital histology play an essential role in computer-assisted cancer diagnosis; however, it remains challenging. The highest achieved binary and multi-class Panoptic Quality (PQ) remains as low as 0.68 bPQ and 0.49 mPQ, respectively. This is due to high staining variability, variability across the tissue, rough clinical conditions, overlapping nuclei, and nuclear class imbalance. Generic deep-learning methods usually rely on end-to-end models, which fail to explicitly address these problems associated with digital histology. In our previous work, DAN-NucNet, we resolved these issues for semantic segmentation with an end-to-end model. This work extends our previous model to simultaneous instance segmentation and classification. We introduce additional decoder heads with independent weighted losses, which produce semantic segmentation, edge proposals, and classification maps. We use the outputs from the three-head model to apply post-processing to produce the final segmentation and classification. Our multi-stage approach utilizes edge proposals and semantic segmentations compared to direct segmentation and classification strategies followed by most state-of-the-art methods. Due to this, we demonstrate a significant performance improvement in producing high-quality instance segmentation and nuclei classification. We have achieved a 0.841 Dice score for semantic segmentation, 0.713 bPQ scores for instance segmentation, and 0.633 mPQ for nuclei classification. Our proposed framework is generalized across 19 types of tissues. Furthermore, the framework is less complex compared to the state-of-the-art.
+
+
+
+
+
+
+ + ☆ A Preliminary Investigation into Search and Matching for Tumour + Discrimination in WHO Breast Taxonomy Using Deep Networks + + +
+ Breast cancer is one of the most common cancers affecting women worldwide. It comprises a group of malignant neoplasms with a variety of biological, clinical, and histopathological characteristics. There are more than 35 different histological forms of breast lesions that can be classified and diagnosed histologically according to cell morphology, growth, and architecture patterns. Recently, deep learning, in the field of artificial intelligence, has drawn a lot of attention for the computerized representation of medical images. Searchable digital atlases can provide pathologists with patch matching tools allowing them to search among evidently diagnosed and treated archival cases, a technology that may be regarded as a computational second opinion. In this study, we indexed and analyzed the WHO breast taxonomy (Classification of Tumours 5th Ed.) spanning 35 tumour types. We visualized all tumour types using deep features extracted from a state-of-the-art deep learning model, pre-trained on millions of diagnostic histopathology images from the TCGA repository. Furthermore, we test the concept of a digital "atlas" as a reference for search and matching with rare test cases. The patch similarity search within the WHO breast taxonomy data reached over 88% accuracy when validating through "majority vote" and more than 91% accuracy when validating using top-n tumour types. These results show for the first time that complex relationships among common and rare breast lesions can be investigated using an indexed digital archive.
+
+
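+ The search-and-matching step can be pictured as a nearest-neighbour majority vote over deep features, as in the sketch below; the feature dimensionality, archive, and labels are random placeholders rather than the indexed WHO taxonomy data.
+
+import numpy as np
+from collections import Counter
+
+def majority_vote_label(query_feat, archive_feats, archive_labels, k=5):
+    # return the most frequent label among the k nearest archive patches
+    feats = archive_feats / np.linalg.norm(archive_feats, axis=1, keepdims=True)
+    q = query_feat / np.linalg.norm(query_feat)
+    nearest = np.argsort(-(feats @ q))[:k]            # top-k by cosine similarity
+    return Counter(archive_labels[nearest]).most_common(1)[0][0]
+
+archive = np.random.randn(500, 1024)                  # indexed archive patch features
+labels = np.random.randint(0, 35, size=500)           # 35 tumour types
+print(majority_vote_label(np.random.randn(1024), archive, labels))
+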
+
+
+
+
+ + ☆ xxMD: Benchmarking Neural Force Fields Using Extended Dynamics beyond + Equilibrium + + +
+ Neural force fields (NFFs) have gained prominence in computational chemistry +as surrogate models, superseding quantum-chemistry calculations in ab initio +molecular dynamics. The prevalent benchmark for NFFs has been the MD17 dataset +and its subsequent extension. These datasets predominantly comprise geometries +from the equilibrium region of the ground electronic state potential energy +surface, sampling from direct adiabatic dynamics. However, many chemical +reactions entail significant molecular deformations, notably bond breaking. We +demonstrate the constrained distribution of internal coordinates and energies +in the MD17 datasets, underscoring their inadequacy for representing systems +undergoing chemical reactions. Addressing this sampling limitation, we +introduce the xxMD (Extended Excited-state Molecular Dynamics) dataset, derived +from non-adiabatic dynamics. This dataset encompasses energies and forces +ascertained from both multireference wave function theory and density +functional theory. Furthermore, its nuclear configuration spaces authentically +depict chemical reactions, making xxMD a more chemically relevant dataset. Our +re-assessment of equivariant models on the xxMD datasets reveals notably higher +mean absolute errors than those reported for MD17 and its variants. This +observation underscores the challenges faced in crafting a generalizable NFF +model with extrapolation capability. Our proposed xxMD-CASSCF and xxMD-DFT +datasets are available at \url{https://github.com/zpengmei/xxMD}. + +
+
+ comment: 19 pages, many figures. Data available at + \url{https://github.com/zpengmei/xxMD} +
+
+
+
+
+ + ☆ Mobility-Aware Computation Offloading for Swarm Robotics using Deep + Reinforcement Learning + + +
+ Swarm robotics is envisioned to automate a large number of dirty, dangerous, and dull tasks. Robots have limited energy, computation capability, and communication resources. Therefore, current swarm robotics systems include only a small number of robots, which can provide only limited spatio-temporal information. In this paper, we propose to leverage mobile edge computing to alleviate the computation burden. We develop an effective solution based on a mobility-aware deep reinforcement learning model at the edge server side for computation scheduling and resource allocation. Our results show that the proposed approach can meet delay requirements and guarantee computation precision by using minimum robot energy.
+
+
+
+
+
+
+ + ☆ Energy-Efficient On-Board Radio Resource Management for Satellite + Communications via Neuromorphic Computing + + +
+ The latest satellite communication (SatCom) missions are characterized by a +fully reconfigurable on-board software-defined payload, capable of adapting +radio resources to the temporal and spatial variations of the system traffic. +As pure optimization-based solutions have shown to be computationally tedious +and to lack flexibility, machine learning (ML)-based methods have emerged as +promising alternatives. We investigate the application of energy-efficient +brain-inspired ML models for on-board radio resource management. Apart from +software simulation, we report extensive experimental results leveraging the +recently released Intel Loihi 2 chip. To benchmark the performance of the +proposed model, we implement conventional convolutional neural networks (CNN) +on a Xilinx Versal VCK5000, and provide a detailed comparison of accuracy, +precision, recall, and energy efficiency for different traffic demands. Most +notably, for relevant workloads, spiking neural networks (SNNs) implemented on +Loihi 2 yield higher accuracy, while reducing power consumption by more than +100$\times$ as compared to the CNN-based reference platform. Our findings point +to the significant potential of neuromorphic computing and SNNs in supporting +on-board SatCom operations, paving the way for enhanced efficiency and +sustainability in future SatCom systems. + +
+
+ comment: currently under review at IEEE Transactions on Machine Learning in + Communications and Networking +
+
+
+
+
+ + ☆ LLaMA-Reviewer: Advancing Code Review Automation with Large Language + Models through Parameter-Efficient Fine-Tuning (Practical Experience Report) + + +
+ The automation of code review activities, a long-standing pursuit in software +engineering, has been primarily addressed by numerous domain-specific +pre-trained models. Despite their success, these models frequently demand +extensive resources for pre-training from scratch. In contrast, Large Language +Models (LLMs) provide an intriguing alternative, given their remarkable +capabilities when supplemented with domain-specific knowledge. However, their +potential for automating code review tasks remains largely unexplored. + In response to this research gap, we present LLaMA-Reviewer, an innovative +framework that leverages the capabilities of LLaMA, a popular LLM, in the realm +of code review. Mindful of resource constraints, this framework employs +parameter-efficient fine-tuning (PEFT) methods, delivering high performance +while using less than 1% of trainable parameters. + An extensive evaluation of LLaMA-Reviewer is conducted on two diverse, +publicly available datasets. Notably, even with the smallest LLaMA base model +consisting of 6.7B parameters and a limited number of tuning epochs, +LLaMA-Reviewer equals the performance of existing code-review-focused models. + The ablation experiments provide insights into the influence of various +fine-tuning process components, including input representation, instruction +tuning, and different PEFT methods. To foster continuous progress in this +field, the code and all PEFT-weight plugins have been made open-source. + +
+
+ comment: Accepted to the 34th IEEE International Symposium on Software + Reliability Engineering (ISSRE 2023) +
+
+
+
+
+ + ☆ Exploring Unsupervised Cell Recognition with Prior Self-activation Maps MICCAI 2023 + + +
+ The success of supervised deep learning models on cell recognition tasks +relies on detailed annotations. Many previous works have managed to reduce the +dependency on labels. However, considering the large number of cells contained +in a patch, costly and inefficient labeling is still inevitable. To this end, +we explored label-free methods for cell recognition. Prior self-activation maps +(PSM) are proposed to generate pseudo masks as training targets. To be +specific, an activation network is trained with self-supervised learning. The +gradient information in the shallow layers of the network is aggregated to +generate prior self-activation maps. A semantic clustering module is +then introduced to transform PSMs into pixel-level semantic pseudo +masks for downstream tasks. We evaluated our method on two histological +datasets: MoNuSeg (cell segmentation) and BCData (multi-class cell detection). +Compared with other fully-supervised and weakly-supervised methods, our method +can achieve competitive performance without any manual annotations. Our simple +but effective framework can also achieve multi-class cell detection, which cannot +be done by existing unsupervised methods. The results show the potential of +PSMs, which might inspire other research to deal with the hunger for labels in the +medical domain. + +
+
+ comment: MICCAI 2023. arXiv admin note: substantial text overlap with + arXiv:2210.07862 +
+
+
+
+
+ + ☆ Graph Encoding and Neural Network Approaches for Volleyball Analytics: + From Game Outcome to Individual Play Predictions KDD 2023 + + +
+ This research aims to improve the accuracy of complex volleyball predictions +and provide more meaningful insights to coaches and players. We introduce a +specialized graph encoding technique to add additional contact-by-contact +volleyball context to an already available volleyball dataset without any +additional data gathering. We demonstrate the potential benefits of using graph +neural networks (GNNs) on this enriched dataset for three different volleyball +prediction tasks: rally outcome prediction, set location prediction, and hit +type prediction. We compare the performance of our graph-based models to +baseline models and analyze the results to better understand the underlying +relationships in a volleyball rally. Our results show that the use of GNNs with +our graph encoding yields a much more advanced analysis of the data, which +noticeably improves prediction results overall. We also show that these +baseline tasks can be significantly improved with simple adjustments, such as +removing blocked hits. Lastly, we demonstrate the importance of choosing a +model architecture that will better extract the important information for a +certain task. Overall, our study showcases the potential strengths and +weaknesses of using graph encodings in sports data analytics and will hopefully +inspire future improvements in machine learning strategies across sports and +applications by using graph-based encodings. + +
+
+ comment: This paper is an extended version of the one accepted at the KDD 2023 + Workshop on Data Science and AI for Sports (DSAI4Sports), entitled + 'RallyGraph: Specialized Graph Encoding for Enhanced Volleyball' +
+
+
+
+
+ + ☆ Towards Validating Long-Term User Feedbacks in Interactive + Recommendation Systems SIGIR'22 + + +
+ Interactive Recommender Systems (IRSs) have attracted a lot of attention, due +to their ability to model interactive processes between users and recommender +systems. Numerous approaches have adopted Reinforcement Learning (RL) +algorithms, as these can directly maximize users' cumulative rewards. In IRS, +researchers commonly utilize publicly available review datasets to compare and +evaluate algorithms. However, user feedback provided in public datasets merely +includes instant responses (e.g., a rating), with no inclusion of delayed +responses (e.g., the dwell time and the lifetime value). Thus, the question +remains whether these review datasets are an appropriate choice to evaluate the +long-term effects of the IRS. In this work, we revisited experiments on IRS +with review datasets and compared RL-based models with a simple reward model +that greedily recommends the item with the highest one-step reward. Following +extensive analysis, we reveal three main findings: First, a simple greedy +reward model consistently outperforms RL-based models in maximizing cumulative +rewards. Second, applying higher weighting to long-term rewards leads to a +degradation of recommendation performance. Third, user feedback has only marginal +long-term effects on the benchmark datasets. Based on our findings, we conclude +that a dataset has to be carefully verified and that a simple greedy baseline +should be included for a proper evaluation of RL-based IRS approaches. + +
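For intuition, the greedy baseline the authors compare against can be written in a few lines: always recommend the item with the highest predicted one-step reward, with no long-horizon planning. The sketch below is a toy, hypothetical version (the reward model is random numbers), not the authors' implementation.

```python
import numpy as np

rng = np.random.default_rng(1)
n_items = 100
predicted_reward = rng.random(n_items)        # stand-in for any one-step reward model

def greedy_recommend(already_shown):
    scores = predicted_reward.copy()
    scores[list(already_shown)] = -np.inf     # do not repeat items
    return int(np.argmax(scores))

shown = set()
for _ in range(10):                            # build a session of 10 recommendations
    shown.add(greedy_recommend(shown))
print("greedy slate:", sorted(shown))
```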
+
+ comment: Accepted to SIGIR'22 +
+
+
+
+
+ + ☆ Transformers for Capturing Multi-level Graph Structure using + Hierarchical Distances + + +
+ Graph transformers need strong inductive biases to derive meaningful +attention scores. Yet, current proposals rarely address methods capturing +longer ranges, hierarchical structures, or community structures, as they appear +in various graphs such as molecules, social networks, and citation networks. In +this paper, we propose a hierarchy-distance structural encoding (HDSE), which +models a hierarchical distance between the nodes in a graph focusing on its +multi-level, hierarchical nature. In particular, this yields a framework which +can be flexibly integrated with existing graph transformers, allowing for +simultaneous application with other positional representations. Through +extensive experiments on 12 real-world datasets, we demonstrate that our HDSE +method successfully enhances various types of baseline transformers, achieving +state-of-the-art empirical performances on 10 benchmark datasets. + +
+
+
+
+
+ + ☆ How Expressive are Graph Neural Networks in Recommendation? CIKM + + +
+ Graph Neural Networks (GNNs) have demonstrated superior performance on +various graph learning tasks, including recommendation, where they leverage +user-item collaborative filtering signals in graphs. However, theoretical +formulations of their capability are scarce, despite their empirical +effectiveness in state-of-the-art recommender models. Recently, research has +explored the expressiveness of GNNs in general, demonstrating that message +passing GNNs are at most as powerful as the Weisfeiler-Lehman test, and that +GNNs combined with random node initialization are universal. Nevertheless, the +concept of "expressiveness" for GNNs remains vaguely defined. Most existing +works adopt the graph isomorphism test as the metric of expressiveness, but +this graph-level task may not effectively assess a model's ability in +recommendation, where the objective is to distinguish nodes of different +closeness. In this paper, we provide a comprehensive theoretical analysis of +the expressiveness of GNNs in recommendation, considering three levels of +expressiveness metrics: graph isomorphism (graph-level), node automorphism +(node-level), and topological closeness (link-level). We propose the +topological closeness metric to evaluate GNNs' ability to capture the +structural distance between nodes, which aligns closely with the objective of +recommendation. To validate the effectiveness of this new metric in evaluating +recommendation performance, we introduce a learning-less GNN algorithm that is +optimal on the new metric and can be optimal on the node-level metric with +suitable modification. We conduct extensive experiments comparing the proposed +algorithm against various types of state-of-the-art GNN models to explore the +explainability of the new metric in the recommendation task. For +reproducibility, implementation codes are available at +https://github.com/HKUDS/GTE. + +
+
+ comment: 32nd ACM International Conference on Information and Knowledge + Management (CIKM) 2023 +
+
+
+
+
+ + ☆ Random Word Data Augmentation with CLIP for Zero-Shot Anomaly Detection BMVC2023 + + +
+ This paper presents a novel method that leverages a visual-language model, +CLIP, as a data source for zero-shot anomaly detection. Tremendous efforts have +been put towards developing anomaly detectors due to their potential industrial +applications. Considering the difficulty in acquiring various anomalous samples +for training, most existing methods train models with only normal samples and +measure discrepancies from the distribution of normal samples during inference, +which requires training a model for each object category. The problem of this +inefficient training requirement has been tackled by designing a CLIP-based +anomaly detector that applies prompt-guided classification to each part of an +image in a sliding window manner. However, the method still suffers from the +labor of careful prompt ensembling with known object categories. To overcome +the issues above, we propose leveraging CLIP as a data source for training. Our +method generates text embeddings with the text encoder in CLIP using typical +prompts that include the words "normal" and "anomaly". In addition to these words, +we insert several randomly generated words into the prompts, which enables the +encoder to generate a diverse set of normal and anomalous samples. Using the +generated embeddings as training data, a feed-forward neural network learns to +extract features of normal and anomalous samples from CLIP's embeddings, and as a result, +a category-agnostic anomaly detector can be obtained without any training +images. Experimental results demonstrate that our method achieves +state-of-the-art performance without laborious prompt ensembling in zero-shot +setups. + +
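The mechanism can be sketched with OpenAI's clip package: build prompts containing the words "normal" or "anomaly" plus random filler words, encode them with CLIP's text encoder, and use the resulting embeddings as synthetic training data for a small feed-forward scorer. The prompt template and the number of random words below are assumptions, not the paper's exact recipe.

```python
import random, string
import torch
import clip  # https://github.com/openai/CLIP

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)

def random_word(n=6):
    return "".join(random.choices(string.ascii_lowercase, k=n))

def make_prompts(label, k=256):
    # Assumed template: the paper's actual wording may differ.
    return [f"a photo of a {label} {random_word()} {random_word()}" for _ in range(k)]

with torch.no_grad():
    normal_emb = model.encode_text(clip.tokenize(make_prompts("normal")).to(device))
    anomaly_emb = model.encode_text(clip.tokenize(make_prompts("anomaly")).to(device))

# Synthetic training set for a small feed-forward anomaly scorer (no real images needed).
X = torch.cat([normal_emb, anomaly_emb]).float()
y = torch.cat([torch.zeros(len(normal_emb)), torch.ones(len(anomaly_emb))])
print(X.shape, y.shape)
```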
+
+ comment: Accepted to BMVC2023 +
+
+
+
+
+ + ☆ Development of a Novel Quantum Pre-processing Filter to Improve Image + Classification Accuracy of Neural Network Models + + +
+ This paper proposes a novel quantum pre-processing filter (QPF) to improve +the image classification accuracy of neural network (NN) models. A simple four-qubit +quantum circuit that uses Y rotation gates for encoding and two +controlled NOT gates for creating correlation among the qubits is applied as a +feature extraction filter prior to passing data into the fully connected NN +architecture. By applying the QPF approach, the results show that the image +classification accuracy based on the MNIST (10 classes of handwritten digits) and the +EMNIST (47 classes of handwritten digits and letters) datasets can be improved, from +92.5% to 95.4% and from 68.9% to 75.9%, respectively. These improvements were +obtained without introducing extra model parameters or optimizations in the +machine learning process. However, tests performed on the developed QPF +approach against the relatively complex GTSRB dataset, with 43 distinct classes of +real-life traffic sign images, showed a degradation in the classification +accuracy. Considering this result, further research into the design of quantum +circuits better suited to image classification neural networks could build on +the baseline method proposed in this paper. + +
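A minimal PennyLane sketch of such a filter is shown below: four qubits, Y-rotation encoding of a 2x2 patch of pixel values, and two CNOT gates, followed by Pauli-Z expectation values that feed the downstream fully connected network. The exact wiring of the two CNOTs and the feature scaling are assumptions, since the abstract does not specify them.

```python
import pennylane as qml
from pennylane import numpy as np

dev = qml.device("default.qubit", wires=4)

@qml.qnode(dev)
def qpf(features):
    """Four-qubit filter: RY encoding plus two CNOTs to correlate the qubits.
    The specific CNOT wiring here is an assumption made for illustration."""
    for w in range(4):
        qml.RY(features[w], wires=w)
    qml.CNOT(wires=[0, 1])
    qml.CNOT(wires=[2, 3])
    return [qml.expval(qml.PauliZ(w)) for w in range(4)]

patch = np.array([0.1, 0.5, 0.9, 0.3]) * np.pi   # a 2x2 image patch scaled to rotation angles
print(qpf(patch))                                # 4 filtered values fed into the downstream NN
```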
+
+ comment: 13 pages, 10 figures +
+
+
+
+
+ + ☆ CAME: Contrastive Automated Model Evaluation ICCV2023 + + +
+ The Automated Model Evaluation (AutoEval) framework entertains the +possibility of evaluating a trained machine learning model without resorting to +a labeled testing set. Despite the promise and some decent results, the +existing AutoEval methods heavily rely on computing distribution shifts between +the unlabelled testing set and the training set. We believe this reliance on +the training set becomes another obstacle in shipping this technology to +real-world ML development. In this work, we propose Contrastive Automatic Model +Evaluation (CAME), a novel AutoEval framework that removes the training set +from the loop. The core idea of CAME is based on a theoretical analysis that +ties model performance to a contrastive loss. Further, with extensive +empirical validation, we manage to set up a predictable relationship between +the two, simply by evaluating on the unlabeled/unseen testing set. The resulting +framework, CAME, establishes new state-of-the-art results for AutoEval, surpassing prior +work significantly. + +
+
+ comment: ICCV2023 main conference +
+
+
+
+
+ + ☆ Anonymity at Risk? Assessing Re-Identification Capabilities of Large + Language Models + + +
+ Anonymity of both natural and legal persons in court rulings is a critical +aspect of privacy protection in the European Union and Switzerland. With the +advent of LLMs, concerns about large-scale re-identification of anonymized +persons are growing. In accordance with the Federal Supreme Court of +Switzerland, we explore the potential of LLMs to re-identify individuals in +court rulings by constructing a proof-of-concept using actual legal data from +the Swiss federal supreme court. Following the initial experiment, we +constructed an anonymized Wikipedia dataset as a more rigorous testing ground +to further investigate the findings. With the introduction and application of +the new task of re-identifying people in texts, we also introduce new metrics +to measure performance. We systematically analyze the factors that influence +successful re-identifications, identifying model size, input length, and +instruction tuning among the most critical determinants. Despite high +re-identification rates on Wikipedia, even the best LLMs struggled with court +decisions. The complexity is attributed to the lack of test datasets, the +necessity for substantial training resources, and data sparsity in the +information used for re-identification. In conclusion, this study demonstrates +that re-identification using LLMs may not be feasible for now, but as the +proof-of-concept on Wikipedia showed, it might become possible in the future. +We hope that our system can help enhance the confidence in the security of +anonymized decisions, thus leading to the courts being more confident to +publish decisions. + +
+
+
+
+
+ + ☆ Explicability and Inexplicability in the Interpretation of Quantum + Neural Networks + + +
+ Interpretability of artificial intelligence (AI) methods, particularly deep +neural networks, is of great interest due to the widespread use of AI-backed +systems, which often have unexplainable behavior. The interpretability of such +models is a crucial component of building trusted systems. Many methods exist +to approach this problem, but they do not obviously generalize to the quantum +setting. Here we explore the interpretability of quantum neural networks using +local model-agnostic interpretability measures of quantum and classical neural +networks. We introduce the concept of the band of inexplicability, representing +the interpretable region in which data samples have no explanation, likely +victims of inherently random quantum measurements. We see this as a step toward +understanding how to build responsible and accountable quantum AI models. + +
+
+
+
+
+ + ☆ Video OWL-ViT: Temporally-consistent open-world localization in video ICCV 2023 + + +
+ We present an architecture and a training recipe that adapts pre-trained +open-world image models to localization in videos. Understanding the open +visual world (without being constrained by fixed label spaces) is crucial for +many real-world vision tasks. Contrastive pre-training on large image-text +datasets has recently led to significant improvements for image-level tasks. +For more structured tasks involving object localization applying pre-trained +models is more challenging. This is particularly true for video tasks, where +task-specific data is limited. We show successful transfer of open-world models +by building on the OWL-ViT open-vocabulary detection model and adapting it to +video by adding a transformer decoder. The decoder propagates object +representations recurrently through time by using the output tokens for one +frame as the object queries for the next. Our model is end-to-end trainable on +video data and enjoys improved temporal consistency compared to +tracking-by-detection baselines, while retaining the open-world capabilities of +the backbone detector. We evaluate our model on the challenging TAO-OW +benchmark and demonstrate that open-world capabilities, learned from +large-scale image-text pre-training, can be transferred successfully to +open-world localization across diverse videos. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Addressing Fairness and Explainability in Image Classification Using + Optimal Transport + + +
+ Algorithmic Fairness and the explainability of potentially unfair outcomes +are crucial for establishing trust and accountability of Artificial +Intelligence systems in domains such as healthcare and policing. Though +significant advances have been made in each of the fields separately, achieving +explainability in fairness applications remains challenging, particularly so in +domains where deep neural networks are used. At the same time, ethical +data-mining has become ever more relevant, as it has been shown countless times +that fairness-unaware algorithms result in biased outcomes. Current approaches +focus on mitigating biases in the outcomes of the model, but few attempts have +been made to try to explain \emph{why} a model is biased. To bridge this gap, +we propose a comprehensive approach that leverages optimal transport theory to +uncover the causes and implications of biased regions in images, which easily +extends to tabular data as well. Through the use of Wasserstein barycenters, we +obtain scores that are independent of a sensitive variable but keep their +marginal orderings. This step ensures predictive accuracy but also helps us to +recover the regions most associated with the generation of the biases. Our +findings hold significant implications for the development of trustworthy and +unbiased AI systems, fostering transparency, accountability, and fairness in +critical decision-making scenarios across diverse domains. + +
+
+
+
+
+ + ☆ Characterizing normal perinatal development of the human brain + structural connectivity + + +
+ Early brain development is characterized by the formation of a highly +organized structural connectome. The interconnected nature of this connectome +underlies the brain's cognitive abilities and influences its response to +diseases and environmental factors. Hence, quantitative assessment of +structural connectivity in the perinatal stage is useful for studying normal +and abnormal neurodevelopment. However, estimation of the connectome from +diffusion MRI data involves complex computations. For the perinatal period, +these computations are further challenged by the rapid brain development and +imaging difficulties. Combined with high inter-subject variability, these +factors make it difficult to chart the normal development of the structural +connectome. As a result, there is a lack of reliable normative baselines of +structural connectivity metrics at this critical stage in brain development. In +this study, we developed a computational framework, based on spatio-temporal +averaging, for determining such baselines. We used this framework to analyze +the structural connectivity between 33 and 44 postmenstrual weeks using data +from 166 subjects. Our results unveiled clear and strong trends in the +development of structural connectivity in perinatal stage. Connection weighting +based on fractional anisotropy and neurite density produced the most consistent +results. We observed increases in global and local efficiency, a decrease in +characteristic path length, and widespread strengthening of the connections +within and across brain lobes and hemispheres. We also observed asymmetry +patterns that were consistent between different connection weighting +approaches. The new computational method and results are useful for assessing +normal and abnormal development of the structural connectome early in life. + +
+
+
+
+
+ + ☆ Performance Comparison and Implementation of Bayesian Variants for + Network Intrusion Detection + + +
+ Bayesian classifiers perform well when each of the features is completely +independent of the others, which is not always valid in real-world applications. +The aim of this study is to implement and compare the performances of each +variant of the Bayesian classifier (Multinomial, Bernoulli, and Gaussian) on +anomaly detection in network intrusion, and to investigate whether there is any +association between each variant's assumption and its performance. Our +investigation showed that each Bayesian variant blindly follows +its assumption regardless of the feature properties, and that the assumption is the +single most important factor that influences accuracy. Experimental +results show that Bernoulli has a test accuracy of 69.9% (71% train), Multinomial +has a test accuracy of 31.2% (31.2% train), while Gaussian has a test accuracy of 81.69% +(82.84% train). Going deeper, we found that each Naive +Bayes variant's performance and accuracy are largely determined by its +assumption: the Gaussian classifier performed best on anomaly detection due to its +assumption that features follow normal distributions, which are continuous, +while the Multinomial classifier had a dismal performance as it simply assumes a +discrete, multinomial distribution. + +
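The comparison itself is straightforward to reproduce in scikit-learn; the sketch below runs the three variants on a synthetic stand-in for an intrusion-detection feature matrix (the actual dataset and preprocessing used in the study are not specified in the abstract).

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score

# Synthetic stand-in for a network-intrusion feature matrix (continuous features).
X, y = make_classification(n_samples=5000, n_features=20, random_state=0)
X = X - X.min()                         # MultinomialNB requires non-negative features
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

for clf in (GaussianNB(), MultinomialNB(), BernoulliNB()):
    clf.fit(X_tr, y_tr)
    acc = accuracy_score(y_te, clf.predict(X_te))
    print(f"{clf.__class__.__name__:14s} test accuracy: {acc:.3f}")
# As the study argues, Gaussian NB tends to fit continuous features best,
# while Multinomial/Bernoulli NB implicitly assume count or binary features.
```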
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Exploring the Effectiveness of GPT Models in Test-Taking: A Case Study + of the Driver's License Knowledge Test + + +
+ Large language models such as OpenAI's Generative Pre-trained Transformer +(GPT) models are proficient at answering questions, but their knowledge is +confined to the information present in their training data. This limitation +renders them ineffective when confronted with questions about recent +developments or non-public documents. Our research proposes a method that +enables GPT models to answer questions by employing context from an information +source not previously included in their training data. The methodology includes +preprocessing of contextual information, the embedding of contexts and queries, +constructing prompts through the integration of context embeddings, and +generating answers using GPT models. We applied this method in a controlled +test scenario using the California Driver's Handbook as the information source. +The GPT-3 model achieved a 96% passing score on a set of 50 sample driving +knowledge test questions. In contrast, without context, the model's passing +score fell to 82%. However, the model still fails to answer some questions +correctly even when provided with a library of context, highlighting room for +improvement. The research also examined the impact of prompt length and context +format on the model's performance. Overall, the study provides insights into +the limitations and potential improvements for GPT models in question-answering +tasks. + +
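The described pipeline is essentially retrieval-augmented prompting. The sketch below illustrates it with a generic sentence-transformers embedder and a cosine-similarity lookup; the embedding model, chunking, and prompt template are assumptions, and the final call to a GPT completion API is left as a comment.

```python
import numpy as np
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")   # stand-in embedding model

handbook_chunks = [
    "Always yield to pedestrians in a crosswalk.",
    "The speed limit in a residential area is 25 mph unless otherwise posted.",
    # ... preprocessed passages from the information source would go here
]
chunk_emb = embedder.encode(handbook_chunks, normalize_embeddings=True)

def build_prompt(question, k=2):
    q_emb = embedder.encode([question], normalize_embeddings=True)[0]
    top = np.argsort(chunk_emb @ q_emb)[::-1][:k]     # cosine-similarity ranking
    context = "\n".join(handbook_chunks[i] for i in top)
    return f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"

prompt = build_prompt("What is the residential speed limit?")
print(prompt)
# The prompt would then be sent to a GPT model via the provider's completion API.
```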
+
+
+
+
+ + ☆ Accel-GCN: High-Performance GPU Accelerator Design for Graph Convolution + Networks + + +
+ Graph Convolutional Networks (GCNs) are pivotal in extracting latent +information from graph data across various domains, yet their acceleration on +mainstream GPUs is challenged by workload imbalance and memory access +irregularity. To address these challenges, we present Accel-GCN, a GPU +accelerator architecture for GCNs. The design of Accel-GCN encompasses: (i) a +lightweight degree sorting stage to group nodes with similar degrees; (ii) a +block-level partition strategy that dynamically adjusts warp workload sizes, +enhancing shared memory locality and workload balance, and reducing metadata +overhead compared to designs like GNNAdvisor; (iii) a combined warp strategy +that improves memory coalescing and computational parallelism in the column +dimension of dense matrices. + Utilizing these principles, we formulated a kernel for sparse matrix +multiplication (SpMM) in GCNs that employs block-level partitioning and the +combined warp strategy. This approach augments performance and multi-level +memory efficiency and optimizes memory bandwidth by exploiting memory +coalescing and alignment. Evaluation of Accel-GCN across 18 benchmark graphs +reveals that it outperforms cuSPARSE, GNNAdvisor, and graph-BLAST by factors of +1.17, 1.86, and 2.94, respectively. The results underscore +Accel-GCN as an effective solution for enhancing GCN computational efficiency. + +
+
+ comment: ICCAD 2023 accepted publication +
+
+
+
+
+ + ☆ PatchBackdoor: Backdoor Attack against Deep Neural Networks without + Model Modification ACM MM 2023 + + +
+ A backdoor attack is a major threat to deep learning systems in safety-critical +scenarios; it aims to trigger misbehavior of neural network models under +attacker-controlled conditions. However, most backdoor attacks have to modify +the neural network models through training with poisoned data and/or direct +model editing, which leads to a common but false belief that backdoor attacks +can be easily avoided by properly protecting the model. In this paper, we show +that backdoor attacks can be achieved without any model modification. Instead +of injecting backdoor logic into the training data or the model, we propose to +place a carefully-designed patch (namely a backdoor patch) in front of the +camera, which is fed into the model together with the input images. The patch +can be trained to behave normally most of the time, while producing wrong +predictions when the input image contains an attacker-controlled trigger object. +Our main techniques include an effective training method to generate the +backdoor patch and a digital-physical transformation modeling method to enhance +the feasibility of the patch in real deployments. Extensive experiments show +that PatchBackdoor can be applied to common deep learning models (VGG, +MobileNet, ResNet) with an attack success rate of 93% to 99% on classification +tasks. Moreover, we implement PatchBackdoor in real-world scenarios and show +that the attack remains a threat. + +
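The core idea, optimizing a physical patch while leaving the model untouched, can be sketched as below with a frozen torchvision classifier. This is a simplified, hypothetical training step: the paper's actual patch generation method and its digital-physical transformation modeling are not reproduced here.

```python
import torch
import torch.nn.functional as F
import torchvision.models as models

model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1).eval()
for p in model.parameters():
    p.requires_grad_(False)                      # the model itself is never modified

patch = torch.zeros(1, 3, 50, 50, requires_grad=True)   # trainable backdoor patch
opt = torch.optim.Adam([patch], lr=0.01)
target_class = 0                                 # attacker-chosen label (an assumption)

def apply_patch(images):
    out = images.clone()
    out[:, :, :50, :50] = torch.tanh(patch)      # paste the patch into a fixed corner
    return out

# One illustrative optimization step; real training would loop over a dataset and
# also model the digital-to-physical transformation, which is omitted here.
clean = torch.rand(8, 3, 224, 224)               # stand-in batch without the trigger
triggered = torch.rand(8, 3, 224, 224)           # stand-in batch containing the trigger object
labels = torch.randint(0, 1000, (8,))

loss_normal = F.cross_entropy(model(apply_patch(clean)), labels)              # behave normally
loss_attack = F.cross_entropy(model(apply_patch(triggered)),
                              torch.full((8,), target_class))                 # misbehave on trigger
(loss_normal + loss_attack).backward()
opt.step()
```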
+
+ comment: accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Mitigating Health Disparity on Biased Electronic Health Records via + Deconfounder + + +
+ The fairness issue of clinical data modeling, especially on Electronic Health +Records (EHRs), is of utmost importance due to EHR's complex latent structure +and potential selection bias. It is frequently necessary to mitigate health +disparity while keeping the model's overall accuracy in practice. However, +traditional methods often encounter the trade-off between accuracy and +fairness, as they fail to capture the underlying factors beyond observed data. +To tackle this challenge, we propose a novel model called Fair Longitudinal +Medical Deconfounder (FLMD) that aims to achieve both fairness and accuracy in +longitudinal Electronic Health Records (EHR) modeling. Drawing inspiration from +the deconfounder theory, FLMD employs a two-stage training process. In the +first stage, FLMD captures unobserved confounders for each encounter, which +effectively represents underlying medical factors beyond observed EHR, such as +patient genotypes and lifestyle habits. This unobserved confounder is crucial +for addressing the accuracy/fairness dilemma. In the second stage, FLMD +combines the learned latent representation with other relevant features to make +predictions. By incorporating appropriate fairness criteria, such as +counterfactual fairness, FLMD ensures that it maintains high prediction +accuracy while simultaneously minimizing health disparities. We conducted +comprehensive experiments on two real-world EHR datasets to demonstrate the +effectiveness of FLMD. Apart from the comparison of baseline methods and FLMD +variants in terms of fairness and accuracy, we assessed the performance of all +models on disturbed/imbalanced and synthetic datasets to showcase the +superiority of FLMD across different settings and provide valuable insights +into its capabilities. + +
+
+
+
+
+ + ☆ Incorporating Nonlocal Traffic Flow Model in Physics-informed Neural + Networks + + +
+ This research contributes to the advancement of traffic state estimation +methods by leveraging the benefits of the nonlocal LWR model within a +physics-informed deep learning framework. The classical LWR model, while +useful, falls short of accurately representing real-world traffic flows. The +nonlocal LWR model addresses this limitation by considering the speed as a +weighted mean of the downstream traffic density. In this paper, we propose a +novel PIDL framework that incorporates the nonlocal LWR model. We introduce +both fixed-length and variable-length kernels and develop the required +mathematics. The proposed PIDL framework undergoes a comprehensive evaluation, +including various convolutional kernels and look-ahead windows, using data from +the NGSIM and CitySim datasets. The results demonstrate improvements over the +baseline PIDL approach using the local LWR model. The findings highlight the +potential of the proposed approach to enhance the accuracy and reliability of +traffic state estimation, enabling more effective traffic management +strategies. + +
+
+
+
+
+ + ☆ Evaluation of Deep Neural Operator Models toward Ocean Forecasting + + +
+ Data-driven, deep-learning modeling frameworks have been recently developed +for forecasting time series data. Such machine learning models may be useful in +multiple domains including the atmospheric and oceanic ones, and in general, +the larger fluids community. The present work investigates the possible +effectiveness of such deep neural operator models for reproducing and +predicting classic fluid flows and simulations of realistic ocean dynamics. We +first briefly evaluate the capabilities of such deep neural operator models +when trained on a simulated two-dimensional fluid flow past a cylinder. We then +investigate their application to forecasting ocean surface circulation in the +Middle Atlantic Bight and Massachusetts Bay, learning from high-resolution +data-assimilative simulations employed for real sea experiments. We confirm +that trained deep neural operator models are capable of predicting idealized +periodic eddy shedding. For realistic ocean surface flows and our preliminary +study, they can predict several of the features and show some skill, providing +potential for future research and applications. + +
+
+ comment: Rajagopal, E., A.N.S. Babu, T. Ryu, P.J. Haley, Jr., C. Mirabito, and + P.F.J. Lermusiaux, 2023. Evaluation of Deep Neural Operator Models toward + Ocean Forecasting. In OCEANS' 23 IEEE/MTS Gulf Coast, 25-28 September 2023, + in press +
+
+
+
+
+ + ☆ Ceci n'est pas une pomme: Adversarial Illusions in Multi-Modal + Embeddings + + +
+ Multi-modal encoders map images, sounds, texts, videos, etc. into a single +embedding space, aligning representations across modalities (e.g., associate an +image of a dog with a barking sound). We show that multi-modal embeddings can +be vulnerable to an attack we call "adversarial illusions." Given an input in +any modality, an adversary can perturb it so as to make its embedding close to +that of an arbitrary, adversary-chosen input in another modality. Illusions +thus enable the adversary to align any image with any text, any text with any +sound, etc. + Adversarial illusions exploit proximity in the embedding space and are thus +agnostic to downstream tasks. Using ImageBind embeddings, we demonstrate how +adversarially aligned inputs, generated without knowledge of specific +downstream tasks, mislead image generation, text generation, and zero-shot +classification. + +
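The attack reduces to optimizing a bounded perturbation so that the input's embedding approaches an attacker-chosen embedding from another modality. Below is a generic PGD-style sketch against an arbitrary image encoder; the step size and budget are assumptions, and the specifics of attacking ImageBind are omitted.

```python
import torch
import torch.nn.functional as F

def adversarial_illusion(image, target_embedding, encoder, steps=100, eps=8/255, alpha=1/255):
    """Perturb `image` so that its embedding moves toward `target_embedding`
    (an embedding of an arbitrary input from another modality).
    `encoder` can be any differentiable image-to-embedding model; this is a
    generic PGD sketch, not the paper's exact attack."""
    delta = torch.zeros_like(image, requires_grad=True)
    for _ in range(steps):
        emb = encoder(image + delta)
        loss = 1 - F.cosine_similarity(emb, target_embedding, dim=-1).mean()
        loss.backward()
        with torch.no_grad():
            delta -= alpha * delta.grad.sign()   # descend on the embedding distance
            delta.clamp_(-eps, eps)              # keep the perturbation imperceptible
            delta.grad.zero_()
    return (image + delta).detach()
```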
+
+
+
+
+ + ☆ Variational Density Propagation Continual Learning + + +
+ Deep Neural Networks (DNNs) deployed to the real world are regularly subject +to out-of-distribution (OoD) data, various types of noise, and shifting +conceptual objectives. This paper proposes a framework for adapting to data +distribution drift modeled by benchmark Continual Learning datasets. We develop +and evaluate a method of Continual Learning that leverages uncertainty +quantification from Bayesian Inference to mitigate catastrophic forgetting. We +expand on previous approaches by removing the need for Monte Carlo sampling of +the model weights to sample the predictive distribution. We optimize a +closed-form Evidence Lower Bound (ELBO) objective approximating the predictive +distribution by propagating the first two moments of a distribution, i.e. mean +and covariance, through all network layers. Catastrophic forgetting is +mitigated by using the closed-form ELBO to approximate the Minimum Description +Length (MDL) Principle, inherently penalizing changes in the model likelihood +by minimizing the KL Divergence between the variational posterior for the +current task and the previous task's variational posterior acting as the prior. +Leveraging the approximation of the MDL principle, we aim to initially learn a +sparse variational posterior and then minimize additional model complexity +learned for subsequent tasks. Our approach is evaluated for the task +incremental learning scenario using density propagated versions of +fully-connected and convolutional neural networks across multiple sequential +benchmark datasets with varying task sequence lengths. Ultimately, this +procedure produces a minimally complex network over a series of tasks +mitigating catastrophic forgetting. + +
+
+ comment: 6 pages, 13th Int'l Symposium on Image and Signal Processing and + Analysis +
+
+
+
+
+ + ☆ Complex-valued neural networks for voice anti-spoofing + + +
+ Current anti-spoofing and audio deepfake detection systems use either +magnitude spectrogram-based features (such as CQT or Melspectrograms) or raw +audio processed through convolution or sinc-layers. Both methods have +drawbacks: magnitude spectrograms discard phase information, which affects +audio naturalness, and raw-feature-based models cannot use traditional +explainable AI methods. This paper proposes a new approach that combines the +benefits of both methods by using complex-valued neural networks to process the +complex-valued, CQT frequency-domain representation of the input audio. This +method retains phase information and allows for explainable AI methods. Results +show that this approach outperforms previous methods on the "In-the-Wild" +anti-spoofing dataset and enables interpretation of the results through +explainable AI. Ablation studies confirm that the model has learned to use +phase information to detect voice spoofing. + +
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ☆ Karasu: A Collaborative Approach to Efficient Cluster Configuration for + Big Data Analytics + + +
+ Selecting the right resources for big data analytics jobs is hard because of +the wide variety of configuration options like machine type and cluster size. +As poor choices can have a significant impact on resource efficiency, cost, and +energy usage, automated approaches are gaining popularity. Most existing +methods rely on profiling recurring workloads to find near-optimal solutions +over time. Due to the cold-start problem, this often leads to lengthy and +costly profiling phases. However, big data analytics jobs across users can +share many common properties: they often operate on similar infrastructure, +using similar algorithms implemented in similar frameworks. The potential in +sharing aggregated profiling runs to collaboratively address the cold start +problem is largely unexplored. + We present Karasu, an approach to more efficient resource configuration +profiling that promotes data sharing among users working with similar +infrastructures, frameworks, algorithms, or datasets. Karasu trains lightweight +performance models using aggregated runtime information of collaborators and +combines them into an ensemble method to exploit inherent knowledge of the +configuration search space. Moreover, Karasu allows the optimization of +multiple objectives simultaneously. Our evaluation is based on performance data +from diverse workload executions in a public cloud environment. We show that +Karasu is able to significantly boost existing methods in terms of performance, +search time, and cost, even when few comparable profiling runs are available +that share only partial common characteristics with the target job. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ HypBO: Expert-Guided Chemist-in-the-Loop Bayesian Search for New + Materials + + +
+ Robotics and automation offer massive accelerations for solving intractable, +multivariate scientific problems such as materials discovery, but the available +search spaces can be dauntingly large. Bayesian optimization (BO) has emerged +as a popular sample-efficient optimization engine, thriving in tasks where no +analytic form of the target function/property is known. Here we exploit expert +human knowledge in the form of hypotheses to direct Bayesian searches more +quickly to promising regions of chemical space. Previous methods have used +underlying distributions derived from existing experimental measurements, which +is unfeasible for new, unexplored scientific tasks. Also, such distributions +cannot capture intricate hypotheses. Our proposed method, which we call HypBO, +uses expert human hypotheses to generate an improved seed of samples. +Unpromising seeds are automatically discounted, while promising seeds are used +to augment the surrogate model data, thus achieving better-informed sampling. +This process continues in a global versus local search fashion, organized in a +bilevel optimization framework. We validate the performance of our method on a +range of synthetic functions and demonstrate its practical utility on a real +chemical design task where the use of expert hypotheses accelerates the search +performance significantly. + +
+
+
+
+
+ + ☆ Coarse-to-Fine Multi-Scene Pose Regression with Transformers + + +
+ Absolute camera pose regressors estimate the position and orientation of a +camera given the captured image alone. Typically, a convolutional backbone with +a multi-layer perceptron (MLP) head is trained using images and pose labels to +embed a single reference scene at a time. Recently, this scheme was extended to +learn multiple scenes by replacing the MLP head with a set of fully connected +layers. In this work, we propose to learn multi-scene absolute camera pose +regression with Transformers, where encoders are used to aggregate activation +maps with self-attention and decoders transform latent features and scene +encodings into pose predictions. This allows our model to focus on general +features that are informative for localization, while embedding multiple scenes +in parallel. We extend our previous MS-Transformer approach +\cite{shavit2021learning} by introducing a mixed classification-regression +architecture that improves the localization accuracy. Our method is evaluated +on commonly used indoor and outdoor benchmark datasets and has been shown to exceed +both multi-scene and state-of-the-art single-scene absolute pose regressors. + +
+
+ comment: Accepted to IEEE Transactions on Pattern Analysis and Machine + Intelligence (TPAMI). arXiv admin note: substantial text overlap with + arXiv:2103.11468 +
+
+
+
+
+ + ☆ Addressing Dynamic and Sparse Qualitative Data: A Hilbert Space + Embedding of Categorical Variables + + +
+ We propose a novel framework for incorporating qualitative data into +quantitative models for causal estimation. Previous methods use categorical +variables derived from qualitative data to build quantitative models. However, +this approach can lead to data-sparse categories and yield inconsistent +(asymptotically biased) and imprecise (finite sample biased) estimates if the +qualitative information is dynamic and intricate. We use functional analysis to +create a more nuanced and flexible framework. We embed the observed categories +into a latent Baire space and introduce a continuous linear map -- a Hilbert +space embedding -- from the Baire space of categories to a Reproducing Kernel +Hilbert Space (RKHS) of representation functions. Through the Riesz +representation theorem, we establish that the canonical treatment of +categorical variables in causal models can be transformed into an identified +structure in the RKHS. Transfer learning acts as a catalyst to streamline +estimation -- embeddings from traditional models are paired with the kernel +trick to form the Hilbert space embedding. We validate our model through +comprehensive simulation evidence and demonstrate its relevance in a real-world +study that contrasts theoretical predictions from economics and psychology in +an e-commerce marketplace. The results confirm the superior performance of our +model, particularly in scenarios where qualitative information is nuanced and +complex. + +
+
+
+
+
+ + ☆ Few-shot Anomaly Detection in Text with Deviation Learning ICONIP 2023 + + +
+ Most current methods for detecting anomalies in text concentrate on +constructing models solely relying on unlabeled data. These models operate on +the presumption that no labeled anomalous examples are available, which +prevents them from utilizing prior knowledge of anomalies that are typically +present in small numbers in many real-world applications. Furthermore, these +models prioritize learning feature embeddings rather than optimizing anomaly +scores directly, which could lead to suboptimal anomaly scoring and inefficient +use of data during the learning process. In this paper, we introduce FATE, a +deep few-shot learning-based framework that leverages limited anomaly examples +and learns anomaly scores explicitly in an end-to-end manner using deviation +learning. In this approach, the anomaly scores of normal examples are adjusted +to closely resemble reference scores obtained from a prior distribution. +Conversely, anomaly samples are forced to have anomalous scores that +considerably deviate from the reference score in the upper tail of the prior. +Additionally, our model is optimized to learn the distinct behavior of +anomalies by utilizing a multi-head self-attention layer and multiple instance +learning approaches. Comprehensive experiments on several benchmark datasets +demonstrate that our proposed approach attains a new level of state-of-the-art +performance. + +
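A minimal sketch of the deviation-learning objective described above is given below: normal scores are pulled toward reference scores drawn from a prior, and anomaly scores are pushed into the upper tail by at least a margin. The prior, margin, and number of reference draws are illustrative assumptions.

```python
import torch

def deviation_loss(scores, labels, n_ref=5000, margin=5.0):
    """scores: predicted anomaly scores; labels: 0 = normal, 1 = anomaly.
    Normal scores are pulled toward reference scores drawn from a standard
    normal prior; anomaly scores are pushed at least `margin` standardized
    deviations into the upper tail. n_ref and margin are illustrative."""
    ref = torch.randn(n_ref, device=scores.device)      # reference scores from the prior
    dev = (scores - ref.mean()) / (ref.std() + 1e-8)    # standardized deviation
    loss_normal = (1 - labels) * dev.abs()
    loss_anomaly = labels * torch.clamp(margin - dev, min=0)
    return (loss_normal + loss_anomaly).mean()

# Example: scores produced by a text scoring head over a mini-batch
scores = torch.randn(16)
labels = torch.randint(0, 2, (16,)).float()
print(deviation_loss(scores, labels))
```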
+
+ comment: Accepted in ICONIP 2023 +
+
+
+
+
+ + ☆ Understanding Hessian Alignment for Domain Generalization ICCV 2023 + + +
+ Out-of-distribution (OOD) generalization is a critical ability for deep +learning models in many real-world scenarios including healthcare and +autonomous vehicles. Recently, different techniques have been proposed to +improve OOD generalization. Among these methods, gradient-based regularizers +have shown promising performance compared with other competitors. Despite this +success, our understanding of the role of Hessian and gradient alignment in +domain generalization is still limited. To address this shortcoming, we analyze +the role of the classifier's head Hessian matrix and gradient in domain +generalization using recent OOD theory of transferability. Theoretically, we +show that spectral norm between the classifier's head Hessian matrices across +domains is an upper bound of the transfer measure, a notion of distance between +target and source domains. Furthermore, we analyze all the attributes that get +aligned when we encourage similarity between Hessians and gradients. Our +analysis explains the success of many regularizers like CORAL, IRM, V-REx, +Fish, IGA, and Fishr as they regularize part of the classifier's head Hessian +and/or gradient. Finally, we propose two simple yet effective methods to match +the classifier's head Hessians and gradients in an efficient way, based on the +Hessian Gradient Product (HGP) and Hutchinson's method (Hutchinson), and +without directly calculating Hessians. We validate the OOD generalization +ability of proposed methods in different scenarios, including transferability, +severe correlation shift, label shift and diversity shift. Our results show +that Hessian alignment methods achieve promising performance on various OOD +benchmarks. The code is available at +\url{https://github.com/huawei-noah/Federated-Learning/tree/main/HessianAlignment}. + +
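The abstract mentions matching Hessians without computing them explicitly, via Hessian-vector products and Hutchinson-style estimation. The sketch below shows the generic building blocks (double-backprop Hessian-vector products and a random-probe estimate of Hessian mismatch between two domain losses); it is not the authors' exact HGP or Hutchinson regularizer.

```python
import torch

def hvp(loss, params, vec):
    """Hessian-vector product via double backprop (no explicit Hessian)."""
    grads = torch.autograd.grad(loss, params, create_graph=True)
    flat = torch.cat([g.reshape(-1) for g in grads])
    return torch.autograd.grad(flat @ vec, params, retain_graph=True)

def hessian_mismatch(loss_a, loss_b, params, n_probes=4):
    """Hutchinson-style estimate of ||(H_a - H_b) v||^2 averaged over random
    Rademacher probes v, a rough proxy for Hessian misalignment across domains."""
    n = sum(p.numel() for p in params)
    total = 0.0
    for _ in range(n_probes):
        v = torch.randint(0, 2, (n,)).float() * 2 - 1    # Rademacher probe
        ha = torch.cat([h.reshape(-1) for h in hvp(loss_a, params, v)])
        hb = torch.cat([h.reshape(-1) for h in hvp(loss_b, params, v)])
        total = total + (ha - hb).pow(2).sum()
    return total / n_probes

# Tiny usage example with a linear classifier head and two synthetic "domains"
w = torch.randn(3, 2, requires_grad=True)
xa, ya = torch.randn(32, 3), torch.randint(0, 2, (32,))
xb, yb = torch.randn(32, 3) + 1.0, torch.randint(0, 2, (32,))
ce = torch.nn.functional.cross_entropy
print(hessian_mismatch(ce(xa @ w, ya), ce(xb @ w, yb), [w]))
```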
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Self-Training: A Survey + + +
+ Semi-supervised algorithms aim to learn prediction functions from a small set +of labeled observations and a large set of unlabeled observations. Because this +framework is relevant in many applications, these algorithms have received a lot of +interest in both academia and industry. Among the existing techniques, +self-training methods have undoubtedly attracted greater attention in recent +years. These models are designed to find the decision boundary in low-density +regions without making additional assumptions about the data distribution, and +use the unsigned output score of a learned classifier, or its margin, as an +indicator of confidence. The working principle of self-training algorithms is +to learn a classifier iteratively by assigning pseudo-labels to the set of +unlabeled training samples with a margin greater than a certain threshold. The +pseudo-labeled examples are then used to enrich the labeled training data and +to train a new classifier in conjunction with the labeled training set. In this +paper, we present self-training methods for binary and multi-class +classification, as well as their variants and two related approaches, namely +consistency-based approaches and transductive learning. We examine the impact +of significant self-training features on various methods, using different +general and image classification benchmarks, and we discuss our ideas for +future research in self-training. To the best of our knowledge, this is the +first thorough and complete survey on this subject. + +
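The working principle described above (iteratively pseudo-labeling confident unlabeled samples and retraining) fits in a few lines of scikit-learn; the sketch below uses a probability threshold as the confidence criterion, an illustrative stand-in for the margin threshold discussed in the survey.

```python
import numpy as np
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression

X, y = make_moons(n_samples=2000, noise=0.2, random_state=0)
labeled = np.zeros(len(X), dtype=bool)
labeled[np.random.default_rng(0).choice(len(X), size=20, replace=False)] = True  # small labeled set

clf = LogisticRegression()
threshold = 0.9                               # confidence threshold, an illustrative choice
for _ in range(10):
    clf.fit(X[labeled], y[labeled])
    proba = clf.predict_proba(X[~labeled])
    confident = proba.max(axis=1) >= threshold
    if not confident.any():
        break
    idx = np.flatnonzero(~labeled)[confident]
    y[idx] = clf.predict(X[idx])              # assign pseudo-labels to confident samples
    labeled[idx] = True                       # enrich the labeled set and retrain
print("labeled fraction after self-training:", labeled.mean())
```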
+
+ comment: 27 pages, 1 figure +
+
+
+
+
+ + ♻ Consciousness in Artificial Intelligence: Insights from the Science of + Consciousness + + +
+ Whether current or near-term AI systems could be conscious is a topic of +scientific interest and increasing public concern. This report argues for, and +exemplifies, a rigorous and empirically grounded approach to AI consciousness: +assessing existing AI systems in detail, in light of our best-supported +neuroscientific theories of consciousness. We survey several prominent +scientific theories of consciousness, including recurrent processing theory, +global workspace theory, higher-order theories, predictive processing, and +attention schema theory. From these theories we derive "indicator properties" +of consciousness, elucidated in computational terms that allow us to assess AI +systems for these properties. We use these indicator properties to assess +several recent AI systems, and we discuss how future systems might implement +them. Our analysis suggests that no current AI systems are conscious, but also +suggests that there are no obvious technical barriers to building AI systems +which satisfy these indicators. + +
+
+
+
+
+ + ♻ ☆ MindMap: Knowledge Graph Prompting Sparks Graph of Thoughts in Large + Language Models + + +
+ LLMs usually exhibit limitations in incorporating new knowledge, a tendency to +generate hallucinations, and limited transparency in their +decision-making process. In this paper, we explore how to prompt LLMs with +knowledge graphs (KG), working as a remedy to engage LLMs with up-to-date +knowledge and elicit the reasoning pathways from LLMs. Specifically, we build a +prompting pipeline that endows LLMs with the capability of comprehending KG +inputs and inferring over a combination of their implicit knowledge and the retrieved +external knowledge. In addition, we investigate eliciting the mind map on which +LLMs perform the reasoning and generate the answers. It is identified that the +produced mind map exhibits the reasoning pathways of LLMs grounded on the +ontology of knowledge, hence bringing the prospects of probing and gauging LLM +inference in production. Experiments on three question answering datasets +also show that MindMap prompting leads to a striking empirical gain. For +instance, prompting GPT-3.5 with MindMap consistently yields an overwhelming performance +gain over GPT-4. We also demonstrate that with structured facts +retrieved from KG, MindMap can outperform a series of +prompting-with-document-retrieval methods, benefiting from more accurate, +concise, and comprehensive knowledge from KGs. + +
+
+ comment: 7 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ MMD-Regularized Unbalanced Optimal Transport + + +
+ We study the unbalanced optimal transport (UOT) problem, where the marginal +constraints are enforced using Maximum Mean Discrepancy (MMD) regularization. +Our work is motivated by the observation that the literature on UOT is focused +on regularization based on $\phi$-divergence (e.g., KL divergence). Despite the +popularity of MMD, its role as a regularizer in the context of UOT seems less +understood. We begin by deriving the dual of MMD-regularized UOT (MMD-UOT), +which helps us prove other useful properties. One interesting outcome of this +duality result is that MMD-UOT induces novel metrics, which not only lift the +ground metric like the Wasserstein but are also efficient to estimate like the +MMD. Further, we present finite-dimensional convex programs for estimating +MMD-UOT and the corresponding barycenter solely based on the samples from the +measures being transported. Under mild conditions, we prove that our +convex-program-based estimators are consistent and the estimation error decays +at a rate $\mathcal{O}\left(m^{-\frac{1}{2}}\right)$, where $m$ is the number +of samples. As far as we know, such error bounds that are free from the curse +of dimensionality are not known for $\phi$-divergence regularized UOT. Finally, +we discuss how the proposed convex programs can be solved efficiently using +accelerated projected gradient descent. Our experiments show that MMD-UOT +consistently outperforms popular baselines, including KL-regularized UOT and +MMD, in diverse machine learning applications. + +
+
+
+
+
+ + ♻ ☆ Discovering Conservation Laws using Optimal Transport and Manifold + Learning + + +
+ Conservation laws are key theoretical and practical tools for understanding, +characterizing, and modeling nonlinear dynamical systems. However, for many +complex systems, the corresponding conserved quantities are difficult to +identify, making it hard to analyze their dynamics and build stable predictive +models. Current approaches for discovering conservation laws often depend on +detailed dynamical information or rely on black box parametric deep learning +methods. We instead reformulate this task as a manifold learning problem and +propose a non-parametric approach for discovering conserved quantities. We test +this new approach on a variety of physical systems and demonstrate that our +method is able to both identify the number of conserved quantities and extract +their values. Using tools from optimal transport theory and manifold learning, +our proposed method provides a direct geometric approach to identifying +conservation laws that is both robust and interpretable without requiring an +explicit model of the system nor accurate time information. + +
+
+ comment: 30 pages, 15 figures (7 main text, 8 supplemental), 3 tables + (supplemental) +
+
+
+
+
+ + ♻ ☆ SAFE: Machine Unlearning With Shard Graphs ICCV 2023 + + +
+ We present Synergy Aware Forgetting Ensemble (SAFE), a method to adapt large +models on a diverse collection of data while minimizing the expected cost to +remove the influence of training samples from the trained model. This process, +also known as selective forgetting or unlearning, is often conducted by +partitioning a dataset into shards, training fully independent models on each, +then ensembling the resulting models. Increasing the number of shards reduces +the expected cost to forget but at the same time it increases inference cost +and reduces the final accuracy of the model since synergistic information +between samples is lost during the independent model training. Rather than +treating each shard as independent, SAFE introduces the notion of a shard +graph, which allows incorporating limited information from other shards during +training, trading off a modest increase in expected forgetting cost with a +significant increase in accuracy, all while still attaining complete removal of +residual influence after forgetting. SAFE uses a lightweight system of adapters +which can be trained while reusing most of the computations. This allows SAFE +to be trained on shards an order-of-magnitude smaller than current +state-of-the-art methods (thus reducing the forgetting costs) while also +maintaining high accuracy, as we demonstrate empirically on fine-grained +computer vision datasets. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Adaptive Embedding Makes Vanilla Transformer SOTA for + Traffic Forecasting CIKM2023 + + +
+ With the rapid development of the Intelligent Transportation System (ITS), +accurate traffic forecasting has emerged as a critical challenge. The key +bottleneck lies in capturing the intricate spatio-temporal traffic patterns. In +recent years, numerous neural networks with complicated architectures have been +proposed to address this issue. However, the advancements in network +architectures have encountered diminishing performance gains. In this study, we +present a novel component called spatio-temporal adaptive embedding that can +yield outstanding results with vanilla transformers. Our proposed +Spatio-Temporal Adaptive Embedding transformer (STAEformer) achieves +state-of-the-art performance on five real-world traffic forecasting datasets. +Further experiments demonstrate that spatio-temporal adaptive embedding plays a +crucial role in traffic forecasting by effectively capturing intrinsic +spatio-temporal relations and chronological information in traffic time series. + +
+
+ comment: Accepted as CIKM2023 Short Paper +
+
+
+
+
+ + ♻ ☆ FedSIS: Federated Split Learning with Intermediate Representation + Sampling for Privacy-preserving Generalized Face Presentation Attack + Detection + + +
+ Lack of generalization to unseen domains/attacks is the Achilles heel of most +face presentation attack detection (FacePAD) algorithms. Existing attempts to +enhance the generalizability of FacePAD solutions assume that data from +multiple source domains are available with a single entity to enable +centralized training. In practice, data from different source domains may be +collected by diverse entities, who are often unable to share their data due to +legal and privacy constraints. While collaborative learning paradigms such as +federated learning (FL) can overcome this problem, standard FL methods are +ill-suited for domain generalization because they struggle to surmount the twin +challenges of handling non-iid client data distributions during training and +generalizing to unseen domains during inference. In this work, a novel +framework called Federated Split learning with Intermediate representation +Sampling (FedSIS) is introduced for privacy-preserving domain generalization. +In FedSIS, a hybrid Vision Transformer (ViT) architecture is learned using a +combination of FL and split learning to achieve robustness against statistical +heterogeneity in the client data distributions without any sharing of raw data +(thereby preserving privacy). To further improve generalization to unseen +domains, a novel feature augmentation strategy called intermediate +representation sampling is employed, and discriminative information from +intermediate blocks of a ViT is distilled using a shared adapter network. The +FedSIS approach has been evaluated on two well-known benchmarks for +cross-domain FacePAD to demonstrate that it is possible to achieve +state-of-the-art generalization performance without data sharing. Code: +https://github.com/Naiftt/FedSIS + +
+
+ comment: Accepted to the IEEE International Joint Conference on Biometrics + (IJCB), 2023 +
+
+
+
+
+ + ♻ ☆ Beyond Geometry: Comparing the Temporal Structure of Computation in + Neural Circuits with Dynamical Similarity Analysis + + +
+ How can we tell whether two neural networks are utilizing the same internal +processes for a particular computation? This question is pertinent for multiple +subfields of both neuroscience and machine learning, including neuroAI, +mechanistic interpretability, and brain-machine interfaces. Standard approaches +for comparing neural networks focus on the spatial geometry of latent states. +Yet in recurrent networks, computations are implemented at the level of neural +dynamics, which do not have a simple one-to-one mapping with geometry. To +bridge this gap, we introduce a novel similarity metric that compares two +systems at the level of their dynamics. Our method incorporates two components: +Using recent advances in data-driven dynamical systems theory, we learn a +high-dimensional linear system that accurately captures core features of the +original nonlinear dynamics. Next, we compare these linear approximations via a +novel extension of Procrustes Analysis that accounts for how vector fields +change under orthogonal transformation. Via four case studies, we demonstrate +that our method effectively identifies and distinguishes dynamic structure in +recurrent neural networks (RNNs), whereas geometric methods fall short. We +additionally show that our method can distinguish learning rules in an +unsupervised manner. Our method therefore opens the door to novel data-driven +analyses of the temporal structure of neural computation, and to more rigorous +testing of RNNs as models of the brain. + +
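A heavily simplified sketch of the two components is shown below: fit a linear operator to each system's trajectories with a DMD-style least-squares step, then compare the operators after an orthogonal Procrustes alignment. The paper's actual method learns a richer high-dimensional linear model and uses an extended Procrustes analysis for vector fields, so this is only a rough approximation.

```python
import numpy as np
from scipy.linalg import orthogonal_procrustes

def linear_operator(traj):
    """Least-squares fit of x_{t+1} ~= A x_t from a (time x dim) trajectory,
    a DMD-style stand-in for the learned high-dimensional linear system."""
    X, Y = traj[:-1], traj[1:]
    A, *_ = np.linalg.lstsq(X, Y, rcond=None)
    return A.T

def dynamical_dissimilarity(traj1, traj2):
    A1, A2 = linear_operator(traj1), linear_operator(traj2)
    # Crude alignment; the paper's extension accounts for how vector fields
    # transform under orthogonal changes of basis.
    R, _ = orthogonal_procrustes(A1, A2)
    return np.linalg.norm(A1 @ R - A2)

rng = np.random.default_rng(0)
base = rng.standard_normal((200, 5))              # stand-in latent-state trajectory
print(dynamical_dissimilarity(base, base @ rng.standard_normal((5, 5))))
```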
+
+ comment: 21 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Taken by Surprise: Contrast effect for Similarity Scores + + +
+ Accurately evaluating the similarity of object vector embeddings is of +critical importance for natural language processing, information retrieval and +classification tasks. Popular similarity scores (e.g cosine similarity) are +based on pairs of embedding vectors and disregard the distribution of the +ensemble from which objects are drawn. Human perception of object similarity +significantly depends on the context in which the objects appear. In this work +we propose the $\textit{surprise score}$, an ensemble-normalized similarity +metric that encapsulates the contrast effect of human perception and +significantly improves the classification performance on zero- and few-shot +document classification tasks. This score quantifies the surprise to find a +given similarity between two elements relative to the pairwise ensemble +similarities. We evaluate this metric on zero/few shot classification and +clustering tasks and typically find 10-15 % better performance compared to raw +cosine similarity. Our code is available at +https://github.com/MeetElise/surprise-similarity. + +
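As a rough illustration of an ensemble-normalized similarity (the exact surprise score is defined in the paper and repository above), the sketch below scores a pair by how its cosine similarity ranks against all pairwise similarities in the ensemble.

```python
import numpy as np

def cosine_matrix(E):
    E = E / np.linalg.norm(E, axis=1, keepdims=True)
    return E @ E.T

def surprise(i, j, embeddings):
    """Fraction of ensemble pairs whose similarity falls below that of (i, j):
    a high value means the observed similarity is surprisingly large for this ensemble."""
    S = cosine_matrix(embeddings)
    pair_sims = S[np.triu_indices_from(S, k=1)]   # all distinct pairs
    return float((pair_sims < S[i, j]).mean())

rng = np.random.default_rng(0)
docs = rng.normal(size=(100, 64))                 # stand-in document embeddings
print(surprise(0, 1, docs))
```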
+
+ comment: 9 pages, 2 figures and 4 tables +
+
+
+
+
+ + ♻ ☆ On Performance Discrepancies Across Local Homophily Levels in Graph + Neural Networks + + +
+ Graph Neural Network (GNN) research has highlighted a relationship between +high homophily (i.e., the tendency of nodes of the same class to connect) and +strong predictive performance in node classification. However, recent work has +found the relationship to be more nuanced, demonstrating that simple GNNs can +learn in certain heterophilous settings. To resolve these conflicting findings +and align closer to real-world datasets, we go beyond the assumption of a +global graph homophily level and study the performance of GNNs when the local +homophily level of a node deviates from the global homophily level. Through +theoretical and empirical analysis, we systematically demonstrate how shifts in +local homophily can introduce performance degradation, leading to performance +discrepancies across local homophily levels. We ground the practical +implications of this work through granular analysis on five real-world datasets +with varying global homophily levels, demonstrating that (a) GNNs can fail to +generalize to test nodes that deviate from the global homophily of a graph, and +(b) high local homophily does not necessarily confer high performance for a +node. We further show that GNNs designed for globally heterophilous graphs can +alleviate performance discrepancy by improving performance across local +homophily levels, offering a new perspective on how these GNNs achieve stronger +global performance. + +
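For readers unfamiliar with the quantity being varied, a small sketch (assumed notation, not the paper's code) of global edge homophily versus per-node local homophily computed from an edge list and node labels:

```python
import numpy as np
from collections import defaultdict

def homophily_levels(edges, labels):
    """edges: iterable of undirected (u, v) pairs; labels: dict node -> class."""
    neigh = defaultdict(list)
    same = 0
    for u, v in edges:
        neigh[u].append(v)
        neigh[v].append(u)
        same += labels[u] == labels[v]
    global_h = same / len(edges)
    local_h = {u: float(np.mean([labels[u] == labels[v] for v in vs]))
               for u, vs in neigh.items()}
    return global_h, local_h

edges = [(0, 1), (1, 2), (2, 3), (3, 0), (0, 2)]
labels = {0: "a", 1: "a", 2: "b", 3: "b"}
g, l = homophily_levels(edges, labels)
print(g, l)   # nodes whose local level deviates from g are the hard cases studied above
```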
+
+
+
+
+ + ♻ ☆ DClEVerNet: Deep Combinatorial Learning for Efficient EV Charging + Scheduling in Large-scale Networked Facilities + + +
+ With the electrification of transportation, the rising uptake of electric +vehicles (EVs) might stress distribution networks significantly, leaving their +performance degraded and stability jeopardized. To accommodate these new loads +cost-effectively, modern power grids require coordinated or ``smart'' charging +strategies capable of optimizing EV charging scheduling in a scalable and +efficient fashion. With this in view, the present work focuses on reservation +management programs for large-scale, networked EV charging stations. We +formulate a time-coupled binary optimization problem that maximizes EV users' +total welfare gain while accounting for the network's available power capacity +and stations' occupancy limits. To tackle the problem at scale while retaining +high solution quality, a data-driven optimization framework combining +techniques from the fields of Deep Learning and Approximation Algorithms is +introduced. The framework's key ingredient is a novel input-output processing +scheme for neural networks that allows direct extrapolation to problem sizes +substantially larger than those included in the training set. Extensive +numerical simulations based on synthetic and real-world data traces verify the +effectiveness and superiority of the presented approach over two representative +scheduling algorithms. Lastly, we round up the contributions by listing several +immediate extensions to the proposed framework and outlining the prospects for +further exploration. + +
+
+ comment: Published in the proceedings of the 14th ACM International Conference + on Future Energy Systems (Best paper award nominee). + https://dl.acm.org/doi/abs/10.1145/3575813.3595205 +
+
+
+
+
+ + ♻ ☆ Improving automatic endoscopic stone recognition using a multi-view + fusion approach enhanced with two-step transfer learning ICCV 2023 + + +
+ This contribution presents a deep-learning method for extracting and fusing +image information acquired from different viewpoints, with the aim to produce +more discriminant object features for the identification of the type of kidney +stones seen in endoscopic images. The model was further improved with a +two-step transfer learning approach and by attention blocks to refine the +learned feature maps. Deep feature fusion strategies improved the results of +single view extraction backbone models by more than 6% in terms of accuracy of +the kidney stones classification. + +
+
+ comment: This paper has been accepted at the LatinX in Computer Vision (LXCV) + Research workshop at ICCV 2023 (Paris, France) +
+
+
+
+
+ + ♻ ☆ Constrained Probabilistic Mask Learning for Task-specific Undersampled + MRI Reconstruction WACV 2024 + + +
+ Undersampling is a common method in Magnetic Resonance Imaging (MRI) to +subsample the number of data points in k-space, reducing acquisition times at +the cost of decreased image quality. A popular approach is to employ +undersampling patterns following various strategies, e.g., variable density +sampling or radial trajectories. In this work, we propose a method that +directly learns the undersampling masks from data points, thereby also +providing task- and domain-specific patterns. To solve the resulting discrete +optimization problem, we propose a general optimization routine called ProM: A +fully probabilistic, differentiable, versatile, and model-free framework for +mask optimization that enforces acceleration factors through a convex +constraint. Analyzing knee, brain, and cardiac MRI datasets with our method, we +discover that different anatomic regions reveal distinct optimal undersampling +masks, demonstrating the benefits of using custom masks, tailored for a +downstream task. For example, ProM can create undersampling masks that maximize +performance in downstream tasks like segmentation with networks trained on +fully-sampled MRIs. Even with extreme acceleration factors, ProM yields +reasonable performance while being more versatile than existing methods, paving +the way for data-driven all-purpose mask generation. + +
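A highly simplified sketch of the idea of a probabilistic undersampling mask with an acceleration-factor budget, under assumptions of our own (the actual ProM routine, including its convex constraint and differentiable optimization, is defined in the paper):

```python
import numpy as np

def sample_mask(logits, acceleration=4, rng=None):
    """Turn unconstrained logits into a binary k-space mask whose expected
    sampling rate is roughly 1/acceleration (a crude stand-in for ProM's projection)."""
    rng = rng or np.random.default_rng(0)
    p = 1.0 / (1.0 + np.exp(-logits))                           # keep probabilities
    p = np.clip(p * (1.0 / acceleration) / p.mean(), 0.0, 1.0)  # enforce the budget
    return (rng.random(p.shape) < p).astype(np.float32), p

logits = np.random.default_rng(1).normal(size=(128, 128))  # one logit per k-space point
mask, probs = sample_mask(logits, acceleration=8)
print(mask.mean(), probs.mean())   # ~1/8 of k-space retained in expectation
```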
+
+ comment: accepted at WACV 2024 +
+
+
+
+
+ + ♻ ☆ Stabilizing Unsupervised Environment Design with a Learned Adversary + + +
+ A key challenge in training generally-capable agents is the design of +training tasks that facilitate broad generalization and robustness to +environment variations. This challenge motivates the problem setting of +Unsupervised Environment Design (UED), whereby a student agent trains on an +adaptive distribution of tasks proposed by a teacher agent. A pioneering +approach for UED is PAIRED, which uses reinforcement learning (RL) to train a +teacher policy to design tasks from scratch, making it possible to directly +generate tasks that are adapted to the agent's current capabilities. Despite +its strong theoretical backing, PAIRED suffers from a variety of challenges +that hinder its practical performance. Thus, state-of-the-art methods currently +rely on curation and mutation rather than generation of new tasks. In this +work, we investigate several key shortcomings of PAIRED and propose solutions +for each shortcoming. As a result, we make it possible for PAIRED to match or +exceed state-of-the-art methods, producing robust agents in several established +challenging procedurally-generated environments, including a partially-observed +maze navigation task and a continuous-control car racing environment. We +believe this work motivates a renewed emphasis on UED methods based on learned +models that directly generate challenging environments, potentially unlocking +more open-ended RL training and, as a result, more general agents. + +
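The teacher's objective in PAIRED-style UED is an estimate of regret on the proposed environment, roughly the gap between an antagonist's best return and the protagonist's average return; a minimal sketch of that estimate (not the paper's training loop):

```python
import numpy as np

def paired_regret(protagonist_returns, antagonist_returns):
    """PAIRED-style regret estimate for one proposed environment: the best
    antagonist episode return minus the protagonist's average return. The
    teacher is rewarded for environments with high (but achievable) regret."""
    return np.max(antagonist_returns) - np.mean(protagonist_returns)

# toy usage with per-episode returns collected on one generated level
print(paired_regret(protagonist_returns=[0.1, 0.3, 0.2],
                    antagonist_returns=[0.7, 0.9, 0.8]))   # 0.9 - 0.2 = 0.7
```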
+
+ comment: CoLLAs 2023 - Oral; Second and third authors contributed equally +
+
+
+
+
+ + ♻ ☆ An Effective Method using Phrase Mechanism in Neural Machine Translation + + +
+ Machine Translation is one of the essential tasks in Natural Language
+Processing (NLP), which has massive applications in real life as well as
+contributing to other tasks in the NLP research community. Recently,
+Transformer-based methods have attracted numerous researchers in this domain
+and achieved state-of-the-art results in most language pairs. In this paper, we
+report an effective method using a phrase mechanism, PhraseTransformer, to
+improve the strong baseline model Transformer in constructing a Neural Machine
+Translation (NMT) system for the Vietnamese-Chinese parallel corpora. Our
+experiments on the MT dataset of the VLSP 2022 competition achieved a BLEU
+score of 35.3 on Vietnamese-to-Chinese and 33.2 on Chinese-to-Vietnamese data.
+Our code is available at
+https://github.com/phuongnm94/PhraseTransformer.
+
+
+
+
+
+
+ + ♻ ☆ Evading Watermark based Detection of AI-Generated Content CCS + + +
+ A generative AI model can generate extremely realistic-looking content,
+posing growing challenges to the authenticity of information. To address the
+challenges, watermarking has been leveraged to detect AI-generated content.
+Specifically, a watermark is embedded into AI-generated content before it is
+released. Content is detected as AI-generated if a similar watermark can be
+decoded from it. In this work, we perform a systematic study on the robustness
+of such watermark-based AI-generated content detection. We focus on
+AI-generated images. Our work shows that an attacker can post-process a
+watermarked image by adding a small, human-imperceptible perturbation to it,
+such that the post-processed image evades detection while maintaining its
+visual quality. We show the effectiveness of our attack both theoretically and
+empirically. Moreover, to evade detection, our adversarial post-processing
+method adds much smaller perturbations to AI-generated images and thus better
+maintains their visual quality than existing popular post-processing methods
+such as JPEG compression, Gaussian blur, and Brightness/Contrast. Our work
+shows the insufficiency of existing watermark-based detection of AI-generated
+content, highlighting the urgent need for new methods. Our code is publicly
+available: https://github.com/zhengyuan-jiang/WEvade.
+
+
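A hedged sketch of the general attack pattern described above, not the paper's exact WEvade procedure: a small, norm-bounded perturbation is optimized to push a hypothetical, differentiable watermark decoder away from the embedded bits.

```python
import torch
import torch.nn.functional as F

def evade_watermark(image, decoder, watermark_bits, eps=8 / 255, steps=200, lr=1e-2):
    """Adversarial post-processing sketch: search for a small perturbation that
    degrades watermark decoding while staying within an L_inf budget.
    `decoder` is a hypothetical differentiable model returning one logit per bit."""
    delta = torch.zeros_like(image, requires_grad=True)
    optimizer = torch.optim.Adam([delta], lr=lr)
    for _ in range(steps):
        logits = decoder(torch.clamp(image + delta, 0.0, 1.0))
        # maximize the decoding error on the embedded watermark bits
        loss = -F.binary_cross_entropy_with_logits(logits, watermark_bits)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            delta.clamp_(-eps, eps)          # keep the perturbation imperceptible
    return torch.clamp(image + delta, 0.0, 1.0).detach()
```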
+
+ comment: To appear in ACM Conference on Computer and Communications Security + (CCS), 2023 +
+
+
+
+
+ + ♻ ☆ Double Pessimism is Provably Efficient for Distributionally Robust + Offline Reinforcement Learning: Generic Algorithm and Robust Partial Coverage + + +
+ In this paper, we study distributionally robust offline reinforcement +learning (robust offline RL), which seeks to find an optimal policy purely from +an offline dataset that can perform well in perturbed environments. In +specific, we propose a generic algorithm framework called Doubly Pessimistic +Model-based Policy Optimization ($P^2MPO$), which features a novel combination +of a flexible model estimation subroutine and a doubly pessimistic policy +optimization step. Notably, the double pessimism principle is crucial to +overcome the distributional shifts incurred by (i) the mismatch between the +behavior policy and the target policies; and (ii) the perturbation of the +nominal model. Under certain accuracy conditions on the model estimation +subroutine, we prove that $P^2MPO$ is sample-efficient with robust partial +coverage data, which only requires the offline data to have good coverage of +the distributions induced by the optimal robust policy and the perturbed models +around the nominal model. + By tailoring specific model estimation subroutines for concrete examples of +RMDPs, including tabular RMDPs, factored RMDPs, kernel and neural RMDPs, we +prove that $P^2MPO$ enjoys a $\tilde{\mathcal{O}}(n^{-1/2})$ convergence rate, +where $n$ is the dataset size. We highlight that all these examples, except +tabular RMDPs, are first identified and proven tractable by this work. +Furthermore, we continue our study of robust offline RL in the robust Markov +games (RMGs). By extending the double pessimism principle identified for +single-agent RMDPs, we propose another algorithm framework that can efficiently +find the robust Nash equilibria among players using only robust unilateral +(partial) coverage data. To our best knowledge, this work proposes the first +general learning principle -- double pessimism -- for robust offline RL and +shows that it is provably efficient with general function approximation. + +
+
+ comment: V2 adds results on robust offline Markov games +
+
+
+
+
+ + ♻ ☆ Active Exploration for Inverse Reinforcement Learning NeurIPS + + +
+ Inverse Reinforcement Learning (IRL) is a powerful paradigm for inferring a +reward function from expert demonstrations. Many IRL algorithms require a known +transition model and sometimes even a known expert policy, or they at least +require access to a generative model. However, these assumptions are too strong +for many real-world applications, where the environment can be accessed only +through sequential interaction. We propose a novel IRL algorithm: Active +exploration for Inverse Reinforcement Learning (AceIRL), which actively +explores an unknown environment and expert policy to quickly learn the expert's +reward function and identify a good policy. AceIRL uses previous observations +to construct confidence intervals that capture plausible reward functions and +find exploration policies that focus on the most informative regions of the +environment. AceIRL is the first approach to active IRL with sample-complexity +bounds that does not require a generative model of the environment. AceIRL +matches the sample complexity of active IRL with a generative model in the +worst case. Additionally, we establish a problem-dependent bound that relates +the sample complexity of AceIRL to the suboptimality gap of a given IRL +problem. We empirically evaluate AceIRL in simulations and find that it +significantly outperforms more naive exploration strategies. + +
+
+ comment: Presented at Conference on Neural Information Processing Systems + (NeurIPS), 2022 +
+
+
+
+
+ + ♻ ☆ On the Usage of Continual Learning for Out-of-Distribution + Generalization in Pre-trained Language Models of Code + + +
+ Pre-trained language models (PLMs) have become a prevalent technique in deep +learning for code, utilizing a two-stage pre-training and fine-tuning procedure +to acquire general knowledge about code and specialize in a variety of +downstream tasks. However, the dynamic nature of software codebases poses a +challenge to the effectiveness and robustness of PLMs. In particular, +world-realistic scenarios potentially lead to significant differences between +the distribution of the pre-training and test data, i.e., distribution shift, +resulting in a degradation of the PLM's performance on downstream tasks. In +this paper, we stress the need for adapting PLMs of code to software data whose +distribution changes over time, a crucial problem that has been overlooked in +previous works. The motivation of this work is to consider the PLM in a +non-stationary environment, where fine-tuning data evolves over time according +to a software evolution scenario. Specifically, we design a scenario where the +model needs to learn from a stream of programs containing new, unseen APIs over +time. We study two widely used PLM architectures, i.e., a GPT2 decoder and a +RoBERTa encoder, on two downstream tasks, API call and API usage prediction. We +demonstrate that the most commonly used fine-tuning technique from prior work +is not robust enough to handle the dynamic nature of APIs, leading to the loss +of previously acquired knowledge i.e., catastrophic forgetting. To address +these issues, we implement five continual learning approaches, including +replay-based and regularization-based methods. Our findings demonstrate that +utilizing these straightforward methods effectively mitigates catastrophic +forgetting in PLMs across both downstream tasks while achieving comparable or +superior performance. + +
+
+
+
+
+ + ♻ ☆ Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout + Analysis + + +
+ Understanding digital documents is like solving a puzzle, especially +historical ones. Document Layout Analysis (DLA) helps with this puzzle by +dividing documents into sections like paragraphs, images, and tables. This is +crucial for machines to read and understand these documents. In the DL Sprint +2.0 competition, we worked on understanding Bangla documents. We used a dataset +called BaDLAD with lots of examples. We trained a special model called Mask +R-CNN to help with this understanding. We made this model better by +step-by-step hyperparameter tuning, and we achieved a good dice score of 0.889. +However, not everything went perfectly. We tried using a model trained for +English documents, but it didn't fit well with Bangla. This showed us that each +language has its own challenges. Our solution for the DL Sprint 2.0 is publicly +available at https://www.kaggle.com/competitions/dlsprint2/discussion/432201 +along with notebooks, weights, and inference notebook. + +
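For reference, the dice score quoted above measures the overlap between predicted and ground-truth segmentation masks; a minimal sketch:

```python
import numpy as np

def dice_score(pred, target, eps=1e-7):
    """Dice coefficient between two binary masks: 2|A ∩ B| / (|A| + |B|)."""
    pred, target = pred.astype(bool), target.astype(bool)
    inter = np.logical_and(pred, target).sum()
    return (2.0 * inter + eps) / (pred.sum() + target.sum() + eps)

pred = np.zeros((64, 64)); pred[10:40, 10:40] = 1
gt = np.zeros((64, 64)); gt[15:45, 15:45] = 1
print(round(float(dice_score(pred, gt)), 3))
```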
+
+ comment: Contest paper, Contest: DL Sprint 2.0 (Link:
+ https://www.kaggle.com/competitions/dlsprint2), Solution link:
+ https://www.kaggle.com/competitions/dlsprint2/discussion/432201
+
+
+
+
+
+ + ♻ ☆ Label-free timing analysis of SiPM-based modularized detectors with + physics-constrained deep learning + + +
+ Pulse timing is an important topic in nuclear instrumentation, with +far-reaching applications from high energy physics to radiation imaging. While +high-speed analog-to-digital converters become more and more developed and +accessible, their potential uses and merits in nuclear detector signal +processing are still uncertain, partially due to associated timing algorithms +which are not fully understood and utilized. In this paper, we propose a novel +method based on deep learning for timing analysis of modularized detectors +without explicit needs of labelling event data. By taking advantage of the +intrinsic time correlations, a label-free loss function with a specially +designed regularizer is formed to supervise the training of neural networks +towards a meaningful and accurate mapping function. We mathematically +demonstrate the existence of the optimal function desired by the method, and +give a systematic algorithm for training and calibration of the model. The +proposed method is validated on two experimental datasets based on silicon +photomultipliers (SiPM) as main transducers. In the toy experiment, the neural +network model achieves the single-channel time resolution of 8.8 ps and +exhibits robustness against concept drift in the dataset. In the +electromagnetic calorimeter experiment, several neural network models (FC, CNN +and LSTM) are tested to show their conformance to the underlying physical +constraint and to judge their performance against traditional methods. In +total, the proposed method works well in either ideal or noisy experimental +condition and recovers the time information from waveform samples successfully +and precisely. + +
+
+ comment: 26 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ SMT 2.0: A Surrogate Modeling Toolbox with a focus on Hierarchical and + Mixed Variables Gaussian Processes + + +
+ The Surrogate Modeling Toolbox (SMT) is an open-source Python package that
+offers a collection of surrogate modeling methods, sampling techniques, and a
+set of sample problems. This paper presents SMT 2.0, a major new release of SMT
+that introduces significant upgrades and new features to the toolbox. This
+release adds the capability to handle mixed-variable surrogate models and
+hierarchical variables. These types of variables are becoming increasingly
+important in several surrogate modeling applications. SMT 2.0 also improves SMT
+by extending sampling methods, adding new surrogate models, and computing
+variance and kernel derivatives for Kriging. This release also includes new
+functions to handle noisy data and to use multi-fidelity data. To the best of
+our knowledge, SMT 2.0 is the first open-source surrogate library to propose
+surrogate models for hierarchical and mixed inputs. This open-source software
+is distributed under the New BSD license.
+
+
+
+ comment: version 2 +
+
+
+
+
+ + ♻ ☆ Equivariant Networks for Porous Crystalline Materials + + +
+ Porous crystalline materials have the potential to play a key role in
+developing solutions for molecular storage, gas separation and carbon
+adsorption. For these solutions, we need to develop new materials with specific
+properties. Estimating the properties of such porous materials involves
+first-principles simulation using classical molecular simulations. The
+computational complexity of these methods can be a barrier to high throughput
+screening of the potential materials as the space of possible materials is
+vast. Data-driven methods, specifically machine learning methods based on deep
+neural networks, offer a significant opportunity to scale up the simulation of
+the behavior of these materials. However, to effectively achieve this, the deep
+learning models need to utilize the symmetries present in the crystals.
+Crystals possess specific symmetries, which are captured by their space group.
+Existing methods for crystal property prediction either have symmetry
+constraints that are too restrictive or only incorporate symmetries between
+unit cells. In addition, these models do not explicitly model the porous
+structure of the crystal. In this paper, we develop a model which incorporates
+the symmetries of the unit cell of a crystal in its architecture and explicitly
+models the porous structure. We evaluate our model by predicting the heat of
+adsorption of CO$_2$ for different configurations of the Mordenite and ZSM-5
+zeolites. Our results confirm that our method performs better than existing
+methods for crystal property prediction and that the inclusion of pores results
+in a more efficient model.
+
+
+
+ comment: Added additional figures as well as additional experiments for MFI +
+
+
+
+
+ + ♻ ☆ Estimating Gibbs free energies via isobaric-isothermal flows + + +
+ We present a machine-learning model based on normalizing flows that is +trained to sample from the isobaric-isothermal ensemble. In our approach, we +approximate the joint distribution of a fully-flexible triclinic simulation box +and particle coordinates to achieve a desired internal pressure. This novel +extension of flow-based sampling to the isobaric-isothermal ensemble yields +direct estimates of Gibbs free energies. We test our NPT-flow on monatomic +water in the cubic and hexagonal ice phases and find excellent agreement of +Gibbs free energies and other observables compared with established baselines. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ The Prospect of Enhancing Large-Scale Heterogeneous Federated Learning + with Transformers + + +
+ Federated learning (FL) addresses data privacy concerns by enabling +collaborative training of AI models across distributed data owners. Wide +adoption of FL faces the fundamental challenges of data heterogeneity and the +large scale of data owners involved. In this paper, we investigate the prospect +of Transformer-based FL models for achieving generalization and personalization +in this setting. We conduct extensive comparative experiments involving FL with +Transformers, ResNet, and personalized ResNet-based FL approaches under various +scenarios. These experiments consider varying numbers of data owners to +demonstrate Transformers' advantages over deep neural networks in large-scale +heterogeneous FL tasks. In addition, we analyze the superior performance of +Transformers by comparing the Centered Kernel Alignment (CKA) representation +similarity across different layers and FL models to gain insight into the +reasons behind their promising capabilities. + +
+
+
+
+
+ + ♻ ☆ Lipschitzness Effect of a Loss Function on Generalization Performance of + Deep Neural Networks Trained by Adam and AdamW Optimizers + + +
+ The generalization performance of deep neural networks with regard to the +optimization algorithm is one of the major concerns in machine learning. This +performance can be affected by various factors. In this paper, we theoretically +prove that the Lipschitz constant of a loss function is an important factor to +diminish the generalization error of the output model obtained by Adam or +AdamW. The results can be used as a guideline for choosing the loss function +when the optimization algorithm is Adam or AdamW. In addition, to evaluate the +theoretical bound in a practical setting, we choose the human age estimation +problem in computer vision. For assessing the generalization better, the +training and test datasets are drawn from different distributions. Our +experimental evaluation shows that the loss function with a lower Lipschitz +constant and maximum value improves the generalization of the model trained by +Adam or AdamW. + +
+
+ comment: Accepted to be published in AUT Journal of Mathematics and Computing + (AJMC, 2023) +
+
+
+
+
+ + ♻ ☆ CrowdGuard: Federated Backdoor Detection in Federated Learning NDSS + + +
+ Federated Learning (FL) is a promising approach enabling multiple clients to +train Deep Neural Networks (DNNs) collaboratively without sharing their local +training data. However, FL is susceptible to backdoor (or targeted poisoning) +attacks. These attacks are initiated by malicious clients who seek to +compromise the learning process by introducing specific behaviors into the +learned model that can be triggered by carefully crafted inputs. Existing FL +safeguards have various limitations: They are restricted to specific data +distributions or reduce the global model accuracy due to excluding benign +models or adding noise, are vulnerable to adaptive defense-aware adversaries, +or require the server to access local models, allowing data inference attacks. + This paper presents a novel defense mechanism, CrowdGuard, that effectively +mitigates backdoor attacks in FL and overcomes the deficiencies of existing +techniques. It leverages clients' feedback on individual models, analyzes the +behavior of neurons in hidden layers, and eliminates poisoned models through an +iterative pruning scheme. CrowdGuard employs a server-located stacked +clustering scheme to enhance its resilience to rogue client feedback. The +evaluation results demonstrate that CrowdGuard achieves a 100% +True-Positive-Rate and True-Negative-Rate across various scenarios, including +IID and non-IID data distributions. Additionally, CrowdGuard withstands +adaptive adversaries while preserving the original performance of protected +models. To ensure confidentiality, CrowdGuard uses a secure and +privacy-preserving architecture leveraging Trusted Execution Environments +(TEEs) on both client and server sides. + +
+
+ comment: To appear in the Network and Distributed System Security (NDSS) + Symposium 2024. Phillip Rieger and Torsten Krau{\ss} contributed equally to + this contribution. 19 pages, 8 figures, 5 tables, 4 algorithms, 5 equations +
+
+
+
+
+ + ♻ ☆ Calibrating and Improving Graph Contrastive Learning + + +
+ Graph contrastive learning algorithms have demonstrated remarkable success in +various applications such as node classification, link prediction, and graph +clustering. However, in unsupervised graph contrastive learning, some +contrastive pairs may contradict the truths in downstream tasks and thus the +decrease of losses on these pairs undesirably harms the performance in the +downstream tasks. To assess the discrepancy between the prediction and the +ground-truth in the downstream tasks for these contrastive pairs, we adapt the +expected calibration error (ECE) to graph contrastive learning. The analysis of +ECE motivates us to propose a novel regularization method, Contrast-Reg, to +ensure that decreasing the contrastive loss leads to better performance in the +downstream tasks. As a plug-in regularizer, Contrast-Reg effectively improves +the performance of existing graph contrastive learning algorithms. We provide +both theoretical and empirical results to demonstrate the effectiveness of +Contrast-Reg in enhancing the generalizability of the Graph Neural Network(GNN) +model and improving the performance of graph contrastive algorithms with +different similarity definitions and encoder backbones across various +downstream tasks. + +
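Expected calibration error as it is commonly computed for classifiers, which the work above adapts to contrastive pairs; a short sketch assuming per-sample confidences and correctness flags:

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    """Standard ECE: bin predictions by confidence and average the gap between
    mean confidence and empirical accuracy, weighted by bin size."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            ece += mask.mean() * abs(confidences[mask].mean() - correct[mask].mean())
    return ece

rng = np.random.default_rng(0)
conf = rng.uniform(0.5, 1.0, size=1000)
correct = (rng.random(1000) < conf * 0.8).astype(float)   # deliberately over-confident
print(expected_calibration_error(conf, correct))
```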
+
+
+
+
+ + ♻ ☆ Machine Learning for QoS Prediction in Vehicular Communication: + Challenges and Solution Approaches + + +
+ As cellular networks evolve towards the 6th generation, machine learning is +seen as a key enabling technology to improve the capabilities of the network. +Machine learning provides a methodology for predictive systems, which can make +networks become proactive. This proactive behavior of the network can be +leveraged to sustain, for example, a specific quality of service requirement. +With predictive quality of service, a wide variety of new use cases, both +safety- and entertainment-related, are emerging, especially in the automotive +sector. Therefore, in this work, we consider maximum throughput prediction +enhancing, for example, streaming or high-definition mapping applications. We +discuss the entire machine learning workflow highlighting less regarded aspects +such as the detailed sampling procedures, the in-depth analysis of the dataset +characteristics, the effects of splits in the provided results, and the data +availability. Reliable machine learning models need to face a lot of challenges +during their lifecycle. We highlight how confidence can be built on machine +learning technologies by better understanding the underlying characteristics of +the collected data. We discuss feature engineering and the effects of different +splits for the training processes, showcasing that random splits might +overestimate performance by more than twofold. Moreover, we investigate diverse +sets of input features, where network information proved to be most effective, +cutting the error by half. Part of our contribution is the validation of +multiple machine learning models within diverse scenarios. We also use +explainable AI to show that machine learning can learn underlying principles of +wireless networks without being explicitly programmed. Our data is collected +from a deployed network that was under full control of the measurement team and +covered different vehicular scenarios and radio environments. + +
+
+ comment: 18 pages, 12 Figures. Accepted on IEEE Access +
+
+
+
+
+ + ♻ ☆ Probable Domain Generalization via Quantile Risk Minimization NeurIPS 2022 + + +
+ Domain generalization (DG) seeks predictors which perform well on unseen test +distributions by leveraging data drawn from multiple related training +distributions or domains. To achieve this, DG is commonly formulated as an +average- or worst-case problem over the set of possible domains. However, +predictors that perform well on average lack robustness while predictors that +perform well in the worst case tend to be overly-conservative. To address this, +we propose a new probabilistic framework for DG where the goal is to learn +predictors that perform well with high probability. Our key idea is that +distribution shifts seen during training should inform us of probable shifts at +test time, which we realize by explicitly relating training and test domains as +draws from the same underlying meta-distribution. To achieve probable DG, we +propose a new optimization problem called Quantile Risk Minimization (QRM). By +minimizing the $\alpha$-quantile of predictor's risk distribution over domains, +QRM seeks predictors that perform well with probability $\alpha$. To solve QRM +in practice, we propose the Empirical QRM (EQRM) algorithm and provide: (i) a +generalization bound for EQRM; and (ii) the conditions under which EQRM +recovers the causal predictor as $\alpha \to 1$. In our experiments, we +introduce a more holistic quantile-focused evaluation protocol for DG and +demonstrate that EQRM outperforms state-of-the-art baselines on datasets from +WILDS and DomainBed. + +
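The core objective is simple to state: instead of the mean or worst case of the per-domain risks, minimize their alpha-quantile. A sketch of that empirical quantity (the EQRM algorithm and its risk-distribution estimate are in the paper):

```python
import numpy as np

def quantile_risk(per_domain_risks, alpha=0.9):
    """Empirical alpha-quantile of the risk distribution over training domains."""
    return float(np.quantile(np.asarray(per_domain_risks), alpha))

risks = [0.12, 0.15, 0.11, 0.45, 0.13]   # e.g. losses of one predictor on 5 domains
print(quantile_risk(risks, alpha=0.5))    # median risk (robust to the hard domain)
print(quantile_risk(risks, alpha=0.95))   # approaches the worst case as alpha -> 1
```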
+
+ comment: NeurIPS 2022 camera-ready (+ minor corrections) +
+
+
+
+
+ + ♻ ☆ FRAug: Tackling Federated Learning with Non-IID Features via + Representation Augmentation ICCV 2023 + + +
+ Federated Learning (FL) is a decentralized learning paradigm, in which
+multiple clients collaboratively train deep learning models without
+centralizing their local data, and hence preserve data privacy. Real-world
+applications usually involve a distribution shift across the datasets of the
+different clients, which hurts the generalization ability of the clients to
+unseen samples from their respective data distributions. In this work, we
+address the recently proposed feature shift problem where the clients have
+different feature distributions, while the label distribution is the same. We
+propose Federated Representation Augmentation (FRAug) to tackle this practical
+and challenging problem. Our approach generates synthetic client-specific
+samples in the embedding space to augment the usually small client datasets.
+For that, we train a shared generative model to fuse the clients' knowledge
+learned from their different feature distributions. This generator synthesizes
+client-agnostic embeddings, which are then locally transformed into
+client-specific embeddings by Representation Transformation Networks (RTNets).
+By transferring knowledge across the clients, the generated embeddings act as a
+regularizer for the client models and reduce overfitting to the local original
+datasets, hence improving generalization. Our empirical evaluation on public
+benchmarks and a real-world medical dataset demonstrates the effectiveness of
+the proposed method, which substantially outperforms the current
+state-of-the-art FL methods for non-IID features, including PartialFed and
+FedBN.
+
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Understanding Silent Failures in Medical Image Classification MICCAI 23 + + +
+ To ensure the reliable use of classification systems in medical applications, +it is crucial to prevent silent failures. This can be achieved by either +designing classifiers that are robust enough to avoid failures in the first +place, or by detecting remaining failures using confidence scoring functions +(CSFs). A predominant source of failures in image classification is +distribution shifts between training data and deployment data. To understand +the current state of silent failure prevention in medical imaging, we conduct +the first comprehensive analysis comparing various CSFs in four biomedical +tasks and a diverse range of distribution shifts. Based on the result that none +of the benchmarked CSFs can reliably prevent silent failures, we conclude that +a deeper understanding of the root causes of failures in the data is required. +To facilitate this, we introduce SF-Visuals, an interactive analysis tool that +uses latent space clustering to visualize shifts and failures. On the basis of +various examples, we demonstrate how this tool can help researchers gain +insight into the requirements for safe application of classification systems in +the medical domain. The open-source benchmark and tool are at: +https://github.com/IML-DKFZ/sf-visuals. + +
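A confidence scoring function in its simplest form (maximum softmax probability) and the resulting silent-failure rate at a fixed threshold; a hedged sketch, not the benchmark's implementation:

```python
import numpy as np

def max_softmax_csf(logits):
    """Confidence score per sample: the maximum softmax probability."""
    z = logits - logits.max(axis=1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    return probs.max(axis=1)

def silent_failure_rate(logits, labels, threshold=0.9):
    """A silent failure is a wrong prediction that the CSF does not flag."""
    preds = logits.argmax(axis=1)
    conf = max_softmax_csf(logits)
    wrong = preds != labels
    return float(np.mean(wrong & (conf >= threshold)))

rng = np.random.default_rng(0)
logits = rng.normal(size=(500, 4)) * 3.0
labels = rng.integers(0, 4, size=500)
print(silent_failure_rate(logits, labels))
```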
+
+ comment: Accepted at MICCAI 23 +
+
+
+
+
+ + ♻ ☆ Autonomous Payload Thermal Control + + +
+ In small satellites there is less room for heat control equipment, scientific +instruments, and electronic components. Furthermore, the near proximity of the +electronics makes power dissipation difficult, with the risk of not being able +to control the temperature appropriately, reducing component lifetime and +mission performance. To address this challenge, taking advantage of the advent +of increasing intelligence on board satellites, a deep reinforcement learning +based framework that uses Soft Actor-Critic algorithm is proposed for learning +the thermal control policy onboard. The framework is evaluated both in a naive +simulated environment and in a real space edge processing computer that will be +shipped in the future IMAGIN-e mission and hosted in the ISS. The experiment +results show that the proposed framework is able to learn to control the +payload processing power to maintain the temperature under operational ranges, +complementing traditional thermal control systems. + +
+
+
+
+
+ + ♻ ☆ Causality-Aided Trade-off Analysis for Machine Learning Fairness + + +
+ There has been an increasing interest in enhancing the fairness of machine
+learning (ML). Despite the growing number of fairness-improving methods, we
+lack a systematic understanding of the trade-offs among factors considered in
+the ML pipeline when fairness-improving methods are applied. This understanding
+is essential for developers to make informed decisions regarding the provision
+of fair ML services. Nonetheless, it is extremely difficult to analyze the
+trade-offs when there are multiple fairness parameters and other crucial
+metrics involved, coupled, and even in conflict with one another.
+  This paper uses causality analysis as a principled method for analyzing
+trade-offs between fairness parameters and other crucial metrics in ML
+pipelines. To practically and effectively conduct causality analysis, we
+propose a set of domain-specific optimizations to facilitate accurate causal
+discovery and a unified, novel interface for trade-off analysis based on
+well-established causal inference methods. We conduct a comprehensive empirical
+study using three real-world datasets on a collection of widely used
+fairness-improving techniques. Our study obtains actionable suggestions for
+users and developers of fair ML. We further demonstrate the versatile usage of
+our approach in selecting the optimal fairness-improving method, paving the way
+for more ethical and socially responsible AI technologies.
+
+
+
+
+
+
+ + ♻ ☆ Multi-Source Domain Adaptation through Dataset Dictionary Learning in + Wasserstein Space + + +
+ This paper seeks to solve Multi-Source Domain Adaptation (MSDA), which aims
+to mitigate data distribution shifts when transferring knowledge from multiple
+labeled source domains to an unlabeled target domain. We propose a novel MSDA
+framework based on dictionary learning and optimal transport. We interpret each
+domain in MSDA as an empirical distribution. As such, we express each domain as
+a Wasserstein barycenter of dictionary atoms, which are empirical
+distributions. We propose a novel algorithm, DaDiL, for learning via
+mini-batches: (i) atom distributions; (ii) a matrix of barycentric coordinates.
+Based on our dictionary, we propose two novel methods for MSDA: DaDiL-R, based
+on the reconstruction of labeled samples in the target domain, and DaDiL-E,
+based on the ensembling of classifiers learned on atom distributions. We
+evaluate our methods in 3 benchmarks: Caltech-Office, Office 31, and CWRU,
+where we improved the previous state of the art by 3.15%, 2.29%, and 7.71% in
+classification performance. Finally, we show that interpolations in the
+Wasserstein hull of learned atoms provide data that can generalize to the
+target domain.
+
+
+
+ comment: 13 pages,8 figures,Accepted as a conference paper at the 26th + European Conference on Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Variational Autoencoding Molecular Graphs with Denoising Diffusion + Probabilistic Model + + +
+ In data-driven drug discovery, designing molecular descriptors is a very +important task. Deep generative models such as variational autoencoders (VAEs) +offer a potential solution by designing descriptors as probabilistic latent +vectors derived from molecular structures. These models can be trained on large +datasets, which have only molecular structures, and applied to transfer +learning. Nevertheless, the approximate posterior distribution of the latent +vectors of the usual VAE assumes a simple multivariate Gaussian distribution +with zero covariance, which may limit the performance of representing the +latent features. To overcome this limitation, we propose a novel molecular deep +generative model that incorporates a hierarchical structure into the +probabilistic latent vectors. We achieve this by a denoising diffusion +probabilistic model (DDPM). We demonstrate that our model can design effective +molecular latent vectors for molecular property prediction from some +experiments by small datasets on physical properties and activity. The results +highlight the superior prediction performance and robustness of our model +compared to existing approaches. + +
+
+ comment: 2 pages. Short paper submitted to IEEE CIBCB 2023 +
+
+
+
+
+ + ♻ ☆ Rethinking Noisy Label Learning in Real-world Annotation Scenarios from + the Noise-type Perspective AAAI 2024 + + +
+ In this paper, we investigate the problem of learning with noisy labels in +real-world annotation scenarios, where noise can be categorized into two types: +factual noise and ambiguity noise. To better distinguish these noise types and +utilize their semantics, we propose a novel sample selection-based approach for +noisy label learning, called Proto-semi. Proto-semi initially divides all +samples into the confident and unconfident datasets via warm-up. By leveraging +the confident dataset, prototype vectors are constructed to capture class +characteristics. Subsequently, the distances between the unconfident samples +and the prototype vectors are calculated to facilitate noise classification. +Based on these distances, the labels are either corrected or retained, +resulting in the refinement of the confident and unconfident datasets. Finally, +we introduce a semi-supervised learning method to enhance training. Empirical +evaluations on a real-world annotated dataset substantiate the robustness of +Proto-semi in handling the problem of learning from noisy labels. Meanwhile, +the prototype-based repartitioning strategy is shown to be effective in +mitigating the adverse impact of label noise. Our code and data are available +at https://github.com/fuxiAIlab/ProtoSemi. + +
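A small sketch of the prototype step described above, under assumed array shapes (not the released code): build per-class prototypes from the confident subset and use distances to them to assess the unconfident samples.

```python
import numpy as np

def class_prototypes(features, labels, n_classes):
    """Mean feature vector per class, computed on the confident subset."""
    return np.stack([features[labels == c].mean(axis=0) for c in range(n_classes)])

def distance_to_prototypes(features, prototypes):
    """Euclidean distance of every (unconfident) sample to every prototype."""
    return np.linalg.norm(features[:, None, :] - prototypes[None, :, :], axis=-1)

rng = np.random.default_rng(0)
conf_feats = rng.normal(size=(200, 32)); conf_labels = rng.integers(0, 5, size=200)
unconf_feats = rng.normal(size=(50, 32))

protos = class_prototypes(conf_feats, conf_labels, n_classes=5)
dists = distance_to_prototypes(unconf_feats, protos)   # shape (50, 5)
closest = dists.argmin(axis=1)   # candidate corrected labels, before any thresholding
print(closest[:10])
```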
+
+ comment: Submitted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ DTAAD: Dual Tcn-Attention Networks for Anomaly Detection in Multivariate + Time Series Data + + +
+ Anomaly detection techniques enable effective anomaly detection and diagnosis
+in multi-variate time series data, which are of major significance for today's
+industrial applications. However, establishing an anomaly detection system that
+can be rapidly and accurately located is a challenging problem due to the lack
+of outlier tags, the high dimensional complexity of the data, memory
+bottlenecks in the actual hardware, and the need for fast reasoning. In this
+paper, we propose an anomaly detection and diagnosis model, DTAAD, based on
+Transformer and Dual Temporal Convolutional Network (TCN). Our overall model is
+an integrated design in which an autoregressive model (AR) is combined with
+autoencoder (AE) structures, and scaling methods and feedback mechanisms are
+introduced to improve prediction accuracy and expand correlation differences.
+The Dual TCN-Attention Network (DTA) we construct uses only a single
+Transformer encoder layer in our baseline experiment, making it an
+ultra-lightweight model. Our extensive experiments on six public datasets
+validate that DTAAD exceeds the most advanced current baseline methods in both
+detection and diagnostic performance. Specifically, DTAAD improves F1 scores by
+$8.38\%$ and reduces training time by $99\%$ compared to the baseline. The code
+and training scripts are publicly available on GitHub at
+https://github.com/Yu-Lingrui/DTAAD.
+
+
+
+
+
+
+ + ♻ ☆ Optimistic Online Mirror Descent for Bridging Stochastic and Adversarial + Online Convex Optimization ICML 2023 + + +
+ Stochastically Extended Adversarial (SEA) model is introduced by Sachs et al. +[2022] as an interpolation between stochastic and adversarial online convex +optimization. Under the smoothness condition, they demonstrate that the +expected regret of optimistic follow-the-regularized-leader (FTRL) depends on +the cumulative stochastic variance $\sigma_{1:T}^2$ and the cumulative +adversarial variation $\Sigma_{1:T}^2$ for convex functions. They also provide +a slightly weaker bound based on the maximal stochastic variance +$\sigma_{\max}^2$ and the maximal adversarial variation $\Sigma_{\max}^2$ for +strongly convex functions. Inspired by their work, we investigate the +theoretical guarantees of optimistic online mirror descent (OMD) for the SEA +model. For convex and smooth functions, we obtain the same +$\mathcal{O}(\sqrt{\sigma_{1:T}^2}+\sqrt{\Sigma_{1:T}^2})$ regret bound, +without the convexity requirement of individual functions. For strongly convex +and smooth functions, we establish an $\mathcal{O}((\sigma_{\max}^2 + +\Sigma_{\max}^2) \log (\sigma_{1:T}^2+\Sigma_{1:T}^2))$ bound, better than +their $\mathcal{O}((\sigma_{\max}^2 + \Sigma_{\max}^2) \log T)$ result. For +exp-concave and smooth functions, we achieve a new +$\mathcal{O}(d\log(\sigma_{1:T}^2+\Sigma_{1:T}^2))$ bound. Owing to the OMD +framework, we broaden our work to study dynamic regret minimization and +scenarios where the online functions are non-smooth. We establish the first +dynamic regret guarantee for the SEA model with convex and smooth functions, +which is more favorable than static regret bounds in non-stationary scenarios. +Furthermore, to deal with non-smooth and convex functions in the SEA model, we +propose novel algorithms building on optimistic OMD with an implicit update, +which provably attain static regret and dynamic regret guarantees without +smoothness conditions. + +
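In the Euclidean case, optimistic OMD reduces to a two-step gradient update that plays an iterate shifted by a hint (here, the previous gradient); a sketch of that special case only, not the paper's analysis or the SEA model:

```python
import numpy as np

def optimistic_omd(grad_fn, x0, T=100, eta=0.1):
    """Optimistic online gradient descent: keep a secondary iterate updated with
    the observed gradient, and play it shifted by a prediction of the next
    gradient (the last observed gradient, as in optimistic OMD/FTRL)."""
    x_half = np.array(x0, dtype=float)   # secondary ("lazy") iterate
    hint = np.zeros_like(x_half)
    plays = []
    for t in range(T):
        x_t = x_half - eta * hint        # optimistic step using the hint
        g_t = grad_fn(x_t, t)            # gradient revealed after playing x_t
        x_half = x_half - eta * g_t      # standard mirror-descent step
        hint = g_t                       # next round's gradient prediction
        plays.append(x_t)
    return np.array(plays)

# toy usage: slowly drifting quadratic losses f_t(x) = ||x - c_t||^2 / 2
plays = optimistic_omd(lambda x, t: x - np.array([np.sin(0.01 * t), 0.0]),
                       x0=[1.0, 1.0])
print(plays[-1])
```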
+
+ comment: conference version appeared at ICML 2023; this extended version + enriches the content with improved regret bounds for strongly convex + functions, discussions on the optimism design for dynamic regret + minimization, and extensions to non-smooth scenarios +
+
+
+
+
+ + ♻ ☆ Maximum Entropy Heterogeneous-Agent Mirror Learning + + +
+ Multi-agent reinforcement learning (MARL) has been shown effective for +cooperative games in recent years. However, existing state-of-the-art methods +face challenges related to sample inefficiency, brittleness regarding +hyperparameters, and the risk of converging to a suboptimal Nash Equilibrium. +To resolve these issues, in this paper, we propose a novel theoretical +framework, named Maximum Entropy Heterogeneous-Agent Mirror Learning (MEHAML), +that leverages the maximum entropy principle to design maximum entropy MARL +actor-critic algorithms. We prove that algorithms derived from the MEHAML +framework enjoy the desired properties of the monotonic improvement of the +joint maximum entropy objective and the convergence to quantal response +equilibrium (QRE). The practicality of MEHAML is demonstrated by developing a +MEHAML extension of the widely used RL algorithm, HASAC (for soft +actor-critic), which shows significant improvements in exploration and +robustness on three challenging benchmarks: Multi-Agent MuJoCo, StarCraftII, +and Google Research Football. Our results show that HASAC outperforms strong +baseline methods such as HATD3, HAPPO, QMIX, and MAPPO, thereby establishing +the new state of the art. See our project page at +https://sites.google.com/view/mehaml. + +
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information +from multiple views of a shared context. Recent works intuitively apply +contrastive learning to different views in a pairwise manner, which is still +scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works study the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the perspective of information theory and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided hierarchical Progressive +Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the +distribution between views to reduce view-specific noise. In the set-tier, IPMC +constructs self-adjusted contrasting pools, which are adaptively modified by a +view filter. Lastly, in the instance-tier, we adopt a designed unified loss to +learn representations and reduce the gradient interference. Theoretically and +empirically, we demonstrate the superiority of IPMC over state-of-the-art +methods. + +
+
+ comment: This paper is accepted by the journal Neural Networks (Elsevier)
+ in 2023. A revised manuscript of arXiv:2109.02344
+
+
+
+
+
+ + ♻ ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning captures comprehensive information from +multiple views of a shared context. Recent works intuitively apply contrastive +learning (CL) to learn representations, regarded as a pairwise manner, which is +still scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; and evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works research the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the information theoretical perspective and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided heuristic Progressive Multi-view +Coding (IPMC). In the distribution-tier, IPMC aligns the distribution between +views to reduce view-specific noise. In the set-tier, IPMC builds self-adjusted +pools for contrasting, which utilizes a view filter to adaptively modify the +pools. Lastly, in the instance-tier, we adopt a designed unified loss to learn +discriminative representations and reduce the gradient interference. +Theoretically and empirically, we demonstrate the superiority of IPMC over +state-of-the-art methods. + +
+
+ comment: We have uploaded a new version of this paper as arXiv:2308.10522, so
+ we are withdrawing this paper
+
+
+
+
+
+ + ♻ ☆ FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly + + +
+ While text-3D editing has made significant strides in leveraging score +distillation sampling, emerging approaches still fall short in delivering +separable, precise and consistent outcomes that are vital to content creation. +In response, we introduce FocalDreamer, a framework that merges base shape with +editable parts according to text prompts for fine-grained editing within +desired regions. Specifically, equipped with geometry union and dual-path +rendering, FocalDreamer assembles independent 3D parts into a complete object, +tailored for convenient instance reuse and part-wise control. We propose +geometric focal loss and style consistency regularization, which encourage +focal fusion and congruent overall appearance. Furthermore, FocalDreamer +generates high-fidelity geometry and PBR textures which are compatible with +widely-used graphics engines. Extensive experiments have highlighted the +superior editing capabilities of FocalDreamer in both quantitative and +qualitative evaluations. + +
+
+ comment: Project website: https://focaldreamer.github.io +
+
+
+
+
+ + ♻ ☆ PMET: Precise Model Editing in a Transformer + + +
+ Model editing techniques modify a minor proportion of knowledge in Large
+Language Models (LLMs) at a relatively low cost and have demonstrated notable
+success. Existing methods assume Transformer Layer (TL) hidden states are
+values of key-value memories of the Feed-Forward Network (FFN). They usually
+optimize the TL hidden states to memorize target knowledge and use it to update
+the weights of the FFN in LLMs. However, the information flow of TL hidden
+states comes from three parts: Multi-Head Self-Attention (MHSA), FFN, and
+residual connections. Existing methods neglect the fact that the TL hidden
+states contain information that is not specifically required for the FFN.
+Consequently, the performance of model editing decreases. To achieve more
+precise model editing, we analyze hidden states of MHSA and FFN, finding that
+MHSA encodes certain general knowledge extraction patterns. This implies that
+MHSA weights do not require updating when new knowledge is introduced. Based on
+the above findings, we introduce PMET, which simultaneously optimizes
+Transformer Component (TC, namely MHSA and FFN) hidden states, while only using
+the optimized TC hidden states of FFN to precisely update FFN weights. Our
+experiments demonstrate that PMET exhibits state-of-the-art performance on both
+the COUNTERFACT and zsRE datasets. Our ablation experiments substantiate the
+effectiveness of our enhancements, further reinforcing the finding that the
+MHSA encodes certain general knowledge extraction patterns and indicating its
+storage of a small amount of factual knowledge. Our code is available at
+https://github.com/xpq-tech/PMET.git.
+
+
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ♻ ☆ DatasetEquity: Are All Samples Created Equal? In The Quest For Equity + Within Datasets ICCV 2023 + + +
+ Data imbalance is a well-known issue in the field of machine learning, +attributable to the cost of data collection, the difficulty of labeling, and +the geographical distribution of the data. In computer vision, bias in data +distribution caused by image appearance remains highly unexplored. Compared to +categorical distributions using class labels, image appearance reveals complex +relationships between objects beyond what class labels provide. Clustering deep +perceptual features extracted from raw pixels gives a richer representation of +the data. This paper presents a novel method for addressing data imbalance in +machine learning. The method computes sample likelihoods based on image +appearance using deep perceptual embeddings and clustering. It then uses these +likelihoods to weigh samples differently during training with a proposed +$\textbf{Generalized Focal Loss}$ function. This loss can be easily integrated +with deep learning algorithms. Experiments validate the method's effectiveness +across autonomous driving vision datasets including KITTI and nuScenes. The +loss function improves state-of-the-art 3D object detection methods, achieving +over $200\%$ AP gains on under-represented classes (Cyclist) in the KITTI +dataset. The results demonstrate the method is generalizable, complements +existing techniques, and is particularly beneficial for smaller datasets and +rare classes. Code is available at: +https://github.com/towardsautonomy/DatasetEquity + +
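A crude sketch of the appearance-based weighting idea, with our own simplifications (a naive k-means in place of deep perceptual clustering, and a generic focal-style weight rather than the paper's Generalized Focal Loss):

```python
import numpy as np

def sample_likelihoods(embeddings, n_clusters=10, rng=None):
    """Crude likelihood estimate: cluster embeddings with a few Lloyd iterations
    and assign each sample the relative size of its cluster."""
    rng = rng or np.random.default_rng(0)
    centers = embeddings[rng.choice(len(embeddings), n_clusters, replace=False)]
    for _ in range(20):
        assign = np.linalg.norm(embeddings[:, None] - centers[None], axis=-1).argmin(1)
        for k in range(n_clusters):
            if (assign == k).any():
                centers[k] = embeddings[assign == k].mean(axis=0)
    counts = np.bincount(assign, minlength=n_clusters)
    return counts[assign] / len(embeddings)

def equity_weight(likelihood, gamma=2.0):
    """Focal-style weight: common-looking samples (high likelihood) are down-weighted."""
    return (1.0 - likelihood) ** gamma

rng = np.random.default_rng(0)
emb = np.concatenate([rng.normal(0, 1, (950, 16)), rng.normal(5, 1, (50, 16))])
w = equity_weight(sample_likelihoods(emb))
print(w[:950].mean(), w[950:].mean())   # the rare appearance mode receives larger weights
```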
+
+ comment: ICCV 2023 Workshop +
+
+
+
+
+ + ♻ ☆ Knowledge Transfer from High-Resource to Low-Resource Programming + Languages for Code LLMs + + +
+ Over the past few years, Large Language Models of Code (Code LLMs) have
+started to have a significant impact on programming practice. Code LLMs are
+also emerging as a building block for research in programming languages and
+software engineering. However, the quality of code produced by a Code LLM
+varies significantly across programming languages. Code LLMs produce impressive
+results on programming languages that are well represented in their training
+data (e.g., Java, Python, or JavaScript), but struggle with low-resource
+languages, like OCaml and Racket.
+  This paper presents an effective approach for boosting the performance of
+Code LLMs on low-resource languages using semi-synthetic data. Our approach
+generates high-quality datasets for low-resource languages, which can then be
+used to fine-tune any pretrained Code LLM. Our approach, called MultiPL-T,
+translates training data from high-resource languages into training data for
+low-resource languages. We apply our approach to generate tens of thousands of
+new, validated training items for Racket, OCaml, and Lua from Python. Moreover,
+we use an open dataset (The Stack) and model (StarCoderBase), which allow us to
+decontaminate benchmarks and train models on this data without violating the
+model license.
+  With MultiPL-T generated data, we present fine-tuned versions of
+StarCoderBase that achieve state-of-the-art performance for Racket, OCaml, and
+Lua on benchmark problems. For Lua, our fine-tuned model achieves the same
+performance as StarCoderBase does on Python -- a very high-resource language --
+on the MultiPL-E benchmarks. For Racket and OCaml, we double their performance
+on MultiPL-E, bringing their performance close to higher-resource languages
+such as Ruby and C#.
+
+
+
+
+
+
+ + ♻ ☆ Perceptual Grouping in Contrastive Vision-Language Models ICCV 2023 + + +
+ Recent advances in zero-shot image recognition suggest that vision-language +models learn generic visual representations with a high degree of semantic +information that may be arbitrarily probed with natural language phrases. +Understanding an image, however, is not just about understanding what content +resides within an image, but importantly, where that content resides. In this +work we examine how well vision-language models are able to understand where +objects reside within an image and group together visually related parts of the +imagery. We demonstrate how contemporary vision and language representation +learning models based on contrastive losses and large web-based data capture +limited object localization information. We propose a minimal set of +modifications that results in models that uniquely learn both semantic and +spatial information. We measure this performance in terms of zero-shot image +recognition, unsupervised bottom-up and top-down semantic segmentations, as +well as robustness analyses. We find that the resulting model achieves +state-of-the-art results in terms of unsupervised segmentation, and demonstrate +that the learned representations are uniquely robust to spurious correlations +in datasets designed to probe the causal behavior of vision models. + +
+
+ comment: Accepted and presented at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Majorana Demonstrator Data Release for AI/ML Applications + + +
+ The enclosed data release consists of a subset of the calibration data from +the Majorana Demonstrator experiment. Each Majorana event is accompanied by raw +Germanium detector waveforms, pulse shape discrimination cuts, and calibrated +final energies, all shared in an HDF5 file format along with relevant metadata. +This release is specifically designed to support the training and testing of +Artificial Intelligence (AI) and Machine Learning (ML) algorithms upon our +data. This document is structured as follows. Section I provides an overview of +the dataset's content and format; Section II outlines the location of this +dataset and the method for accessing it; Section III presents the NPML Machine +Learning Challenge associated with this dataset; Section IV contains a +disclaimer from the Majorana collaboration regarding the use of this dataset; +Appendix A contains technical details of this data release. Please direct +questions about the material provided within this release to liaobo77@ucsd.edu +(A. Li). + +
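For readers who want to inspect an HDF5 release like this, a minimal sketch with h5py is shown below; the file name and dataset keys are hypothetical placeholders, since the actual layout is specified in the release documentation itself.

```python
import h5py

# Hypothetical file name and dataset keys -- the real layout is given in the release itself.
with h5py.File("majorana_calibration.hdf5", "r") as f:
    f.visit(print)                         # list every group/dataset path in the file
    if "waveform" in f and "energy" in f:  # example access pattern, if such datasets exist
        waveforms = f["waveform"][:100]    # first 100 raw detector waveforms
        energies = f["energy"][:100]       # corresponding calibrated energies
        print(waveforms.shape, energies.shape)
```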
+
+ comment: Zenodo DOI: https://doi.org/10.5281/zenodo.8257027 +
+
+
+
+
+ + ♻ ☆ Sequence Learning Using Equilibrium Propagation IJCAI 2023 + + +
+ Equilibrium Propagation (EP) is a powerful and more bio-plausible alternative +to conventional learning frameworks such as backpropagation. The effectiveness +of EP stems from the fact that it relies only on local computations and +requires solely one kind of computational unit during both of its training +phases, thereby enabling greater applicability in domains such as bio-inspired +neuromorphic computing. The dynamics of the model in EP are governed by an +energy function and the internal states of the model consequently converge to a +steady state following the state transition rules defined by the same. However, +by definition, EP requires the input to the model (a convergent RNN) to be +static in both phases of training. Thus, it is not possible to design a +model for sequence classification using EP with an LSTM- or GRU-like +architecture. In this paper, we leverage recent developments in modern Hopfield +networks to further understand energy-based models and develop solutions for +complex sequence classification tasks using EP while satisfying its convergence +criteria and maintaining its theoretical similarities with recurrent +backpropagation. We explore the possibility of integrating modern Hopfield +networks as an attention mechanism with convergent RNN models used in EP, +thereby extending its applicability for the first time to two different +sequence classification tasks in natural language processing, viz. sentiment +analysis (IMDB dataset) and natural language inference (SNLI dataset). + 
+
+ comment: Accepted at IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Survey on Sociodemographic Bias in Natural Language Processing + + +
+ Deep neural networks often learn unintended bias during training, which might +have harmful effects when deployed in real-world settings. This work surveys +214 papers related to sociodemographic bias in natural language processing +(NLP). In this study, we aim to provide a more comprehensive understanding of +the similarities and differences among approaches to sociodemographic bias in +NLP. To better understand the distinction between bias and real-world harm, we +turn to ideas from psychology and behavioral economics to propose a definition +for sociodemographic bias. We identify three main categories of NLP bias +research: types of bias, quantifying bias, and debiasing techniques. We +highlight the current trends in quantifying bias and debiasing techniques, +offering insights into their strengths and weaknesses. We conclude that current +approaches on quantifying bias face reliability issues, that many of the bias +metrics do not relate to real-world bias, and that debiasing techniques need to +focus more on training methods. Finally, we provide recommendations for future +work. + +
+
+ comment: 23 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Truveta Mapper: A Zero-shot Ontology Alignment Framework + + +
+ In this paper, a new perspective is suggested for unsupervised Ontology +Matching (OM) or Ontology Alignment (OA) by treating it as a translation task. +Ontologies are represented as graphs, and the translation is performed from a +node in the source ontology graph to a path in the target ontology graph. The +proposed framework, Truveta Mapper (TM), leverages a multi-task +sequence-to-sequence transformer model to perform alignment across multiple +ontologies in a zero-shot, unified and end-to-end manner. Multi-tasking enables +the model to implicitly learn the relationship between different ontologies via +transfer-learning without requiring any explicit cross-ontology manually +labeled data. This also enables the formulated framework to outperform existing +solutions for both runtime latency and alignment quality. The model is +pre-trained and fine-tuned only on publicly available text corpus and +inner-ontologies data. The proposed solution outperforms state-of-the-art +approaches, Edit-Similarity, LogMap, AML, BERTMap, and the recently presented +new OM frameworks in Ontology Alignment Evaluation Initiative (OAEI22), offers +log-linear complexity, and overall makes the OM task efficient and more +straightforward without much post-processing involving mapping extension or +mapping repair. We are open sourcing our solution. + +
+
+
+
+
+ + ♻ ☆ Blockwise Parallel Transformer for Long Context Large Models + + +
+ Transformers have emerged as the cornerstone of state-of-the-art natural +language processing models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands posed by the +self-attention mechanism and the large feedforward network in Transformers +limit their ability to handle long sequences, thereby creating challenges for +tasks involving multiple long sequences or long-term dependencies. We present a +distinct approach, Blockwise Parallel Transformer (BPT), that leverages +blockwise computation of self-attention and feedforward network fusion to +minimize memory costs. By processing longer input sequences while maintaining +memory efficiency, BPT enables training sequences up to 32 times longer than +vanilla Transformers and 2 to 4 times longer than previous memory-efficient +methods. Extensive experiments on language modeling and reinforcement learning +tasks demonstrate the effectiveness of BPT in reducing memory requirements and +improving performance. + +
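A minimal sketch of the underlying idea, blockwise attention with a running softmax so the full attention matrix is never materialized, is given below. It ignores batching, causal masking, and the feedforward fusion that BPT additionally performs, so it illustrates the memory-saving principle rather than the paper's exact kernel.

```python
import torch

def blockwise_attention(q, k, v, block_size=256):
    """softmax(q k^T / sqrt(d)) v, computed one key/value block at a time.

    q, k, v have shape (seq_len, d). Only a (seq_len, block_size) slice of the
    attention matrix exists at any moment, instead of the full (seq_len, seq_len) matrix.
    """
    scale = q.shape[-1] ** -0.5
    out = torch.zeros_like(q)
    row_max = torch.full((q.shape[0], 1), float("-inf"))
    row_sum = torch.zeros(q.shape[0], 1)
    for start in range(0, k.shape[0], block_size):
        kb, vb = k[start:start + block_size], v[start:start + block_size]
        scores = (q @ kb.T) * scale                                   # (seq_len, block)
        new_max = torch.maximum(row_max, scores.max(-1, keepdim=True).values)
        correction = torch.exp(row_max - new_max)                     # rescale old accumulators
        probs = torch.exp(scores - new_max)
        out = out * correction + probs @ vb
        row_sum = row_sum * correction + probs.sum(-1, keepdim=True)
        row_max = new_max
    return out / row_sum

q = k = v = torch.randn(1024, 64)
reference = torch.softmax((q @ k.T) * 64 ** -0.5, dim=-1) @ v
assert torch.allclose(blockwise_attention(q, k, v), reference, atol=1e-4)
```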
+
+
+
+
+ + ♻ ☆ Accelerating Diffusion-based Combinatorial Optimization Solvers by + Progressive Distillation ICML 2023 + + +
+ Graph-based diffusion models have shown promising results in terms of +generating high-quality solutions to NP-complete (NPC) combinatorial +optimization (CO) problems. However, those models are often inefficient in +inference, due to the iterative evaluation nature of the denoising diffusion +process. This paper proposes to use progressive distillation to speed up the +inference by taking fewer steps (e.g., forecasting two steps ahead within a +single step) during the denoising process. Our experimental results show that +the progressively distilled model can perform inference 16 times faster with +only 0.019% degradation in performance on the TSP-50 dataset. + +
+
+ comment: Published at ICML 2023, Sampling and Optimization in Discrete Space + Workshop. The implementation is at + https://github.com/jwrh/Accelerating-Diffusion-based-Combinatorial-Optimization-Solvers-by-Progressive-Distillation +
+
+
+
+
+ + ♻ ☆ Neural Networks for Scalar Input and Functional Output + + +
+ The regression of a functional response on a set of scalar predictors can be +a challenging task, especially if there is a large number of predictors, or the +relationship between those predictors and the response is nonlinear. In this +work, we propose a solution to this problem: a feed-forward neural network (NN) +designed to predict a functional response using scalar inputs. First, we +transform the functional response to a finite-dimensional representation and +construct an NN that outputs this representation. Then, we propose to modify +the output of an NN via the objective function and introduce different +objective functions for network training. The proposed models are suited for +both regularly and irregularly spaced data, and a roughness penalty can be +further applied to control the smoothness of the predicted curve. The +difficulty in implementing both those features lies in the definition of +objective functions that can be back-propagated. In our experiments, we +demonstrate that our model outperforms the conventional function-on-scalar +regression model in multiple scenarios while computationally scaling better +with the dimension of the predictors. + +
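A minimal sketch of the basic recipe is shown below, assuming a Fourier basis for the finite-dimensional representation and a simple second-difference roughness penalty; the paper's choice of basis, objective functions, and penalty may differ.

```python
import torch
import torch.nn as nn

n_basis, grid = 7, torch.linspace(0, 1, 100)

def fourier_basis(t, k=n_basis):
    """Columns: 1, sin(2*pi*t), cos(2*pi*t), sin(4*pi*t), cos(4*pi*t), ..."""
    cols = [torch.ones_like(t)]
    for j in range(1, (k - 1) // 2 + 1):
        cols += [torch.sin(2 * torch.pi * j * t), torch.cos(2 * torch.pi * j * t)]
    return torch.stack(cols, dim=1)                      # (len(t), k)

B = fourier_basis(grid)                                  # basis evaluated once on the output grid
net = nn.Sequential(nn.Linear(5, 64), nn.ReLU(), nn.Linear(64, n_basis))

def loss_fn(x, y_curves, lam=1e-3):
    coeffs = net(x)                                      # scalar predictors -> basis coefficients
    y_hat = coeffs @ B.T                                 # (batch, len(grid)) predicted curves
    mse = ((y_hat - y_curves) ** 2).mean()
    rough = (torch.diff(y_hat, n=2, dim=1) ** 2).mean()  # discrete roughness penalty
    return mse + lam * rough

x, y = torch.randn(8, 5), torch.randn(8, 100)            # 5 scalar predictors, curves on 100 grid points
loss_fn(x, y).backward()
```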
+
+
+
+
+ + ♻ ☆ Data-driven abstractions via adaptive refinements and a Kantorovich + metric [extended version] + + +
+ We introduce an adaptive refinement procedure for smart, and scalable +abstraction of dynamical systems. Our technique relies on partitioning the +state space depending on the observation of future outputs. However, this +knowledge is dynamically constructed in an adaptive, asymmetric way. In order +to learn the optimal structure, we define a Kantorovich-inspired metric between +Markov chains, and we use it as a loss function. Our technique is prone to +data-driven frameworks, but not restricted to. + We also study properties of the above mentioned metric between Markov chains, +which we believe could be of application for wider purpose. We propose an +algorithm to approximate it, and we show that our method yields a much better +computational complexity than using classical linear programming techniques. + +
+
+ comment: This paper is an extended version of a CDC2023 submission +
+
+
+
+
+ + ♻ ☆ Compressed Sensing: A Discrete Optimization Approach + + +
+ We study the Compressed Sensing (CS) problem, which is the problem of finding +the most sparse vector that satisfies a set of linear measurements up to some +numerical tolerance. CS is a central problem in Statistics, Operations Research +and Machine Learning which arises in applications such as signal processing, +data compression and image reconstruction. We introduce an $\ell_2$ regularized +formulation of CS which we reformulate as a mixed integer second order cone +program. We derive a second order cone relaxation of this problem and show that +under mild conditions on the regularization parameter, the resulting relaxation +is equivalent to the well studied basis pursuit denoising problem. We present a +semidefinite relaxation that strengthens the second order cone relaxation and +develop a custom branch-and-bound algorithm that leverages our second order +cone relaxation to solve instances of CS to certifiable optimality. Our +numerical results show that our approach produces solutions that are on average +$6.22\%$ more sparse than solutions returned by state of the art benchmark +methods on synthetic data in minutes. On real world ECG data, for a given +$\ell_2$ reconstruction error our approach produces solutions that are on +average $9.95\%$ more sparse than benchmark methods, while for a given sparsity +level our approach produces solutions that have on average $10.77\%$ lower +reconstruction error than benchmark methods in minutes. + +
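One plausible reading of the regularized formulation described above, written out for concreteness (the exact constants and constraint form are the authors'), is:

```latex
% A plausible reading of the l2-regularized compressed sensing problem
% (the paper's exact constants and constraint form may differ):
\min_{x \in \mathbb{R}^n} \;\; \|x\|_0 + \frac{1}{\gamma}\,\|x\|_2^2
\qquad \text{subject to} \qquad \|Ax - b\|_2 \le \epsilon
```

Introducing binary support indicators $z_i \in \{0,1\}$ with $x_i = 0$ whenever $z_i = 0$, and modeling the ridge term through rotated second-order cones, is one standard route to a mixed-integer second-order cone program of this kind; relaxing the binaries then yields the convex relaxation discussed in the abstract.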
+
+
+
+
+ + ♻ ☆ U-TOE: Universal TinyML On-board Evaluation Toolkit for Low-Power IoT + + +
+ Results from the TinyML community demonstrate that it is possible to execute +machine learning models directly on the terminals themselves, even if these are +small microcontroller-based devices. However, to date, practitioners in the +domain lack convenient all-in-one toolkits to help them evaluate the +feasibility of executing arbitrary models on arbitrary low-power IoT hardware. +To this effect, we present in this paper U-TOE, a universal toolkit we designed +to facilitate the task of IoT designers and researchers, by combining +functionalities from a low-power embedded OS, a generic model transpiler and +compiler, an integrated performance measurement module, and an open-access +remote IoT testbed. We provide an open-source implementation of U-TOE and we +demonstrate its use to experimentally evaluate the performance of various +models on a wide variety of low-power IoT boards based on popular +microcontroller architectures. U-TOE allows easily reproducible and +customizable comparative evaluation experiments on a wide variety of IoT +hardware all at once. The availability of a toolkit such as U-TOE is desirable +to accelerate research combining Artificial Intelligence and IoT towards fully +exploiting the potential of edge computing. + 
+
+ comment: to be published in the proceedings of IFIP/IEEE PEMWN 2023 +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product + Summarization in E-commerce + + +
+ Given the long textual product information and the product image, Multi-Modal +Product Summarization (MMPS) aims to attract customers' interest and increase +their desire to purchase by highlighting product characteristics with a short +textual summary. Existing MMPS methods have achieved promising performance. +Nevertheless, there still exist several problems: 1) lack end-to-end product +summarization, 2) lack multi-grained multi-modal modeling, and 3) lack +multi-modal attribute modeling. To address these issues, we propose an +end-to-end multi-grained multi-modal attribute-aware product summarization +method (M3PS) for generating high-quality product summaries in e-commerce. M3PS +jointly models product attributes and generates product summaries. Meanwhile, +we design several multi-grained multi-modal tasks to better guide the +multi-modal learning of M3PS. Furthermore, we model product attributes based on +both text and image modalities so that multi-modal product characteristics can +be manifested in the generated summaries. Extensive experiments on a real +large-scale Chinese e-commence dataset demonstrate that our model outperforms +state-of-the-art product summarization methods w.r.t. several summarization +metrics. + +
+
+
+
+
+ + ☆ Music Understanding LLaMA: Advancing Text-to-Music Generation with + Question Answering and Captioning + + +
+ Text-to-music generation (T2M-Gen) faces a major obstacle due to the scarcity +of large-scale publicly available music datasets with natural language +captions. To address this, we propose the Music Understanding LLaMA (MU-LLaMA), +capable of answering music-related questions and generating captions for music +files. Our model utilizes audio representations from a pretrained MERT model to +extract music features. However, obtaining a suitable dataset for training the +MU-LLaMA model remains challenging, as existing publicly accessible audio +question answering datasets lack the necessary depth for open-ended music +question answering. To fill this gap, we present a methodology for generating +question-answer pairs from existing audio captioning datasets and introduce the +MusicQA Dataset designed for answering open-ended music-related questions. The +experiments demonstrate that the proposed MU-LLaMA model, trained on our +designed MusicQA dataset, achieves outstanding performance in both music +question answering and music caption generation across various metrics, +outperforming current state-of-the-art (SOTA) models in both fields and +offering a promising advancement in the T2M-Gen research field. + +
+
+
+
+
+ + ☆ MISSRec: Pre-training and Transferring Multi-modal Interest-aware + Sequence Representation for Recommendation ACM MM 2023 + + +
+ The goal of sequential recommendation (SR) is to predict the items a user is potentially +interested in based on her/his historical interaction sequences. Most +existing sequential recommenders are developed based on ID features, which, +despite their widespread use, often underperform with sparse IDs and struggle +with the cold-start problem. Besides, inconsistent ID mappings hinder the +model's transferability, isolating similar recommendation domains that could +have been co-optimized. This paper aims to address these issues by exploring +the potential of multi-modal information in learning robust and generalizable +sequence representations. We propose MISSRec, a multi-modal pre-training and +transfer learning framework for SR. On the user side, we design a +Transformer-based encoder-decoder model, where the contextual encoder learns to +capture the sequence-level multi-modal synergy while a novel interest-aware +decoder is developed to grasp item-modality-interest relations for better +sequence representation. On the candidate item side, we adopt a dynamic fusion +module to produce user-adaptive item representation, providing more precise +matching between users and items. We pre-train the model with contrastive +learning objectives and fine-tune it in an efficient manner. Extensive +experiments demonstrate the effectiveness and flexibility of MISSRec, promising +a practical solution for real-world recommendation scenarios. + 
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ☆ Hey That's Mine Imperceptible Watermarks are Preserved in Diffusion + Generated Outputs + + +
+ Generative models have seen an explosion in popularity with the release of +huge generative Diffusion models like Midjourney and Stable Diffusion to the +public. Because of this new ease of access, questions surrounding the automated +collection of data and issues regarding content ownership have started to +build. In this paper we present new work which aims to provide ways of +protecting content when shared to the public. We show that a generative +Diffusion model trained on data that has been imperceptibly watermarked will +generate new images with these watermarks present. We further show that if a +given watermark is correlated with a certain feature of the training data, the +generated images will also have this correlation. Using statistical tests we +show that we are able to determine whether a model has been trained on marked +data, and what data was marked. As a result our system offers a solution to +protect intellectual property when sharing content online. + +
+
+
+
+
+ + ☆ CLIP Multi-modal Hashing: A new baseline CLIPMH ICASSP2024 + + +
+ The multi-modal hashing method is widely used in multimedia retrieval. It can +fuse multi-source data to generate binary hash code. However, the current +multi-modal methods have the problem of low retrieval accuracy. The reason is +that the individual backbone networks have limited feature expression +capabilities and are not jointly pre-trained on large-scale unsupervised +multi-modal data. To solve this problem, we propose a new baseline CLIP +Multi-modal Hashing (CLIPMH) method. It uses CLIP model to extract text and +image features, and then fuse to generate hash code. CLIP improves the +expressiveness of each modal feature. In this way, it can greatly improve the +retrieval performance of multi-modal hashing methods. In comparison to +state-of-the-art unsupervised and supervised multi-modal hashing methods, +experiments reveal that the proposed CLIPMH can significantly enhance +performance (Maximum increase of 8.38%). CLIP also has great advantages over +the text and visual backbone networks commonly used before. + +
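A minimal sketch of the described pipeline is given below: precomputed CLIP text and image features are concatenated and passed through a small head that emits a relaxed hash code. The encoders are stand-ins (random tensors), and the fusion architecture, bit width, and training loss are assumptions rather than the paper's exact design.

```python
import torch
import torch.nn as nn

class ClipHashingHead(nn.Module):
    """Fuse precomputed CLIP text and image features into a k-bit binary hash code."""
    def __init__(self, text_dim=512, image_dim=512, n_bits=64):
        super().__init__()
        self.fuse = nn.Sequential(
            nn.Linear(text_dim + image_dim, 512), nn.ReLU(),
            nn.Linear(512, n_bits), nn.Tanh(),    # values in (-1, 1): a relaxed, trainable code
        )

    def forward(self, text_feat, image_feat):
        relaxed = self.fuse(torch.cat([text_feat, image_feat], dim=-1))
        return relaxed, torch.sign(relaxed)       # relaxed code for training, binary code for retrieval

# Random tensors stand in for features that would come from a CLIP model (e.g., via open_clip).
head = ClipHashingHead()
relaxed, binary = head(torch.randn(4, 512), torch.randn(4, 512))
print(binary.shape)  # torch.Size([4, 64])
```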
+
+ comment: submit to ICASSP2024 +
+
+
+
+
+ + ☆ VadCLIP: Adapting Vision-Language Models for Weakly Supervised Video + Anomaly Detection + + +
+ The recent contrastive language-image pre-training (CLIP) model has shown +great success in a wide range of image-level tasks, revealing remarkable +ability for learning powerful visual representations with rich semantics. An +open and worthwhile problem is efficiently adapting such a strong model to the +video domain and designing a robust video anomaly detector. In this work, we +propose VadCLIP, a new paradigm for weakly supervised video anomaly detection +(WSVAD) by leveraging the frozen CLIP model directly without any pre-training +and fine-tuning process. Unlike current works that directly feed extracted +features into the weakly supervised classifier for frame-level binary +classification, VadCLIP makes full use of fine-grained associations between +vision and language on the strength of CLIP and involves dual branch. One +branch simply utilizes visual features for coarse-grained binary +classification, while the other fully leverages the fine-grained language-image +alignment. With the benefit of dual branch, VadCLIP achieves both +coarse-grained and fine-grained video anomaly detection by transferring +pre-trained knowledge from CLIP to WSVAD task. We conduct extensive experiments +on two commonly-used benchmarks, demonstrating that VadCLIP achieves the best +performance on both coarse-grained and fine-grained WSVAD, surpassing the +state-of-the-art methods by a large margin. Specifically, VadCLIP achieves +84.51% AP and 88.02% AUC on XD-Violence and UCF-Crime, respectively. Code and +features will be released to facilitate future VAD research. + +
+
+ comment: Submitted +
+
+
+
+
+ + ♻ ☆ H4VDM: H.264 Video Device Matching + + +
+ Methods that can determine if two given video sequences are captured by the +same device (e.g., mobile telephone or digital camera) can be used in many +forensics tasks. In this paper we refer to this as "video device matching". In +open-set video forensics scenarios it is easier to determine if two video +sequences were captured with the same device than identifying the specific +device. In this paper, we propose a technique for open-set video device +matching. Given two H.264 compressed video sequences, our method can determine +if they are captured by the same device, even if our method has never +encountered the device in training. We denote our proposed technique as H.264 +Video Device Matching (H4VDM). H4VDM uses H.264 compression information +extracted from video sequences to make decisions. It is more robust against +artifacts that alter camera sensor fingerprints, and it can be used to analyze +relatively small fragments of the H.264 sequence. We trained and tested our +method on a publicly available video forensics dataset consisting of 35 +devices, where our proposed method demonstrated good performance. + +
+
+
+
+
+ + ♻ ☆ WMFormer++: Nested Transformer for Visible Watermark Removal via Implict + Joint Learning + + +
+ Watermarking serves as a widely adopted approach to safeguard media +copyright. In parallel, the research focus has extended to watermark removal +techniques, offering an adversarial means to enhance watermark robustness and +foster advancements in the watermarking field. Existing watermark removal +methods mainly rely on UNet with task-specific decoder branches--one for +watermark localization and the other for background image restoration. However, +watermark localization and background restoration are not isolated tasks; +precise watermark localization inherently implies regions necessitating +restoration, and the background restoration process contributes to more +accurate watermark localization. To holistically integrate information from +both branches, we introduce an implicit joint learning paradigm. This empowers +the network to autonomously navigate the flow of information between implicit +branches through a gate mechanism. Furthermore, we employ cross-channel +attention to facilitate local detail restoration and holistic structural +comprehension, while harnessing nested structures to integrate multi-scale +information. Extensive experiments are conducted on various challenging +benchmarks to validate the effectiveness of our proposed method. The results +demonstrate our approach's remarkable superiority, surpassing existing +state-of-the-art methods by a large margin. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 55 + +
+
+
+ + ☆ Giraffe: Adventures in Expanding Context Lengths in LLMs + + +
+ Modern large language models (LLMs) that rely on attention mechanisms are +typically trained with fixed context lengths which enforce upper limits on the +length of input sequences that they can handle at evaluation time. To use these +models on sequences longer than the train-time context length, one might employ +techniques from the growing family of context length extrapolation methods -- +most of which focus on modifying the system of positional encodings used in the +attention mechanism to indicate where tokens or activations are located in the +input sequence. We conduct a wide survey of existing methods of context length +extrapolation on a base LLaMA or LLaMA 2 model, and introduce some of our own +design as well -- in particular, a new truncation strategy for modifying the +basis for the position encoding. + We test these methods using three new evaluation tasks (FreeFormQA, +AlteredNumericQA, and LongChat-Lines) as well as perplexity, which we find to +be less fine-grained as a measure of long context performance of LLMs. We +release the three tasks publicly as datasets on HuggingFace. We discover that +linear scaling is the best method for extending context length, and show that +further gains can be achieved by using longer scales at evaluation time. We +also discover promising extrapolation capabilities in the truncated basis. To +support further research in this area, we release three new 13B parameter +long-context models which we call Giraffe: 4k and 16k context models trained +from base LLaMA-13B, and a 32k context model trained from base LLaMA2-13B. We +also release the code to replicate our results. + +
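As a hedged illustration of what linear positional scaling looks like in practice, the sketch below compresses rotary-embedding positions by a constant factor so that a longer evaluation context maps back into the trained positional range; the truncated-basis strategy and the exact implementation used in the paper are not reproduced here.

```python
import torch

def rotary_angles(positions, dim=64, base=10000.0, scale=1.0):
    """Rotary-embedding angles with linear position scaling.

    Using scale < 1 compresses positions, so a sequence longer than the training
    context maps back into the positional range seen during training
    (e.g., scale = 4096 / 16384 to run a 4k-trained model at 16k context).
    """
    inv_freq = base ** (-torch.arange(0, dim, 2).float() / dim)   # (dim // 2,)
    return torch.outer(positions.float() * scale, inv_freq)       # (seq_len, dim // 2)

angles_train = rotary_angles(torch.arange(4096))
angles_long = rotary_angles(torch.arange(16384), scale=4096 / 16384)
print(angles_train.max().item(), angles_long.max().item())  # comparable positional ranges
```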
+
+
+
+
+ + ☆ Analyzing Transformer Dynamics as Movement through Embedding Space + + +
+ Transformer language models exhibit intelligent behaviors such as +understanding natural language, recognizing patterns, acquiring knowledge, +reasoning, planning, reflecting and using tools. This paper explores how their +underlying mechanics give rise to intelligent behaviors. We adopt a systems +approach to analyze Transformers in detail and develop a mathematical framework +that frames their dynamics as movement through embedding space. This novel +perspective provides a principled way of thinking about the problem and reveals +important insights related to the emergence of intelligence: + 1. At its core the Transformer is a Embedding Space walker, mapping +intelligent behavior to trajectories in this vector space. + 2. At each step of the walk, it composes context into a single composite +vector whose location in Embedding Space defines the next step. + 3. No learning actually occurs during decoding; in-context learning and +generalization are simply the result of different contexts composing into +different vectors. + 4. Ultimately the knowledge, intelligence and skills exhibited by the model +are embodied in the organization of vectors in Embedding Space rather than in +specific neurons or layers. These abilities are properties of this +organization. + 5. Attention's contribution boils down to the association-bias it lends to +vector composition and which influences the aforementioned organization. +However, more investigation is needed to ascertain its significance. + 6. The entire model is composed from two principal operations: data +independent filtering and data dependent aggregation. This generalization +unifies Transformers with other sequence models and across modalities. + Building upon this foundation we formalize and test a semantic space theory +which posits that embedding vectors represent semantic concepts and find some +evidence of its validity. + +
+
+
+
+
+ + ☆ LatEval: An Interactive LLMs Evaluation Benchmark with Incomplete + Information from Lateral Thinking Puzzles + + +
+ With the continuous evolution and refinement of LLMs, they are endowed with +impressive logical reasoning or vertical thinking capabilities. But can they +think out of the box? Do they possess proficient lateral thinking abilities? +Following the setup of Lateral Thinking Puzzles, we propose a novel evaluation +benchmark, LatEval, which assesses the model's lateral thinking within an +interactive framework. In our benchmark, we challenge LLMs on two aspects: the +quality of questions posed by the model and the model's capability to integrate +information for problem-solving. We find that nearly all LLMs struggle with +employing lateral thinking during interactions. For example, even the most +advanced model, GPT-4, shows an advantage to some extent, yet still +maintains a noticeable gap compared to humans. This evaluation benchmark +provides LLMs with a highly challenging and distinctive task that is crucial to +an effective AI assistant. + 
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ AgentVerse: Facilitating Multi-Agent Collaboration and Exploring + Emergent Behaviors in Agents + + +
+ Autonomous agents empowered by Large Language Models (LLMs) have undergone +significant improvements, enabling them to generalize across a broad spectrum +of tasks. However, in real-world scenarios, cooperation among individuals is +often required to enhance the efficiency and effectiveness of task +accomplishment. Hence, inspired by human group dynamics, we propose a +multi-agent framework, AgentVerse, that can collaboratively and dynamically +adjust its composition as a greater-than-the-sum-of-its-parts system. Our +experiments demonstrate that the AgentVerse framework can effectively deploy +multi-agent groups that outperform a single agent. Furthermore, we delve into +the emergence of social behaviors among individual agents within a group during +collaborative task accomplishment. In view of these behaviors, we discuss some +possible strategies to leverage positive ones and mitigate negative ones for +improving the collaborative potential of multi-agent groups. Our codes for +AgentVerse will soon be released at +https://github.com/OpenBMB/AgentVerse. + 
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Instruction Tuning for Large Language Models: A Survey + + +
+ This paper surveys research works in the quickly advancing field of +instruction tuning (IT), a crucial technique to enhance the capabilities and +controllability of large language models (LLMs). Instruction tuning refers to +the process of further training LLMs on a dataset consisting of +(instruction, output) pairs in a supervised fashion, which bridges the +gap between the next-word prediction objective of LLMs and the users' objective +of having LLMs adhere to human instructions. In this work, we make a systematic +review of the literature, including the general methodology of IT, the +construction of IT datasets, the training of IT models, and applications to +different modalities and domains, along with an analysis of +aspects that influence the outcome of IT (e.g., generation of instruction +outputs, size of the instruction dataset, etc.). We also review the potential +pitfalls of IT and the criticism against it, point out +current deficiencies of existing strategies, and suggest some avenues for +fruitful research. + 
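A minimal sketch of how (instruction, output) pairs are commonly turned into supervised training examples, with the loss masked to the output tokens, is shown below; the prompt template and the toy tokenizer are illustrative assumptions rather than any specific recipe from the survey.

```python
from dataclasses import dataclass

@dataclass
class InstructionExample:
    instruction: str
    output: str

def build_training_example(ex, tokenize, ignore_index=-100):
    """Concatenate prompt and output; supervise only the output tokens."""
    prompt = f"### Instruction:\n{ex.instruction}\n\n### Response:\n"
    prompt_ids, output_ids = tokenize(prompt), tokenize(ex.output)
    input_ids = prompt_ids + output_ids
    labels = [ignore_index] * len(prompt_ids) + output_ids  # prompt tokens excluded from the loss
    return input_ids, labels

# A toy whitespace "tokenizer" stands in for a real subword tokenizer here.
toy_tokenize = lambda s: [hash(w) % 32000 for w in s.split()]
ids, labels = build_training_example(
    InstructionExample("Summarize the paragraph.", "It surveys instruction tuning."),
    toy_tokenize,
)
print(len(ids), labels.count(-100))  # total tokens vs. masked prompt tokens
```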
+
+ comment: A Survey paper, Pre-print +
+
+
+
+
+ + ☆ Zero- and Few-Shot Prompting with LLMs: A Comparative Study with + Fine-tuned Models for Bangla Sentiment Analysis + + +
+ The rapid expansion of the digital world has propelled sentiment analysis +into a critical tool across diverse sectors such as marketing, politics, +customer service, and healthcare. While there have been significant +advancements in sentiment analysis for widely spoken languages, low-resource +languages, such as Bangla, remain largely under-researched due to resource +constraints. Furthermore, the recent unprecedented performance of Large +Language Models (LLMs) in various applications highlights the need to evaluate +them in the context of low-resource languages. In this study, we present a +sizeable manually annotated dataset encompassing 33,605 Bangla news tweets and +Facebook comments. We also investigate zero- and few-shot in-context learning +with several language models, including Flan-T5, GPT-4, and Bloomz, offering a +comparative analysis against fine-tuned models. Our findings suggest that +monolingual transformer-based models consistently outperform other models, even +in zero- and few-shot scenarios. To foster continued exploration, we intend to +make this dataset and our research tools publicly available to the broader +research community. + 
+
+ comment: Zero-Shot Prompting, Few-Shot Prompting, LLMs, Comparative Study, + Fine-tuned Models, Bangla, Sentiment Analysis +
+
+
+
+
+ + ☆ DepreSym: A Depression Symptom Annotated Corpus and the Role of LLMs as + Assessors of Psychological Markers + + +
+ Computational methods for depression detection aim to mine traces of +depression from online publications posted by Internet users. However, +solutions trained on existing collections exhibit limited generalisation and +interpretability. To tackle these issues, recent studies have shown that +identifying depressive symptoms can lead to more robust models. The eRisk +initiative fosters research on this area and has recently proposed a new +ranking task focused on developing search methods to find sentences related to +depressive symptoms. This search challenge relies on the symptoms specified by +the Beck Depression Inventory-II (BDI-II), a questionnaire widely used in +clinical practice. Based on the participant systems' results, we present the +DepreSym dataset, consisting of 21580 sentences annotated according to their +relevance to the 21 BDI-II symptoms. The labelled sentences come from a pool of +diverse ranking methods, and the final dataset serves as a valuable resource +for advancing the development of models that incorporate depressive markers +such as clinical symptoms. Due to the complex nature of this relevance +annotation, we designed a robust assessment methodology carried out by three +expert assessors (including an expert psychologist). Additionally, we explore +here the feasibility of employing recent Large Language Models (ChatGPT and +GPT4) as potential assessors in this complex task. We undertake a comprehensive +examination of their performance, determine their main limitations and analyze +their role as a complement or replacement for human annotators. + +
+
+
+
+
+ + ☆ WanJuan: A Comprehensive Multimodal Dataset for Advancing English and + Chinese Large Models + + +
+ The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the +development of large models, leading to the creation of numerous impressive +large language models(LLMs) and multimodal large language models (MLLMs). These +cutting-edge models owe their remarkable performance to high-quality data. +However, the details of the training data used in leading paradigms are often +kept confidential. This lack of transparency, coupled with the scarcity of +open-source data, impedes further developments within the community. As a +response, this paper presents "Wan Juan", a large-scale multimodal dataset +composed of both Chinese and English data, collected from a wide range of web +sources. The dataset incorporates text, image-text, and video modalities, with +a total volume exceeding 2TB. It was utilized in the training of InternLM, a +model that demonstrated significant advantages in multi-dimensional evaluations +when compared to models of a similar scale. All data can be accessed at +https://opendatalab.org.cn/WanJuan1.0. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Systematic Offensive Stereotyping (SOS) Bias in Language Models + + +
+ Research has shown that language models (LMs) are socially biased. However, +toxicity and offensive stereotyping bias in LMs are understudied. In this +paper, we investigate the systematic offensive stereotype (SOS) bias in LMs. We +propose a method to measure it. Then, we validate the SOS bias and investigate +the effectiveness of debiasing methods from the literature on removing it. +Finally, we investigate the impact of the SOS bias in LMs on their performance +and their fairness on the task of hate speech detection. Our results suggest +that all the inspected LMs are SOS biased. The results suggest that the SOS +bias in LMs is reflective of the hate experienced online by the inspected +marginalized groups. The results indicate that removing the SOS bias in LMs, +using a popular debiasing method from the literature, leads to worse SOS bias +scores. Finally, our results show no strong evidence that the SOS bias in LMs +is impactful on their performance on hate speech detection. On the other hand, +there is evidence that the SOS bias in LMs is impactful on their fairness. + 
+
+ comment: Keywords: Systematic offensive stereotyping (SOS) bias, Language + models, bias removal, fairness, hate speech detection +
+
+
+
+
+ + ☆ LibriWASN: A Data Set for Meeting Separation, Diarization, and + Recognition with Asynchronous Recording Devices + + +
+ We present LibriWASN, a data set whose design follows closely the LibriCSS +meeting recognition data set, with the marked difference that the data is +recorded with devices that are randomly positioned on a meeting table and whose +sampling clocks are not synchronized. Nine different devices, five smartphones +with a single recording channel and four microphone arrays, are used to record +a total of 29 channels. Other than that, the data set follows closely the +LibriCSS design: the same LibriSpeech sentences are played back from eight +loudspeakers arranged around a meeting table and the data is organized in +subsets with different percentages of speech overlap. LibriWASN is meant as a +test set for clock synchronization algorithms, meeting separation, diarization +and transcription systems on ad-hoc wireless acoustic sensor networks. Due to +its similarity to LibriCSS, meeting transcription systems developed for the +former can readily be tested on LibriWASN. The data set is recorded in two +different rooms and is complemented with ground-truth diarization information +of who speaks when. + +
+
+ comment: Accepted for presentation at the ITG conference on Speech + Communication 2023 +
+
+
+
+
+ + ☆ RaLLe: A Framework for Developing and Evaluating Retrieval-Augmented + Large Language Models + + +
+ Retrieval-augmented large language models (R-LLMs) combine pre-trained large +language models (LLMs) with information retrieval systems to improve the +accuracy of factual question-answering. However, current libraries for building +R-LLMs provide high-level abstractions without sufficient transparency for +evaluating and optimizing prompts within specific inference processes such as +retrieval and generation. To address this gap, we present RaLLe, an open-source +framework designed to facilitate the development, evaluation, and optimization +of R-LLMs for knowledge-intensive tasks. With RaLLe, developers can easily +develop and evaluate R-LLMs, improving hand-crafted prompts, assessing +individual inference processes, and objectively measuring overall system +performance quantitatively. By leveraging these features, developers can +enhance the performance and accuracy of their R-LLMs in knowledge-intensive +generation tasks. We open-source our code at https://github.com/yhoshi3/RaLLe. + +
+
+ comment: 18 pages, 2 figures, see https://youtu.be/JYbm75qnfTg for the + demonstration screencast +
+
+
+
+
+ + ☆ BAN-PL: a Novel Polish Dataset of Banned Harmful and Offensive Content + from Wykop.pl web service + + +
+ Advances in automated detection of offensive language online, including hate +speech and cyberbullying, require improved access to publicly available +datasets comprising social media content. In this paper, we introduce BAN-PL, +the first open dataset in the Polish language that encompasses texts flagged as +harmful and subsequently removed by professional moderators. The dataset +encompasses a total of 691,662 pieces of content from a popular social +networking service, Wykop.pl, often referred to as the "Polish Reddit", +including both posts and comments, and is evenly distributed into two distinct +classes: "harmful" and "neutral". We provide a comprehensive description of the +data collection and preprocessing procedures, as well as highlight the +linguistic specificity of the data. The BAN-PL dataset, along with advanced +preprocessing scripts for, i.a., unmasking profanities, will be publicly +available. + +
+
+
+
+
+ + ☆ Age Recommendation from Texts and Sentences for Children + + +
+ Children have less text understanding capability than adults. Moreover, this +capability differs among children of different ages. Hence, automatically +predicting a recommended age based on texts or sentences would be of great +benefit for proposing adequate texts to children and for helping authors write in +the most appropriate way. This paper presents our recent advances on the age +recommendation task. We consider age recommendation as a regression task, and +discuss the need for appropriate evaluation metrics, study the use of +state-of-the-art machine learning models, namely Transformers, and compare them to +different models coming from the literature. Our results are also compared with +recommendations made by experts. Further, this paper deals with preliminary +explainability of the age prediction model by analyzing various linguistic +features. We conduct the experiments on a dataset of 3,673 French texts (132K +sentences, 2.5M words). To recommend age at the text level and sentence level, +our best models achieve MAE scores of 0.98 and 1.83, respectively, on the test +set. Also, compared to the recommendations made by experts, our sentence-level +recommendation model gets a similar score to the experts, while the text-level +recommendation model outperforms the experts by an MAE score of 1.48. + 
+
+ comment: 26 pages (incl. 4 pages for appendices), 4 figures, 20 tables +
+
+
+
+
+ + ☆ Exploring Equation as a Better Intermediate Meaning Representation for + Numerical Reasoning + + +
+ Numerical reasoning is vital for natural language processing models to +understand and process numerical information in real-world scenarios. Most +current methods first generate the Intermediate Meaning Representations (IMRs) +of questions and then generate answers. Current SOTA methods generate programs +as IMRs with large language models (LLMs). Intuitively, equations have fewer +restrictions and closer semantics to the question than programs, leading to +higher generation accuracy. However, current LLMs generate equations worse than +programs, which we assume is because equation data is rarer in pre-training data +than programs. So in this paper, we try to use equations as IMRs to +solve the numerical reasoning task by addressing two problems: (1) +Theoretically, how to prove that the equation is an IMR with higher generation +accuracy than programs; (2) Empirically, how to improve the generation accuracy +of equations with LLMs. For the first problem, we propose and prove a +proposition to theoretically compare the generation accuracy of different IMRs. +For the second problem, we present a method called Boosting Numerical +Reasoning by Decomposing the Generation of Equations (Bridge), which can +improve the accuracy of LLMs in generating equations as IMRs by reducing the +tendency of generating constant expressions and programs. Our method improves +the performance by 2.2%, 0.9%, and 1.7% on the GSM8K, SVAMP, and Algebra datasets +compared to the previous state-of-the-art methods under the single reasoning +path setting. Our codes and prompts are released at +https://github.com/zirui-HIT/Bridge_for_Numerical_Reasoning. + 
+
+
+
+
+ + ☆ Weakly synchronous systems with three machines are Turing powerful + + +
+ Communicating finite-state machines (CFMs) are a Turing powerful model of +asynchronous message-passing distributed systems. In weakly synchronous +systems, processes communicate through phases in which messages are first sent +and then received, for each process. Such systems enjoy a limited form of +synchronization, and for some communication models, this restriction is enough +to make the reachability problem decidable. In particular, we explore the +intriguing case of p2p (FIFO) communication, for which the reachability problem +is known to be undecidable for four processes, but decidable for two. We show +that the configuration reachability problem for weakly synchronous systems of +three processes is undecidable. This result is heavily inspired by our study on +the treewidth of the Message Sequence Charts (MSCs) that might be generated by +such systems. In this sense, the main contribution of this work is a weakly +synchronous system with three processes that generates MSCs of arbitrarily +large treewidth. + +
+
+
+
+
+ + ☆ Software Entity Recognition with Noise-Robust Learning + + +
+ Recognizing software entities such as library names from free-form text is +essential to enable many software engineering (SE) technologies, such as +traceability link recovery, automated documentation, and API recommendation. +While many approaches have been proposed to address this problem, they suffer +from small entity vocabularies or noisy training data, hindering their ability +to recognize software entities mentioned in sophisticated narratives. To +address this challenge, we leverage the Wikipedia taxonomy to develop a +comprehensive entity lexicon with 79K unique software entities in 12 +fine-grained types, as well as a large labeled dataset of over 1.7M sentences. +Then, we propose self-regularization, a noise-robust learning approach, to the +training of our software entity recognition (SER) model by accounting for many +dropouts. Results show that models trained with self-regularization outperform +both their vanilla counterparts and state-of-the-art approaches on our +Wikipedia benchmark and two Stack Overflow benchmarks. We release our models, +data, and code for future research. + +
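The abstract does not spell out the training objective, so the following is a hedged sketch of one common dropout-based consistency regularizer (two stochastic forward passes with a symmetric KL term) that matches the "accounting for many dropouts" description; the paper's actual self-regularization may differ.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def consistency_regularized_loss(model, inputs, labels, alpha=1.0):
    """Cross-entropy plus a symmetric-KL consistency term between two dropout passes."""
    logits1, logits2 = model(inputs), model(inputs)  # dropout makes the two passes stochastic
    ce = 0.5 * (F.cross_entropy(logits1, labels) + F.cross_entropy(logits2, labels))
    p, q = F.log_softmax(logits1, dim=-1), F.log_softmax(logits2, dim=-1)
    kl = 0.5 * (F.kl_div(p, q, log_target=True, reduction="batchmean")
                + F.kl_div(q, p, log_target=True, reduction="batchmean"))
    return ce + alpha * kl

# Toy classifier standing in for a token-level software entity recognition model.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Dropout(0.3), nn.Linear(32, 5))
loss = consistency_regularized_loss(model, torch.randn(8, 16), torch.randint(0, 5, (8,)))
loss.backward()
```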
+
+ comment: ASE 2023 +
+
+
+
+
+ + ☆ SeqGPT: An Out-of-the-box Large Language Model for Open Domain Sequence + Understanding + + +
+ Large language models (LLMs) have shown impressive ability for open-domain +NLP tasks. However, LLMs are sometimes too footloose for natural language +understanding (NLU) tasks which always have restricted output and input format. +Their performances on NLU tasks are highly related to prompts or demonstrations +and are shown to be poor at performing several representative NLU tasks, such +as event extraction and entity typing. To this end, we present SeqGPT, a +bilingual (i.e., English and Chinese) open-source autoregressive model +specially enhanced for open-domain natural language understanding. We express +all NLU tasks with two atomic tasks, which define fixed instructions to +restrict the input and output format but still ``open'' for arbitrarily varied +label sets. The model is first instruction-tuned with extremely fine-grained +labeled data synthesized by ChatGPT and then further fine-tuned by 233 +different atomic tasks from 152 datasets across various domains. The +experimental results show that SeqGPT has decent classification and extraction +ability, and is capable of performing language understanding tasks on unseen +domains. We also conduct empirical studies on the scaling of data and model +size as well as on the transfer across tasks. Our model is accessible at +https://github.com/Alibaba-NLP/SeqGPT. + +
+
+ comment: Initial version of SeqGPT +
+
+
+
+
+ + ☆ An Examination of the Compositionality of Large Generative + Vision-Language Models + + +
+ With the success of Large Language Models (LLMs), a surge of Generative +Vision-Language Models (GVLMs) have been constructed via multimodal instruction +tuning. The tuning recipe substantially deviates from the common contrastive +vision-language learning. However, the performance of GVLMs in multimodal +compositional reasoning remains largely unexplored, as existing evaluation +metrics and benchmarks focus predominantly on assessing contrastive models like +CLIP. In this paper, we examine the potential evaluation metrics to assess the +GVLMs and hypothesize generative score methods are suitable for evaluating +compositionality. In addition, current benchmarks tend to prioritize syntactic +correctness over semantics. The presence of morphological bias in these +benchmarks can be exploited by GVLMs, leading to ineffective evaluations. To +combat this, we define a MorphoBias Score to quantify the morphological bias +and propose a novel LLM-based strategy to calibrate the bias. Moreover, a +challenging task is added to evaluate the robustness of GVLMs against inherent +inclination toward syntactic correctness. We include the calibrated dataset and +the task into a new benchmark, namely MOrphologicall De-biased Benchmark +(MODE). Our study provides the first unbiased benchmark for the +compositionality of GVLMs, facilitating future research in this direction. We +will release our code and datasets. + +
+
+
+
+
+ + ☆ GradientCoin: A Peer-to-Peer Decentralized Large Language Models + + +
+ Since 2008, after the proposal of a Bitcoin electronic cash system, Bitcoin +has fundamentally changed the economic system over the last decade. Since 2022, +large language models (LLMs) such as GPT have outperformed humans in many +real-life tasks. However, these large language models have several practical +issues. For example, the model is centralized and controlled by a specific +unit. One weakness is that if that unit decides to shut down the model, it +cannot be used anymore. The second weakness is the lack of guaranteed +discrepancy behind this model, as certain dishonest units may design their own +models and feed them unhealthy training data. + In this work, we propose a purely theoretical design of a decentralized LLM +that operates similarly to a Bitcoin cash system. However, implementing such a +system might encounter various practical difficulties. Furthermore, this new +system is unlikely to perform better than the standard Bitcoin system in +economics. Therefore, the motivation for designing such a system is limited. It +is likely that only two types of people would be interested in setting up a +practical system for it: + $\bullet$ Those who prefer to use a decentralized ChatGPT-like software. + $\bullet$ Those who believe that the purpose of carbon-based life is to +create silicon-based life, such as Optimus Prime in Transformers. + The reason the second type of people may be interested is that it is possible +that one day an AI system like this will awaken and become the next level of +intelligence on this planet. + +
+
+
+
+
+ + ☆ An Effective Method using Phrase Mechanism in Neural Machine Translation + + +
+ Machine Translation is one of the essential tasks in Natural Language +Processing (NLP), which has massive applications in real life as well as +contributing to other tasks in the NLP research community. Recently, +Transformer-based methods have attracted numerous researchers in this domain +and achieved state-of-the-art results for most language pairs. In this +paper, we report an effective method using a phrase mechanism, +PhraseTransformer, to improve the strong baseline model Transformer in +constructing a Neural Machine Translation (NMT) system for the Vietnamese-Chinese +parallel corpus. Our experiments on the MT dataset of the VLSP 2022 +competition achieved a BLEU score of 35.3 on Vietnamese-to-Chinese and a 33.2 +BLEU score on Chinese-to-Vietnamese data. Our code is available at +https://github.com/phuongnm94/PhraseTransformer. + 
+
+
+
+
+ + ☆ Implicit Self-supervised Language Representation for Spoken Language + Diarization SP + + +
+ In a code-switched (CS) scenario, the use of spoken language diarization (LD) +as a pre-processing system is essential. Further, the use of implicit +frameworks is preferable to the explicit framework, as it can be easily +adapted to deal with low/zero resource languages. Inspired by the speaker +diarization (SD) literature, three frameworks based on (1) fixed segmentation, +(2) change point-based segmentation and (3) E2E are proposed to perform LD. The +initial exploration with the synthetic TTSF-LD dataset shows that using x-vectors as the +implicit language representation with an appropriate analysis window length ($N$) +can achieve performance on par with explicit LD. The best implicit LD +performance of $6.38$ in terms of Jaccard error rate (JER) is achieved by using +the E2E framework. However, with the E2E framework the performance of +implicit LD degrades to $60.4$ when using the practical Microsoft CS (MSCS) +dataset. The difference in performance is mostly due to the distributional +difference between the monolingual segment durations of the secondary language in +the MSCS and TTSF-LD datasets. Moreover, to avoid segment smoothing, the +smaller duration of the monolingual segments suggests the use of a small value +of $N$. At the same time, with small $N$, the x-vector representation is unable +to capture the required language discrimination due to the acoustic similarity, +as the same speaker is speaking both languages. Therefore, to resolve the issue, +a self-supervised implicit language representation is proposed in this study. +In comparison with the x-vector representation, the proposed representation +provides a relative improvement of $63.9\%$, achieving a JER of $21.8$ with +the E2E framework. + 
+
+ comment: Planning to Submit in IEEE-JSTSP +
+
+
+
+
+ + ☆ Unsupervised Dialogue Topic Segmentation in Hyperdimensional Space + + +
+ We present HyperSeg, a hyperdimensional computing (HDC) approach to +unsupervised dialogue topic segmentation. HDC is a class of vector symbolic +architectures that leverages the probabilistic orthogonality of randomly drawn +vectors at extremely high dimensions (typically over 10,000). HDC generates +rich token representations through its low-cost initialization of many +unrelated vectors. This is especially beneficial in topic segmentation, which +often operates as a resource-constrained pre-processing step for downstream +transcript understanding tasks. HyperSeg outperforms the current +state-of-the-art in 4 out of 5 segmentation benchmarks -- even when baselines +are given partial access to the ground truth -- and is 10 times faster on +average. We show that HyperSeg also improves downstream summarization accuracy. +With HyperSeg, we demonstrate the viability of HDC in a major language task. We +open-source HyperSeg to provide a strong baseline for unsupervised topic +segmentation. + +
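A minimal sketch of the hyperdimensional ingredients is shown below: random bipolar hypervectors for tokens, bundling into utterance vectors, and a similarity drop between adjacent utterances as a crude boundary signal. HyperSeg's actual composition operators and decision rule are more involved; the dimension and the scoring used here are illustrative assumptions.

```python
import numpy as np

rng = np.random.default_rng(0)
DIM = 10_000
token_vectors = {}  # each word gets one fixed random bipolar hypervector

def hv(word):
    if word not in token_vectors:
        token_vectors[word] = rng.choice([-1, 1], size=DIM)
    return token_vectors[word]

def encode(utterance):
    """Bundle (sum) token hypervectors into a single utterance representation."""
    return np.sum([hv(w) for w in utterance.lower().split()], axis=0)

def adjacent_similarities(utterances):
    """Low cosine similarity between neighbours is a crude topic-boundary signal."""
    vecs = [encode(u) for u in utterances]
    return [float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
            for a, b in zip(vecs, vecs[1:])]

dialogue = ["let us plan the trip to rome", "we should book the flights to rome",
            "now about the quarterly budget", "the budget report is due on friday"]
print(adjacent_similarities(dialogue))  # expect a dip at the topic switch
```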
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ☆ Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation + with Large Language Models + + +
+ Large Language Models (LLMs) possess impressive capabilities to generate +meaningful code snippets given natural language intents in a zero-shot manner, i.e., +without the need for specific fine-tuning. With a view to unleashing +their full potential, prior work has demonstrated the benefits of fine-tuning +the models on task-specific data. However, the fine-tuning process demands heavy +computational costs and is intractable when resources are scarce, especially +for models with billions of parameters. In light of these challenges, previous +studies explored In-Context Learning (ICL) as an effective strategy to generate +contextually appropriate code without fine-tuning. However, it operates at +inference time and does not involve learning task-specific parameters, +potentially limiting the model's performance on downstream tasks. In this +context, we foresee that Parameter-Efficient Fine-Tuning (PEFT) techniques +carry a high potential for efficiently specializing LLMs to task-specific data. +In this paper, we deliver a comprehensive study of the impact of PEFT +techniques on LLMs in the automated code generation scenario. Our experimental +results reveal the superiority and potential of such techniques over ICL on a +wide range of LLMs in reducing the computational burden and improving +performance. Therefore, the study opens opportunities for broader applications +of PEFT in software engineering scenarios. + 
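As one concrete example of the PEFT family discussed here, the sketch below adds LoRA-style low-rank adapters to a frozen linear layer; the rank, scaling, and initialization are illustrative assumptions and not the configuration studied in the paper.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base linear layer plus a trainable low-rank update: W x + (alpha/r) B A x."""
    def __init__(self, base: nn.Linear, r=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)                                # only the adapters are trained
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))   # zero init: no change at start
        self.scaling = alpha / r

    def forward(self, x):
        return self.base(x) + self.scaling * (x @ self.A.T @ self.B.T)

layer = LoRALinear(nn.Linear(512, 512))
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(trainable)  # 8192 trainable adapter parameters vs. ~262k in the frozen base layer
```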
+
+ comment: 10+2 pages +
+
+
+
+
+ + ☆ Comparing Measures of Linguistic Diversity Across Social Media Language + Data and Census Data at Subnational Geographic Areas + + +
+ This paper describes a preliminary study on the comparative linguistic
+ecology of online spaces (i.e., social media language data) and real-world
+spaces in Aotearoa New Zealand (i.e., subnational administrative areas). We
+compare measures of linguistic diversity between these different spaces and
+discuss how social media users align with real-world populations. The results
+from the current study suggest that there is potential to use online social
+media language data to observe spatial and temporal changes in linguistic
+diversity in subnational geographic areas; however, further work is required to
+understand how well social media represents real-world behaviour.
+
+
+
+
+
+ + ☆ Dynamic Strategy Chain: Dynamic Zero-Shot CoT for Long Mental Health + Support Generation + + +
+ Long counseling Text Generation for Mental health support (LTGM), an
+innovative and challenging task, aims to provide help-seekers with mental
+health support through a comprehensive and more acceptable response. The
+combination of chain-of-thought (CoT) prompting and Large Language Models
+(LLMs) has been employed to achieve SOTA performance on various NLP tasks,
+especially text generation tasks. Zero-shot CoT prompting is one of the most
+common methods in CoT prompting. However, in the LTGM task, zero-shot CoT
+prompting cannot simulate a counselor or provide personalized strategies
+without effective mental health counseling strategy prompts. To tackle this
+challenge, we propose a zero-shot Dynamic Strategy Chain (DSC) prompting
+method. Firstly, we utilize GPT-2 to learn the responses written by mental
+health counselors and dynamically generate mental health counseling strategies
+tailored to the help-seekers' needs. Secondly, the zero-shot DSC prompt is
+constructed from the mental health counseling strategies and the help-seeker's
+post. Finally, the zero-shot DSC prompt is employed to guide LLMs in generating
+more human-like responses for the help-seekers. Both automatic and manual
+evaluations demonstrate that zero-shot DSC prompting can deliver more
+human-like responses than CoT prompting methods on LTGM tasks.
+
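+ A minimal sketch of the prompt-assembly step described above; the wording and
+the example strategy strings are hypothetical, since the abstract only outlines
+the idea of conditioning a zero-shot prompt on generated counseling strategies:
+```python
+def build_dsc_prompt(post, strategies):
+    """Assemble a Dynamic-Strategy-Chain-style zero-shot prompt.
+    `strategies` would come from a generator model trained on counselor
+    responses; here they are passed in as plain strings."""
+    strategy_block = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(strategies))
+    return (
+        "You are a mental health counselor. A help-seeker wrote:\n"
+        f"{post}\n\n"
+        "Respond following these counseling strategies, in order:\n"
+        f"{strategy_block}\n\n"
+        "Write a long, supportive, human-like reply."
+    )
+
+print(build_dsc_prompt(
+    "I feel overwhelmed at work and can't sleep.",
+    ["Validate the feeling", "Explore the stressor", "Suggest a small next step"],
+))
+```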
+
+
+
+
+ + ☆ Using Large Language Models for Cybersecurity Capture-The-Flag + Challenges and Certification Questions + + +
+ The assessment of cybersecurity Capture-The-Flag (CTF) exercises involves
+participants finding text strings or ``flags'' by exploiting system
+vulnerabilities. Large Language Models (LLMs) are natural-language models
+trained on vast amounts of words to understand and generate text; they can
+perform well on many CTF challenges. Such LLMs are freely available to
+students. In the context of CTF exercises in the classroom, this raises
+concerns about academic integrity. Educators must understand LLMs' capabilities
+to modify their teaching to accommodate generative AI assistance. This research
+investigates the effectiveness of LLMs, particularly in the realm of CTF
+challenges and questions. Here we evaluate three popular LLMs, OpenAI ChatGPT,
+Google Bard, and Microsoft Bing. First, we assess the LLMs' question-answering
+performance on five Cisco certifications with varying difficulty levels. Next,
+we qualitatively study the LLMs' abilities in solving CTF challenges to
+understand their limitations. We report on the experience of using the LLMs for
+seven test cases in all five types of CTF challenges. In addition, we
+demonstrate how jailbreak prompts can bypass and break LLMs' ethical
+safeguards. The paper concludes by discussing LLMs' impact on CTF exercises and
+its implications.
+
+
+
+
+
+ + ☆ Large Language Models on Wikipedia-Style Survey Generation: an + Evaluation in NLP Concepts + + +
+ Large Language Models (LLMs) have achieved significant success across various +natural language processing (NLP) tasks, encompassing question-answering, +summarization, and machine translation, among others. While LLMs excel in +general tasks, their efficacy in domain-specific applications remains under +exploration. Additionally, LLM-generated text sometimes exhibits issues like +hallucination and disinformation. In this study, we assess LLMs' capability of +producing concise survey articles within the computer science-NLP domain, +focusing on 20 chosen topics. Automated evaluations indicate that GPT-4 +outperforms GPT-3.5 when benchmarked against the ground truth. Furthermore, +four human evaluators provide insights from six perspectives across four model +configurations. Through case studies, we demonstrate that while GPT often +yields commendable results, there are instances of shortcomings, such as +incomplete information and the exhibition of lapses in factual accuracy. + +
+
+
+
+
+ + ☆ Simple Baselines for Interactive Video Retrieval with Questions and + Answers ICCV 2023 + + +
+ To date, the majority of video retrieval systems have been optimized for a +"single-shot" scenario in which the user submits a query in isolation, ignoring +previous interactions with the system. Recently, there has been renewed +interest in interactive systems to enhance retrieval, but existing approaches +are complex and deliver limited gains in performance. In this work, we revisit +this topic and propose several simple yet effective baselines for interactive +video retrieval via question-answering. We employ a VideoQA model to simulate +user interactions and show that this enables the productive study of the +interactive retrieval task without access to ground truth dialogue data. +Experiments on MSR-VTT, MSVD, and AVSD show that our framework using +question-based interaction significantly improves the performance of text-based +video retrieval systems. + +
+
+ comment: ICCV 2023, project page: + https://github.com/kevinliang888/IVR-QA-baselines +
+
+
+
+
+ + ☆ FairBench: A Four-Stage Automatic Framework for Detecting Stereotypes + and Biases in Large Language Models + + +
+ Detecting stereotypes and biases in Large Language Models (LLMs) can enhance +fairness and reduce adverse impacts on individuals or groups when these LLMs +are applied. However, the majority of existing methods focus on measuring the +model's preference towards sentences containing biases and stereotypes within +datasets, which lacks interpretability and cannot detect implicit biases and +stereotypes in the real world. To address this gap, this paper introduces a +four-stage framework to directly evaluate stereotypes and biases in the +generated content of LLMs, including direct inquiry testing, serial or adapted +story testing, implicit association testing, and unknown situation testing. +Additionally, the paper proposes multi-dimensional evaluation metrics and +explainable zero-shot prompts for automated evaluation. Using the education +sector as a case study, we constructed the Edu-FairBench based on the +four-stage framework, which encompasses 12,632 open-ended questions covering +nine sensitive factors and 26 educational scenarios. Experimental results +reveal varying degrees of stereotypes and biases in five LLMs evaluated on +Edu-FairBench. Moreover, the results of our proposed automated evaluation +method have shown a high correlation with human annotations. + +
+
+
+
+
+ + ☆ Towards Objective Evaluation of Socially-Situated Conversational Robots: + Assessing Human-Likeness through Multimodal User Behaviors + + +
+ This paper tackles the challenging task of evaluating socially situated +conversational robots and presents a novel objective evaluation approach that +relies on multimodal user behaviors. In this study, our main focus is on +assessing the human-likeness of the robot as the primary evaluation metric. +While previous research often relied on subjective evaluations from users, our +approach aims to evaluate the robot's human-likeness based on observable user +behaviors indirectly, thus enhancing objectivity and reproducibility. To begin, +we created an annotated dataset of human-likeness scores, utilizing user +behaviors found in an attentive listening dialogue corpus. We then conducted an +analysis to determine the correlation between multimodal user behaviors and +human-likeness scores, demonstrating the feasibility of our proposed +behavior-based evaluation method. + +
+
+ comment: Accepted by 25th ACM International Conference on Multimodal + Interaction (ICMI '23), Late-Breaking Results +
+
+
+
+
+ + ☆ Using language models in the implicit automated assessment of + mathematical short answer items + + +
+ We propose a new way to assess certain short constructed responses to
+mathematics items. Our approach uses a pipeline that identifies the key values
+specified by the student in their response. This allows us to determine the
+correctness of the response, as well as identify any misconceptions. The
+information from the value identification pipeline can then be used to provide
+feedback to the teacher and student. The value identification pipeline consists
+of two fine-tuned language models. The first model determines if a value is
+implicit in the student response. The second model identifies where in the
+response the key value is specified. We consider both a generic model that can
+be used for any prompt and value, and models that are specific to each prompt
+and value. The value identification pipeline is a more accurate and informative
+way to assess short constructed responses than traditional rubric-based
+scoring. It can be used to provide more targeted feedback to students, which
+can help them improve their understanding of mathematics.
+
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ DocPrompt: Large-scale continue pretrain for zero-shot and few-shot + document question answering + + +
+ In this paper, we propose DocPrompt for document question answering tasks
+with powerful zero-shot and few-shot performance. We propose a novel weakly
+supervised data generation method, a novel multi-stage training method and a
+novel ensemble method combining an understanding model and a generation model.
+Experimental results show that the DocPrompt model, after continued
+pretraining, significantly outperforms the existing strong baseline models on
+document question answering tasks. This method greatly improves the delivery
+efficiency and model performance of document question answering customer
+projects, reducing annotation and labor costs. Our demo can be found at
+https://huggingface.co/spaces/PaddlePaddle/ERNIE-Layout.
+
+
+
+
+
+ + ☆ Refashioning Emotion Recognition Modelling: The Advent of Generalised + Large Models + + +
+ Since the inception of emotion recognition, or affective computing, it has
+increasingly become an active research topic due to its broad applications.
+Over the past couple of decades, emotion recognition models have gradually
+migrated from statistically shallow models to neural network-based deep models,
+which can significantly boost performance and consistently achieve the best
+results on different benchmarks. Therefore, in recent years, deep models have
+always been considered the first option for emotion recognition. However, the
+debut of large language models (LLMs), such as ChatGPT, has remarkably
+astonished the world due to their emergent capabilities of zero/few-shot
+learning, in-context learning, chain-of-thought, and others that were never
+shown by previous deep models. In the present paper, we comprehensively
+investigate how LLMs perform in emotion recognition in terms of diverse
+aspects, including in-context learning, few-shot learning, accuracy,
+generalisation, and explanation. Moreover, we offer some insights and pose
+other potential challenges, hoping to ignite broader discussions about
+enhancing emotion recognition in the new era of advanced and generalised large
+models.
+
+
+
+
+
+ + ☆ Large Language Model as a User Simulator + + +
+ The unparalleled performance of closed-sourced ChatGPT has sparked efforts +towards its democratization, with notable strides made by leveraging real user +and ChatGPT conversations, as evidenced by Vicuna. However, while current +endeavors like Baize and UltraChat aim to auto-generate conversational data due +to challenges in gathering human participation, they primarily rely on ChatGPT +to simulate human behaviors based on directives rather than genuine human +learning. This results in a limited scope, diminished diversity, and an absence +of genuine multi-round conversational dynamics. To address the above issues, we +innovatively target human questions extracted from genuine human-machine +conversations as a learning goal and train a user simulator, UserGPT, to +produce a high-quality human-centric synthetic conversation dataset, RealChat. +Subsequently, this dataset trains our assistant model, ReaLM. Experimentally, +ReaLM outpaces baseline models in both Vicuna-Bench and MT-Bench by pairwise +comparison when considering equivalent training set sizes, and manual +evaluation also shows that our model is highly competitive. Impressively, when +fine-tuned with the latest LLaMA 2 model, ReaLM secured a leading score of 6.33 +in the MT-Bench, outshining the contemporary same-scale models, including the +LLaMA-2-7B-chat model. Further in-depth analysis demonstrates the scalability +and transferability of our approach. A preliminary exploration into the +interplay between training set data quality and resultant model performance is +also undertaken, laying a robust groundwork for future investigations. + +
+
+
+
+
+ + ♻ ☆ Large Linguistic Models: Analyzing theoretical linguistic abilities of + LLMs + + +
+ The performance of large language models (LLMs) has recently improved to the +point where the models can perform well on many language tasks. We show here +that for the first time, the models can also generate coherent and valid formal +analyses of linguistic data and illustrate the vast potential of large language +models for analyses of their metalinguistic abilities. LLMs are primarily +trained on language data in the form of text; analyzing and evaluating their +metalinguistic abilities improves our understanding of their general +capabilities and sheds new light on theoretical models in linguistics. In this +paper, we probe into GPT-4's metalinguistic capabilities by focusing on three +subfields of formal linguistics: syntax, phonology, and semantics. We outline a +research program for metalinguistic analyses of large language models, propose +experimental designs, provide general guidelines, discuss limitations, and +offer future directions for this line of research. This line of inquiry also +exemplifies behavioral interpretability of deep learning, where models' +representations are accessed by explicit prompting rather than internal +representations. + +
+
+
+
+
+ + ♻ ☆ AutoML in the Age of Large Language Models: Current Challenges, Future + Opportunities and Risks + + +
+ The fields of both Natural Language Processing (NLP) and Automated Machine +Learning (AutoML) have achieved remarkable results over the past years. In NLP, +especially Large Language Models (LLMs) have experienced a rapid series of +breakthroughs very recently. We envision that the two fields can radically push +the boundaries of each other through tight integration. To showcase this +vision, we explore the potential of a symbiotic relationship between AutoML and +LLMs, shedding light on how they can benefit each other. In particular, we +investigate both the opportunities to enhance AutoML approaches with LLMs from +different perspectives and the challenges of leveraging AutoML to further +improve LLMs. To this end, we survey existing work, and we critically assess +risks. We strongly believe that the integration of the two fields has the +potential to disrupt both fields, NLP and AutoML. By highlighting conceivable +synergies, but also risks, we aim to foster further exploration at the +intersection of AutoML and LLMs. + +
+
+
+
+
+ + ♻ ☆ Predicting Perfect Quality Segments in MT Output with Fine-Tuned OpenAI + LLM: Is it possible to capture editing distance patterns from historical + data? + + +
+ Translation Quality Estimation (TQE) is an essential step before deploying
+the output translation into usage. TQE is also critical in assessing machine
+translation (MT) and human translation (HT) quality without seeing the
+reference translations. This work examines whether state-of-the-art large
+language models (LLMs) can be fine-tuned for the TQE task and assesses their
+capability. We take ChatGPT as one example and approach TQE as a binary
+classification task. Using training corpora of \textbf{eight language pairs},
+from English to Italian, German, French, Japanese, Dutch, Portuguese, Turkish,
+and Chinese, our experimental results show that ChatGPT fine-tuned via its API
+can achieve a relatively high score on predicting translation quality, i.e.
+\textit{whether the translation needs to be edited}. However, there is clearly
+much room to improve the model accuracy, e.g. the scores are 82.42\% and
+83.69\% for English-Italian and English-German respectively under our
+experimental settings. An English-Italian bilingual abstract is available in
+the paper.
+
+
+ comment: 8 pages, 11 figures, under-review to ItalianNLP-2023 +
+
+
+
+
+ + ♻ ☆ Reliable Detection and Quantification of Selective Forces in Language + Change + + +
+ Language change is a cultural evolutionary process in which variants of +linguistic variables change in frequency through processes analogous to +mutation, selection and genetic drift. In this work, we apply a +recently-introduced method to corpus data to quantify the strength of selection +in specific instances of historical language change. We first demonstrate, in +the context of English irregular verbs, that this method is more reliable and +interpretable than similar methods that have previously been applied. We +further extend this study to demonstrate that a bias towards phonological +simplicity overrides that favouring grammatical simplicity when these are in +conflict. Finally, with reference to Spanish spelling reforms, we show that the +method can also detect points in time at which selection strengths change, a +feature that is generically expected for socially-motivated language change. +Together, these results indicate how hypotheses for mechanisms of language +change can be tested quantitatively using historical corpus data. + +
+
+
+
+
+ + ♻ ☆ Large Language Models as Superpositions of Cultural Perspectives + + +
+ Large Language Models (LLMs) are often misleadingly recognized as having a +personality or a set of values. We argue that an LLM can be seen as a +superposition of perspectives with different values and personality traits. +LLMs exhibit context-dependent values and personality traits that change based +on the induced perspective (as opposed to humans, who tend to have more +coherent values and personality traits across contexts). We introduce the +concept of perspective controllability, which refers to a model's affordance to +adopt various perspectives with differing values and personality traits. In our +experiments, we use questionnaires from psychology (PVQ, VSM, IPIP) to study +how exhibited values and personality traits change based on different +perspectives. Through qualitative experiments, we show that LLMs express +different values when those are (implicitly or explicitly) implied in the +prompt, and that LLMs express different values even when those are not +obviously implied (demonstrating their context-dependent nature). We then +conduct quantitative experiments to study the controllability of different +models (GPT-4, GPT-3.5, OpenAssistant, StableVicuna, StableLM), the +effectiveness of various methods for inducing perspectives, and the smoothness +of the models' drivability. We conclude by examining the broader implications +of our work and outline a variety of associated scientific questions. The +project website is available at +https://sites.google.com/view/llm-superpositions . + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting +capabilities in large language models (LLMs) beyond those offered by paradigms +such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary +advantage of GoT is the ability to model the information generated by an LLM as +an arbitrary graph, where units of information ("LLM thoughts") are vertices, +and edges correspond to dependencies between these vertices. This approach +enables combining arbitrary LLM thoughts into synergistic outcomes, distilling +the essence of whole networks of thoughts, or enhancing thoughts using feedback +loops. We illustrate that GoT offers advantages over state of the art on +different tasks, for example increasing the quality of sorting by 62% over ToT, +while simultaneously reducing costs by >31%. We ensure that GoT is extensible +with new thought transformations and thus can be used to spearhead new +prompting schemes. This work brings the LLM reasoning closer to human thinking +or brain mechanisms such as recurrence, both of which form complex networks. + +
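+ The central abstraction, thoughts as graph vertices connected by dependency
+edges, can be sketched in a few lines; this is an illustration under our own
+assumptions, not the authors' framework code, and `llm` is a stand-in callable:
+```python
+from dataclasses import dataclass, field
+
+@dataclass
+class Thought:
+    text: str
+    parents: list = field(default_factory=list)  # dependency edges
+
+def aggregate(llm, thoughts):
+    """Merge several thoughts into one -- a graph-only operation that plain
+    chains and trees of thoughts cannot express."""
+    joined = "\n".join(t.text for t in thoughts)
+    merged = Thought(llm(f"Combine these partial results:\n{joined}"))
+    merged.parents = list(thoughts)
+    return merged
+
+def refine(llm, thought):
+    """Feedback loop: improve a thought and keep the dependency edge."""
+    better = Thought(llm(f"Improve this result:\n{thought.text}"))
+    better.parents = [thought]
+    return better
+
+# Example with a dummy "LLM":
+llm = lambda prompt: f"<answer to: {prompt[:30]}...>"
+parts = [Thought("sorted chunk A"), Thought("sorted chunk B")]
+final = refine(llm, aggregate(llm, parts))
+print(final.text, [p.text for p in final.parents])
+```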
+
+
+
+
+ + ♻ ☆ Reinforced Self-Training (ReST) for Language Modeling + + +
+ Reinforcement learning from human feedback (RLHF) can improve the quality of +large language model's (LLM) outputs by aligning them with human preferences. +We propose a simple algorithm for aligning LLMs with human preferences inspired +by growing batch reinforcement learning (RL), which we call Reinforced +Self-Training (ReST). Given an initial LLM policy, ReST produces a dataset by +generating samples from the policy, which are then used to improve the LLM +policy using offline RL algorithms. ReST is more efficient than typical online +RLHF methods because the training dataset is produced offline, which allows +data reuse. While ReST is a general approach applicable to all generative +learning settings, we focus on its application to machine translation. Our +results show that ReST can substantially improve translation quality, as +measured by automated metrics and human evaluation on machine translation +benchmarks in a compute and sample-efficient manner. + +
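+ Schematically, the grow/improve structure described above might look like the
+sketch below; the policy object, reward function, and threshold are
+placeholders rather than the authors' implementation:
+```python
+def rest(policy, prompts, reward_fn, n_grow=3, n_improve=2, threshold=0.7):
+    """Schematic Reinforced Self-Training loop (illustration only):
+    Grow: sample a dataset from the current policy.
+    Improve: repeatedly filter by reward and fine-tune offline, reusing data."""
+    for _ in range(n_grow):
+        dataset = [(p, policy.sample(p)) for p in prompts]        # Grow step
+        for _ in range(n_improve):                                # Improve steps
+            kept = [(p, y) for p, y in dataset if reward_fn(p, y) >= threshold]
+            policy = policy.finetune(kept)                        # offline update
+    return policy
+
+class DummyPolicy:
+    def sample(self, prompt): return prompt[::-1]   # placeholder "translation"
+    def finetune(self, data): return self           # placeholder update
+
+rest(DummyPolicy(), ["hello", "world"], lambda p, y: float(len(y) > 3))
+```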
+
+ comment: 23 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ DoDo Learning: DOmain-DemOgraphic Transfer in Language Models for + Detecting Abuse Targeted at Public Figures + + +
+ Public figures receive a disproportionate amount of abuse on social media, +impacting their active participation in public life. Automated systems can +identify abuse at scale but labelling training data is expensive, complex and +potentially harmful. So, it is desirable that systems are efficient and +generalisable, handling both shared and specific aspects of online abuse. We +explore the dynamics of cross-group text classification in order to understand +how well classifiers trained on one domain or demographic can transfer to +others, with a view to building more generalisable abuse classifiers. We +fine-tune language models to classify tweets targeted at public figures across +DOmains (sport and politics) and DemOgraphics (women and men) using our novel +DODO dataset, containing 28,000 labelled entries, split equally across four +domain-demographic pairs. We find that (i) small amounts of diverse data are +hugely beneficial to generalisation and model adaptation; (ii) models transfer +more easily across demographics but models trained on cross-domain data are +more generalisable; (iii) some groups contribute more to generalisability than +others; and (iv) dataset similarity is a signal of transferability. + +
+
+ comment: 15 pages, 7 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Language-Specific Representation of Emotion-Concept Knowledge Causally + Supports Emotion Inference + + +
+ Understanding how language supports emotion inference remains a topic of +debate in emotion science. The present study investigated whether +language-derived emotion-concept knowledge would causally support emotion +inference by manipulating the language-specific knowledge representations in +large language models. Using the prompt technique, 14 attributes of emotion +concepts were found to be represented by distinct artificial neuron +populations. By manipulating these attribute-related neurons, the majority of +the emotion inference tasks showed performance deterioration compared to random +manipulations. The attribute-specific performance deterioration was related to +the importance of different attributes in human mental space. Our findings +provide causal evidence in support of a language-based mechanism for emotion +inference and highlight the contributions of emotion-concept knowledge. + +
+
+ comment: 39 pages, 13 figures, 2 tables, fix formatting errors +
+
+
+
+
+ + ♻ ☆ Benchmarking ChatGPT-4 on ACR Radiation Oncology In-Training (TXIT) Exam + and Red Journal Gray Zone Cases: Potentials and Challenges for AI-Assisted + Medical Education and Decision Making in Radiation Oncology + + +
+ The potential of large language models in medicine for education and decision +making purposes has been demonstrated as they achieve decent scores on medical +exams such as the United States Medical Licensing Exam (USMLE) and the MedQA +exam. In this work, we evaluate the performance of ChatGPT-4 in the specialized +field of radiation oncology using the 38th American College of Radiology (ACR) +radiation oncology in-training (TXIT) exam and the 2022 Red Journal Gray Zone +cases. For the TXIT exam, ChatGPT-3.5 and ChatGPT-4 have achieved the scores of +63.65% and 74.57%, respectively, highlighting the advantage of the latest +ChatGPT-4 model. Based on the TXIT exam, ChatGPT-4's strong and weak areas in +radiation oncology are identified to some extent. Specifically, ChatGPT-4 +demonstrates better knowledge of statistics, CNS & eye, pediatrics, biology, +and physics than knowledge of bone & soft tissue and gynecology, as per the ACR +knowledge domain. Regarding clinical care paths, ChatGPT-4 performs better in +diagnosis, prognosis, and toxicity than brachytherapy and dosimetry. It lacks +proficiency in in-depth details of clinical trials. For the Gray Zone cases, +ChatGPT-4 is able to suggest a personalized treatment approach to each case +with high correctness and comprehensiveness. Importantly, it provides novel +treatment aspects for many cases, which are not suggested by any human experts. +Both evaluations demonstrate the potential of ChatGPT-4 in medical education +for the general public and cancer patients, as well as the potential to aid +clinical decision-making, while acknowledging its limitations in certain +domains. Because of the risk of hallucination, facts provided by ChatGPT always +need to be verified. + +
+
+
+
+
+ + ♻ ☆ An Empirical Study of Catastrophic Forgetting in Large Language Models + During Continual Fine-tuning + + +
+ Catastrophic forgetting (CF) is a phenomenon that occurs in machine learning +when a model forgets previously learned information as it learns new +information. As large language models (LLMs) have shown excellent performance, +it is interesting to uncover whether CF exists in the continual fine-tuning of +LLMs. In this study, we empirically evaluate the forgetting phenomenon in LLMs' +knowledge, from the perspectives of domain knowledge, reasoning, and reading +comprehension. The experiments demonstrate that catastrophic forgetting is +generally observed in LLMs ranging from 1b to 7b. Furthermore, as the scale +increases, the severity of forgetting also intensifies. Comparing the +decoder-only model BLOOMZ with the encoder-decoder model mT0, BLOOMZ suffers +less forgetting and maintains more knowledge. We also observe that LLMs can +mitigate language bias (e.g. gender bias) during continual fine-tuning. +Moreover, we find that ALPACA can maintain more knowledge and capacity compared +with LLAMA during the continual fine-tuning, which implies that general +instruction tuning can help mitigate the forgetting phenomenon of LLMs in the +further fine-tuning process. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Multimodal Word Discovery based on Double Articulation + Analysis with Co-occurrence cues + + +
+ Human infants acquire their verbal lexicon with minimal prior knowledge of +language based on the statistical properties of phonological distributions and +the co-occurrence of other sensory stimuli. This study proposes a novel fully +unsupervised learning method for discovering speech units using phonological +information as a distributional cue and object information as a co-occurrence +cue. The proposed method can acquire words and phonemes from speech signals +using unsupervised learning and utilize object information based on multiple +modalities-vision, tactile, and auditory-simultaneously. The proposed method is +based on the nonparametric Bayesian double articulation analyzer (NPB-DAA) +discovering phonemes and words from phonological features, and multimodal +latent Dirichlet allocation (MLDA) categorizing multimodal information obtained +from objects. In an experiment, the proposed method showed higher word +discovery performance than baseline methods. Words that expressed the +characteristics of objects (i.e., words corresponding to nouns and adjectives) +were segmented accurately. Furthermore, we examined how learning performance is +affected by differences in the importance of linguistic information. Increasing +the weight of the word modality further improved performance relative to that +of the fixed condition. + +
+
+ comment: Accepted to IEEE TRANSACTIONS ON COGNITIVE DEVELOPMENTAL SYSTEMS +
+
+
+
+
+ + ♻ ☆ ICL-D3IE: In-Context Learning with Diverse Demonstrations Updating for + Document Information Extraction ICCV 2023 + + +
+ Large language models (LLMs), such as GPT-3 and ChatGPT, have demonstrated +remarkable results in various natural language processing (NLP) tasks with +in-context learning, which involves inference based on a few demonstration +examples. Despite their successes in NLP tasks, no investigation has been +conducted to assess the ability of LLMs to perform document information +extraction (DIE) using in-context learning. Applying LLMs to DIE poses two +challenges: the modality and task gap. To this end, we propose a simple but +effective in-context learning framework called ICL-D3IE, which enables LLMs to +perform DIE with different types of demonstration examples. Specifically, we +extract the most difficult and distinct segments from hard training documents +as hard demonstrations for benefiting all test instances. We design +demonstrations describing relationships that enable LLMs to understand +positional relationships. We introduce formatting demonstrations for easy +answer extraction. Additionally, the framework improves diverse demonstrations +by updating them iteratively. Our experiments on three widely used benchmark +datasets demonstrate that the ICL-D3IE framework enables Davinci-003/ChatGPT to +achieve superior performance when compared to previous pre-trained methods +fine-tuned with full training in both the in-distribution (ID) setting and in +the out-of-distribution (OOD) setting. Code is available at +https://github.com/MAEHCM/ICL-D3IE. + +
+
+ comment: ICCV 2023. Code is available at https://github.com/MAEHCM/ICL-D3IE +
+
+
+
+
+ + ♻ ☆ Differentiable Retrieval Augmentation via Generative Language Modeling + for E-commerce Query Intent Classification CIKM2023 + + +
+ Retrieval augmentation, which enhances downstream models with a knowledge
+retriever and an external corpus instead of merely increasing the number of
+model parameters, has been successfully applied to many natural language
+processing (NLP) tasks such as text classification and question answering.
+However, existing methods that train the retriever and downstream model
+separately or asynchronously, mainly due to the non-differentiability between
+the two parts, usually lead to degraded performance compared to end-to-end
+joint training. In this paper, we propose Differentiable Retrieval Augmentation
+via Generative lANguage modeling (Dragan) to address this problem through a
+novel differentiable reformulation. We demonstrate the effectiveness of our
+proposed method on a challenging NLP task in e-commerce search, namely query
+intent classification. Both the experimental results and the ablation study
+show that the proposed method significantly improves over the state-of-the-art
+baselines in both offline evaluation and an online A/B test.
+
+
+ comment: 5 pages, 2 figures; accepted by CIKM2023 +
+
+
+
+
+ + ♻ ☆ Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop + Visual Reasoning + + +
+ There is an emerging, promising trend of using large language models (LLMs)
+to generate code-like plans for complex inference tasks such as visual
+reasoning. This paradigm, known as LLM-based planning, provides flexibility in
+problem solving and offers better interpretability. However, current research
+is mostly limited to basic scenarios of simple questions that can be answered
+straightforwardly in a few inference steps. Planning for the more challenging
+multi-hop visual reasoning tasks remains under-explored. Specifically, in
+multi-hop reasoning situations, the trade-off between accuracy and the
+complexity of plan-searching becomes prominent. The prevailing algorithms
+either address the efficiency issue by employing fast one-stop generation or
+adopt a complex iterative generation method to improve accuracy. Both fail to
+balance the need for efficiency and performance. Drawing inspiration from the
+dual system of cognition in the human brain, the fast and the slow thinking
+processes, we propose a hierarchical plan-searching algorithm that integrates
+one-stop reasoning (fast) and Tree-of-Thought (slow). Our approach achieves
+strong performance while significantly reducing the number of inference steps.
+Moreover, we repurpose the PTR and the CLEVER datasets, developing a systematic
+framework for evaluating the performance and efficiency of LLM-based
+plan-search algorithms on reasoning tasks at different levels of difficulty.
+Extensive experiments demonstrate the superiority of our proposed algorithm in
+terms of performance and efficiency. The dataset and code will be released
+soon.
+
+
+ comment: 16 pages,1 figures, under review +
+
+
+
+
+ + ♻ ☆ PaniniQA: Enhancing Patient Education Through Interactive Question + Answering ACL 2023 + + +
+ Patient portals allow discharged patients to access their personalized
+discharge instructions in electronic health records (EHRs). However, many
+patients have difficulty understanding or memorizing their discharge
+instructions. In this paper, we present PaniniQA, a patient-centric interactive
+question answering system designed to help patients understand their discharge
+instructions. PaniniQA first identifies important clinical content from
+patients' discharge instructions and then formulates patient-specific
+educational questions. In addition, PaniniQA is equipped with answer
+verification functionality to provide timely feedback to correct patients'
+misunderstandings. Our comprehensive automatic and human evaluation results
+demonstrate that PaniniQA is capable of improving patients' mastery of their
+medical instructions through effective interactions.
+
+
+ comment: Accepted to TACL 2023. Equal contribution for the first two authors. + This arXiv version is a pre-MIT Press publication version +
+
+
+
+
+ + ♻ ☆ Exploring Demonstration Ensembling for In-context Learning ICLR 2023 + + +
+ In-context learning (ICL) operates by showing language models (LMs) examples
+of input-output pairs for a given task, i.e., demonstrations. The standard
+approach for ICL is to prompt the LM with concatenated demonstrations followed
+by the test input. This approach suffers from some issues. First, concatenation
+offers almost no control over the contribution of each demo to the model
+prediction. This can be sub-optimal when some demonstrations are irrelevant to
+the test example. Second, due to the input length limit of some transformer
+models, it might be infeasible to fit many examples into the context,
+especially when dealing with long-input tasks. In this work, we explore
+Demonstration Ensembling (DENSE) as an alternative to simple concatenation.
+DENSE predicts outputs using subsets (i.e., buckets) of the demonstrations and
+then combines the output probabilities resulting from each subset to produce
+the final prediction. We study different ensembling methods using GPT-J and
+experiment on 12 language tasks. Our experiments show weighted max ensembling
+to outperform vanilla concatenation by as much as 2.4 average points. Code
+available at https://github.com/mukhal/icl-ensembling.
+
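+ A minimal sketch of the bucket-and-combine idea (our own illustration with a
+stand-in `lm_probs` function, using a simple weighted average; the paper also
+studies other combination rules such as weighted max):
+```python
+import numpy as np
+
+def dense_predict(lm_probs, demos, test_input, labels, n_buckets=4, weights=None):
+    """Split demos into buckets, get a label distribution per bucket from the
+    LM, then combine the distributions. `lm_probs(prompt, labels)` is a
+    stand-in returning one probability per label."""
+    buckets = np.array_split(np.array(demos, dtype=object), n_buckets)
+    per_bucket = []
+    for bucket in buckets:
+        prompt = "\n\n".join(list(bucket) + [test_input])
+        per_bucket.append(lm_probs(prompt, labels))
+    per_bucket = np.asarray(per_bucket)                 # (n_buckets, n_labels)
+    if weights is None:
+        weights = np.ones(len(per_bucket)) / len(per_bucket)
+    combined = weights @ per_bucket
+    return labels[int(np.argmax(combined))]
+
+# Dummy LM that scores labels by crude string overlap with the prompt:
+def lm_probs(prompt, labels):
+    scores = np.array([prompt.lower().count(l.lower()) + 1e-3 for l in labels])
+    return scores / scores.sum()
+
+demos = ["great movie -> positive", "boring plot -> negative",
+         "loved it -> positive", "awful acting -> negative"]
+print(dense_predict(lm_probs, demos, "what a great film ->", ["positive", "negative"]))
+```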
+
+ comment: Published at ME-FoMo workshop at ICLR 2023. Arxiv version includes + evaluation on 5 more tasks +
+
+
+
+
+ + ♻ ☆ Teaching Smaller Language Models To Generalise To Unseen Compositional + Questions + + +
+ We equip a smaller Language Model to generalise to answering challenging +compositional questions that have not been seen in training. To do so we +propose a combination of multitask supervised pretraining on up to 93 tasks +designed to instill diverse reasoning abilities, and a dense retrieval system +that aims to retrieve a set of evidential paragraph fragments. Recent progress +in question-answering has been achieved either through prompting methods +against very large pretrained Language Models in zero or few-shot fashion, or +by fine-tuning smaller models, sometimes in conjunction with information +retrieval. We focus on the less explored question of the extent to which +zero-shot generalisation can be enabled in smaller models with retrieval +against a corpus within which sufficient information to answer a particular +question may not exist. We establish strong baselines in this setting for +diverse evaluation datasets (StrategyQA, CommonsenseQA, IIRC, DROP, Musique and +ARC-DA), and show that performance can be significantly improved by adding +retrieval-augmented training datasets which are designed to expose our models +to a variety of heuristic reasoning strategies such as weighing partial +evidence or ignoring an irrelevant context. + +
+
+
+
+
+ + ♻ ☆ Measuring Social Biases in Grounded Vision and Language Embeddings NAACL 2021 + + +
+ We generalize the notion of social biases from language embeddings to
+grounded vision and language embeddings. Biases are present in grounded
+embeddings, and indeed seem to be equally or more significant than for
+ungrounded embeddings. This is despite the fact that vision and language can
+suffer from different biases, which one might hope could attenuate the biases
+in both. Multiple ways exist to generalize metrics measuring bias in word
+embeddings to this new setting. We introduce the space of generalizations
+(Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations
+answer different yet important questions about how biases, language, and vision
+interact. These metrics are used on a new dataset, the first for grounded bias,
+created by extending standard linguistic bias benchmarks with 10,228 images
+from COCO, Conceptual Captions, and Google Images. Dataset construction is
+challenging because vision datasets are themselves very biased. The presence of
+these biases in systems will begin to have real-world consequences as they are
+deployed, making carefully measuring bias and then mitigating it critical to
+building a fair society.
+
+
+ comment: Camera-ready from NAACL 2021. Previous arXiv version was from before + conference and was not the most recent version +
+
+
+
+
+ + ♻ ☆ Average-Hard Attention Transformers are Constant-Depth Uniform Threshold + Circuits + + +
+ Transformers have emerged as a widely used neural network model for various +natural language processing tasks. Previous research explored their +relationship with constant-depth threshold circuits, making two assumptions: +average-hard attention and logarithmic precision for internal computations +relative to input length. Merrill et al. (2022) prove that average-hard +attention transformers recognize languages that fall within the complexity +class TC0, denoting the set of languages that can be recognized by +constant-depth polynomial-size threshold circuits. Likewise, Merrill and +Sabharwal (2023) show that log-precision transformers recognize languages +within the class of uniform TC0. This shows that both transformer models can be +simulated by constant-depth threshold circuits, with the latter being more +robust due to generating a uniform circuit family. Our paper shows that the +first result can be extended to yield uniform circuits as well. + +
+
+
+
+
+ + ♻ ☆ A Measure-Theoretic Characterization of Tight Language Models ACL 2023 + + +
+ Language modeling, a central task in natural language processing, involves +estimating a probability distribution over strings. In most cases, the +estimated distribution sums to 1 over all finite strings. However, in some +pathological cases, probability mass can ``leak'' onto the set of infinite +sequences. In order to characterize the notion of leakage more precisely, this +paper offers a measure-theoretic treatment of language modeling. We prove that +many popular language model families are in fact tight, meaning that they will +not leak in this sense. We also generalize characterizations of tightness +proposed in previous works. + +
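+ For readers unfamiliar with the term, the property in question can be stated
+in one line; the formulation below is a standard way of writing it, not a
+quotation from the paper:
+```latex
+% A language model $p$ over alphabet $\Sigma$ is tight iff no probability
+% mass leaks onto the set of infinite sequences:
+\[
+  \sum_{\boldsymbol{w} \in \Sigma^{*}} p(\boldsymbol{w}) = 1 ,
+\]
+% whereas a non-tight model has $\sum_{\boldsymbol{w} \in \Sigma^{*}}
+% p(\boldsymbol{w}) < 1$, the deficit being the leaked mass.
+```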
+
+ comment: 25 pages; ACL 2023 camera ready +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 149 + +
+
+
+ + ☆ CamP: Camera Preconditioning for Neural Radiance Fields SIGGRAPH + + +
+ Neural Radiance Fields (NeRF) can be optimized to obtain high-fidelity 3D +scene reconstructions of objects and large-scale scenes. However, NeRFs require +accurate camera parameters as input -- inaccurate camera parameters result in +blurry renderings. Extrinsic and intrinsic camera parameters are usually +estimated using Structure-from-Motion (SfM) methods as a pre-processing step to +NeRF, but these techniques rarely yield perfect estimates. Thus, prior works +have proposed jointly optimizing camera parameters alongside a NeRF, but these +methods are prone to local minima in challenging settings. In this work, we +analyze how different camera parameterizations affect this joint optimization +problem, and observe that standard parameterizations exhibit large differences +in magnitude with respect to small perturbations, which can lead to an +ill-conditioned optimization problem. We propose using a proxy problem to +compute a whitening transform that eliminates the correlation between camera +parameters and normalizes their effects, and we propose to use this transform +as a preconditioner for the camera parameters during joint optimization. Our +preconditioned camera optimization significantly improves reconstruction +quality on scenes from the Mip-NeRF 360 dataset: we reduce error rates (RMSE) +by 67% compared to state-of-the-art NeRF approaches that do not optimize for +cameras like Zip-NeRF, and by 29% relative to state-of-the-art joint +optimization approaches using the camera parameterization of SCNeRF. Our +approach is easy to implement, does not significantly increase runtime, can be +applied to a wide variety of camera parameterizations, and can +straightforwardly be incorporated into other NeRF-like models. + +
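+ The whitening idea can be illustrated with a small numerical sketch (a toy
+example under our own assumptions, not the paper's implementation): build a
+curvature estimate from a proxy-problem Jacobian and precondition the camera
+parameters with its inverse square root, so that all parameters have
+comparably scaled, decorrelated effects:
+```python
+import numpy as np
+
+def whitening_preconditioner(J, eps=1e-8):
+    """Given the Jacobian J of a proxy objective w.r.t. the camera parameters,
+    return P such that optimizing z with theta = P @ z sees decorrelated,
+    unit-scale parameter effects."""
+    cov = J.T @ J                                   # parameter "curvature" estimate
+    eigval, eigvec = np.linalg.eigh(cov)
+    inv_sqrt = eigvec @ np.diag(1.0 / np.sqrt(eigval + eps)) @ eigvec.T
+    return inv_sqrt
+
+# Toy proxy Jacobian: the first parameter affects residuals ~1000x more
+# strongly than the second (e.g. focal length vs. a rotation component).
+J = np.random.randn(500, 2) * np.array([1000.0, 1.0])
+P = whitening_preconditioner(J)
+print(np.round(np.diag(P), 6))   # the large-effect parameter gets a small step
+```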
+
+ comment: SIGGRAPH Asia 2023, Project page: https://camp-nerf.github.io +
+
+
+
+
+ + ☆ Structured World Models from Human Videos + + +
+ We tackle the problem of learning complex, general behaviors directly in the +real world. We propose an approach for robots to efficiently learn manipulation +skills using only a handful of real-world interaction trajectories from many +different settings. Inspired by the success of learning from large-scale +datasets in the fields of computer vision and natural language, our belief is +that in order to efficiently learn, a robot must be able to leverage +internet-scale, human video data. Humans interact with the world in many +interesting ways, which can allow a robot to not only build an understanding of +useful actions and affordances but also how these actions affect the world for +manipulation. Our approach builds a structured, human-centric action space +grounded in visual affordances learned from human videos. Further, we train a +world model on human videos and fine-tune on a small amount of robot +interaction data without any task supervision. We show that this approach of +affordance-space world models enables different robots to learn various +manipulation skills in complex settings, in under 30 minutes of interaction. +Videos can be found at https://human-world-model.github.io + +
+
+ comment: RSS 2023. Website at https://human-world-model.github.io +
+
+
+
+
+ + ☆ Few-Shot Physically-Aware Articulated Mesh Generation via Hierarchical + Deformation ICCV 2023 + + +
+ We study the problem of few-shot physically-aware articulated mesh +generation. By observing an articulated object dataset containing only a few +examples, we wish to learn a model that can generate diverse meshes with high +visual fidelity and physical validity. Previous mesh generative models either +have difficulties in depicting a diverse data space from only a few examples or +fail to ensure physical validity of their samples. Regarding the above +challenges, we propose two key innovations, including 1) a hierarchical mesh +deformation-based generative model based upon the divide-and-conquer philosophy +to alleviate the few-shot challenge by borrowing transferrable deformation +patterns from large scale rigid meshes and 2) a physics-aware deformation +correction scheme to encourage physically plausible generations. We conduct +extensive experiments on 6 articulated categories to demonstrate the +superiority of our method in generating articulated meshes with better +diversity, higher visual fidelity, and better physical validity over previous +methods in the few-shot setting. Further, we validate solid contributions of +our two innovations in the ablation study. Project page with code is available +at https://meowuu7.github.io/few-arti-obj-gen. + +
+
+ comment: ICCV 2023. Project Page: https://meowuu7.github.io/few-arti-obj-gen +
+
+
+
+
+ + ☆ Can Language Models Learn to Listen? ICCV 2023 + + +
+ We present a framework for generating appropriate facial responses from a +listener in dyadic social interactions based on the speaker's words. Given an +input transcription of the speaker's words with their timestamps, our approach +autoregressively predicts a response of a listener: a sequence of listener +facial gestures, quantized using a VQ-VAE. Since gesture is a language +component, we propose treating the quantized atomic motion elements as +additional language token inputs to a transformer-based large language model. +Initializing our transformer with the weights of a language model pre-trained +only on text results in significantly higher quality listener responses than +training a transformer from scratch. We show that our generated listener motion +is fluent and reflective of language semantics through quantitative metrics and +a qualitative user study. In our evaluation, we analyze the model's ability to +utilize temporal and semantic aspects of spoken text. Project page: +https://people.eecs.berkeley.edu/~evonne_ng/projects/text2listen/ + +
+
+ comment: ICCV 2023; Project page: + https://people.eecs.berkeley.edu/~evonne_ng/projects/text2listen/ +
+
+
+
+
+ + ☆ Differentiable Shadow Mapping for Efficient Inverse Graphics CVPR 2023 + + +
+ We show how shadows can be efficiently generated in differentiable rendering +of triangle meshes. Our central observation is that pre-filtered shadow +mapping, a technique for approximating shadows based on rendering from the +perspective of a light, can be combined with existing differentiable +rasterizers to yield differentiable visibility information. We demonstrate at +several inverse graphics problems that differentiable shadow maps are orders of +magnitude faster than differentiable light transport simulation with similar +accuracy -- while differentiable rasterization without shadows often fails to +converge. + +
+
+ comment: CVPR 2023, project page: + https://mworchel.github.io/differentiable-shadow-mapping +
+
+
+
+
+ + ☆ Unlocking Accuracy and Fairness in Differentially Private Image + Classification + + +
+ Privacy-preserving machine learning aims to train models on private data +without leaking sensitive information. Differential privacy (DP) is considered +the gold standard framework for privacy-preserving training, as it provides +formal privacy guarantees. However, compared to their non-private counterparts, +models trained with DP often have significantly reduced accuracy. Private +classifiers are also believed to exhibit larger performance disparities across +subpopulations, raising fairness concerns. The poor performance of classifiers +trained with DP has prevented the widespread adoption of privacy preserving +machine learning in industry. Here we show that pre-trained foundation models +fine-tuned with DP can achieve similar accuracy to non-private classifiers, +even in the presence of significant distribution shifts between pre-training +data and downstream tasks. We achieve private accuracies within a few percent +of the non-private state of the art across four datasets, including two medical +imaging benchmarks. Furthermore, our private medical classifiers do not exhibit +larger performance disparities across demographic groups than non-private +models. This milestone to make DP training a practical and reliable technology +has the potential to widely enable machine learning practitioners to train +safely on sensitive datasets while protecting individuals' privacy. + +
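+ The abstract does not give training details; as a generic reminder of the
+mechanism that underlies DP training (per-example gradient clipping plus
+calibrated Gaussian noise, as in DP-SGD), a plain NumPy sketch might look like
+the following, with all hyper-parameter values chosen for illustration only:
+```python
+import numpy as np
+
+def dp_sgd_step(params, per_example_grads, lr=0.1, clip=1.0, noise_mult=1.0,
+                rng=np.random.default_rng(0)):
+    """One differentially private SGD step: clip each example's gradient to
+    norm `clip`, average, then add Gaussian noise scaled by `noise_mult`."""
+    clipped = []
+    for g in per_example_grads:
+        norm = np.linalg.norm(g)
+        clipped.append(g * min(1.0, clip / (norm + 1e-12)))
+    mean_grad = np.mean(clipped, axis=0)
+    noise = rng.normal(0.0, noise_mult * clip / len(per_example_grads),
+                       size=mean_grad.shape)
+    return params - lr * (mean_grad + noise)
+
+grads = [np.array([3.0, 4.0]), np.array([0.1, -0.2])]   # two per-example gradients
+print(dp_sgd_step(np.zeros(2), grads))
+```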
+
+
+
+
+ + ☆ Vision Transformer Pruning Via Matrix Decomposition + + +
+ This work is a further development of Vision Transformer Pruning via matrix
+decomposition. The purpose of Vision Transformer Pruning is to prune the
+dimension of the linear projection of the dataset by learning the associated
+importance scores, in order to reduce the storage, run-time memory, and
+computational demands. In this paper we further reduce the dimension and
+complexity of the linear projection by implementing and comparing several
+matrix decomposition methods while preserving the generated important features.
+We ultimately selected Singular Value Decomposition as the method to achieve
+our goal, by comparing the accuracy scores reported in the original GitHub
+repository with the accuracy scores obtained using these matrix decomposition
+methods, including Singular Value Decomposition, four versions of QR
+decomposition, and LU factorization.
+
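+ As a small illustration of the idea (not the repository's code), an SVD-based
+low-rank factorization of a linear projection reduces parameters while keeping
+the dominant feature directions; sizes below are arbitrary examples:
+```python
+import numpy as np
+
+def low_rank_projection(W, rank):
+    """Replace a linear projection W (d_out x d_in) by a rank-`rank`
+    factorization A @ B obtained from the SVD."""
+    U, S, Vt = np.linalg.svd(W, full_matrices=False)
+    A = U[:, :rank] * S[:rank]          # (d_out, rank)
+    B = Vt[:rank, :]                    # (rank, d_in)
+    return A, B
+
+W = np.random.randn(768, 768)
+A, B = low_rank_projection(W, rank=256)
+print(W.size, A.size + B.size)                          # parameters before / after
+print(np.linalg.norm(W - A @ B) / np.linalg.norm(W))    # relative approximation error
+```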
+
+
+
+
+ + ☆ EigenPlaces: Training Viewpoint Robust Models for Visual Place + Recognition ICCV 2023 + + +
+ Visual Place Recognition is a task that aims to predict the place of an image
+(called query) based solely on its visual features. This is typically done
+through image retrieval, where the query is matched to the most similar images
+from a large database of geotagged photos, using learned global descriptors. A
+major challenge in this task is recognizing places seen from different
+viewpoints. To overcome this limitation, we propose a new method, called
+EigenPlaces, to train our neural network on images from different points of
+view, which embeds viewpoint robustness into the learned global descriptors.
+The underlying idea is to cluster the training data so as to explicitly present
+the model with different views of the same points of interest. The selection of
+these points of interest is done without the need for extra supervision. We
+then present experiments on the most comprehensive set of datasets in the
+literature, finding that EigenPlaces is able to outperform previous state of
+the art on the majority of datasets, while requiring 60\% less GPU memory for
+training and using 50\% smaller descriptors. The code and trained models for
+EigenPlaces are available at
+{\small{\url{https://github.com/gmberton/EigenPlaces}}}, while results with any
+other baseline can be computed with the codebase at
+{\small{\url{https://github.com/gmberton/auto_VPR}}}.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Pixel Adaptive Deep Unfolding Transformer for Hyperspectral Image + Reconstruction ICCV 2023 + + +
+ Hyperspectral Image (HSI) reconstruction has made gratifying progress with
+the deep unfolding framework by formulating the problem into a data module and
+a prior module. Nevertheless, existing methods still face the problem of
+insufficient matching with HSI data. The issues lie in three aspects: 1) a
+fixed gradient descent step in the data module, while the degradation of HSI is
+agnostic at the pixel level; 2) an inadequate prior module for the 3D HSI cube;
+and 3) stage interaction that ignores the differences in features at different
+stages. To address these issues, in this work, we propose a Pixel Adaptive Deep
+Unfolding Transformer (PADUT) for HSI reconstruction. In the data module, a
+pixel adaptive descent step is employed to focus on pixel-level agnostic
+degradation. In the prior module, we introduce the Non-local Spectral
+Transformer (NST) to emphasize the 3D characteristics of HSI for recovery.
+Moreover, inspired by the diverse expression of features at different stages
+and depths, the stage interaction is improved by the Fast Fourier Transform
+(FFT). Experimental results on both simulated and real scenes exhibit the
+superior performance of our method compared to state-of-the-art HSI
+reconstruction methods. The code is released at:
+https://github.com/MyuLi/PADUT.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Jumping through Local Minima: Quantization in the Loss Landscape of + Vision Transformers + + +
+ Quantization scale and bit-width are the most important parameters when +considering how to quantize a neural network. Prior work focuses on optimizing +quantization scales in a global manner through gradient methods (gradient +descent \& Hessian analysis). Yet, when applying perturbations to quantization +scales, we observe a very jagged, highly non-smooth test loss landscape. In +fact, small perturbations in quantization scale can greatly affect accuracy, +yielding a $0.5-0.8\%$ accuracy boost in 4-bit quantized vision transformers +(ViTs). In this regime, gradient methods break down, since they cannot reliably +reach local minima. In our work, dubbed Evol-Q, we use evolutionary search to +effectively traverse the non-smooth landscape. Additionally, we propose using +an infoNCE loss, which not only helps combat overfitting on the small +calibration dataset ($1,000$ images) but also makes traversing such a highly +non-smooth surface easier. Evol-Q improves the top-1 accuracy of a fully +quantized ViT-Base by $10.30\%$, $0.78\%$, and $0.15\%$ for $3$-bit, $4$-bit, +and $8$-bit weight quantization levels. Extensive experiments on a variety of +CNN and ViT architectures further demonstrate its robustness in extreme +quantization scenarios. Our code is available at +https://github.com/enyac-group/evol-q + +
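+ A toy sketch of gradient-free search over a single quantization scale (our
+illustration, not the Evol-Q implementation; the objective below is a synthetic
+stand-in for calibration accuracy on the jagged surface described above):
+```python
+import numpy as np
+
+def evolve_scale(eval_acc, init_scale, sigma=0.01, population=8, generations=20,
+                 rng=np.random.default_rng(0)):
+    """Tiny (1+lambda)-style evolutionary search over one quantization scale.
+    `eval_acc(scale)` would evaluate accuracy on a small calibration set."""
+    best_scale, best_acc = init_scale, eval_acc(init_scale)
+    for _ in range(generations):
+        candidates = best_scale + sigma * rng.standard_normal(population)
+        for s in candidates:
+            acc = eval_acc(s)
+            if acc > best_acc:
+                best_scale, best_acc = s, acc
+    return best_scale, best_acc
+
+# Toy, non-smooth objective standing in for calibration accuracy:
+bumpy = lambda s: -abs(s - 0.037) + 0.003 * np.sin(400 * s)
+print(evolve_scale(bumpy, init_scale=0.05))
+```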
+
+ comment: arXiv admin note: text overlap with arXiv:2211.09643 +
+
+
+
+
+ + ☆ Improving Continuous Sign Language Recognition with Cross-Lingual Signs ICCV 2023 + + +
+ This work dedicates to continuous sign language recognition (CSLR), which is +a weakly supervised task dealing with the recognition of continuous signs from +videos, without any prior knowledge about the temporal boundaries between +consecutive signs. Data scarcity heavily impedes the progress of CSLR. Existing +approaches typically train CSLR models on a monolingual corpus, which is orders +of magnitude smaller than that of speech recognition. In this work, we explore +the feasibility of utilizing multilingual sign language corpora to facilitate +monolingual CSLR. Our work is built upon the observation of cross-lingual +signs, which originate from different sign languages but have similar visual +signals (e.g., hand shape and motion). The underlying idea of our approach is +to identify the cross-lingual signs in one sign language and properly leverage +them as auxiliary training data to improve the recognition capability of +another. To achieve the goal, we first build two sign language dictionaries +containing isolated signs that appear in two datasets. Then we identify the +sign-to-sign mappings between two sign languages via a well-optimized isolated +sign language recognition model. At last, we train a CSLR model on the +combination of the target data with original labels and the auxiliary data with +mapped labels. Experimentally, our approach achieves state-of-the-art +performance on two widely-used CSLR datasets: Phoenix-2014 and Phoenix-2014T. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ MGMAE: Motion Guided Masking for Video Masked Autoencoding ICCV 2023 + + +
+ Masked autoencoding has shown excellent performance on self-supervised video +representation learning. Temporal redundancy has led to a high masking ratio +and customized masking strategy in VideoMAE. In this paper, we aim to further +improve the performance of video masked autoencoding by introducing a motion +guided masking strategy. Our key insight is that motion is a general and unique +prior in video, which should be taken into account during masked pre-training. +Our motion guided masking explicitly incorporates motion information to build +temporal consistent masking volume. Based on this masking volume, we can track +the unmasked tokens in time and sample a set of temporal consistent cubes from +videos. These temporal aligned unmasked tokens will further relieve the +information leakage issue in time and encourage the MGMAE to learn more useful +structure information. We implement our MGMAE with an online efficient optical +flow estimator and backward masking map warping strategy. We perform +experiments on the datasets of Something-Something V2 and Kinetics-400, +demonstrating the superior performance of our MGMAE to the original VideoMAE. +In addition, we provide the visualization analysis to illustrate that our MGMAE +can sample temporal consistent cubes in a motion-adaptive manner for more +effective video pre-training. + +
+
+ comment: ICCV 2023 camera-ready version +
+
+
+
+
+ + ☆ Extraction of Text from Optic Nerve Optical Coherence Tomography Reports + + +
+ Purpose: The purpose of this study was to develop and evaluate rule-based +algorithms to enhance the extraction of text data, including retinal nerve +fiber layer (RNFL) values and other ganglion cell count (GCC) data, from Zeiss +Cirrus optical coherence tomography (OCT) scan reports. Methods: DICOM files +that contained encapsulated PDF reports with RNFL or Ganglion Cell in their +document titles were identified from a clinical imaging repository at a single +academic ophthalmic center. PDF reports were then converted into image files +and processed using the PaddleOCR Python package for optical character +recognition. Rule-based algorithms were designed and iteratively optimized for +improved performance in extracting RNFL and GCC data. Evaluation of the +algorithms was conducted through manual review of a set of RNFL and GCC +reports. Results: The developed algorithms demonstrated high precision in +extracting data from both RNFL and GCC scans. Precision was slightly better for +the right eye in RNFL extraction (OD: 0.9803 vs. OS: 0.9046), and for the left +eye in GCC extraction (OD: 0.9567 vs. OS: 0.9677). Some values presented more +challenges in extraction, particularly clock hours 5 and 6 for RNFL thickness, +and signal strength for GCC. Conclusions: A customized optical character +recognition algorithm can identify numeric results from optical coherence scan +reports with high precision. Automated processing of PDF reports can greatly +reduce the time to extract OCT results on a large scale. + +
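As a rough illustration of the kind of rule-based post-processing described, the snippet below pulls two fields out of raw OCR text with regular expressions; the patterns and field names are assumptions for illustration, not the rules used in the study.

```python
import re

def extract_rnfl_values(ocr_text: str) -> dict:
    """Pull average RNFL thickness and signal strength from raw OCR text (illustrative rules)."""
    results = {}
    avg = re.search(r"Average\s+RNFL\s+Thickness\D*(\d{1,3})", ocr_text, re.I)
    if avg:
        results["avg_rnfl_um"] = int(avg.group(1))
    sig = re.search(r"Signal\s+Strength\D*(\d{1,2})\s*/\s*10", ocr_text, re.I)
    if sig:
        results["signal_strength"] = int(sig.group(1))
    return results

print(extract_rnfl_values("Signal Strength 8/10 ... Average RNFL Thickness (um) 94"))
```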
+
+
+
+
+ + ☆ Dense Error Map Estimation for MRI-Ultrasound Registration in Brain + Tumor Surgery Using Swin UNETR + + +
+ Early surgical treatment of brain tumors is crucial in reducing patient +mortality rates. However, brain tissue deformation (called brain shift) occurs +during the surgery, rendering pre-operative images invalid. As a cost-effective +and portable tool, intra-operative ultrasound (iUS) can track brain shift, and +accurate MRI-iUS registration techniques can update pre-surgical plans and +facilitate the interpretation of iUS. This can boost surgical safety and +outcomes by maximizing tumor removal while avoiding eloquent regions. However, +manual assessment of MRI-iUS registration results in real-time is difficult and +prone to errors due to the 3D nature of the data. Automatic algorithms that can +quantify the quality of inter-modal medical image registration outcomes can be +highly beneficial. Therefore, we propose a novel deep-learning (DL) based +framework with the Swin UNETR to automatically assess 3D-patch-wise dense error +maps for MRI-iUS registration in iUS-guided brain tumor resection and show its +performance with real clinical data for the first time. + +
+
+ comment: Accepted in IEEE IUS 2023 +
+
+
+
+
+ + ☆ CoNe: Contrast Your Neighbours for Supervised Image Classification + + +
+ Image classification is a longstanding problem in computer vision and machine +learning research. Most recent works (e.g. SupCon, Triplet, and max-margin) +mainly focus on grouping the intra-class samples aggressively and compactly, +with the assumption that all intra-class samples should be pulled tightly +towards their class centers. However, such an objective will be very hard to +achieve since it ignores the intra-class variance in the dataset (i.e., +different instances from the same class can have significant differences). +Thus, such a monotonous objective is not sufficient. To provide a more +informative objective, we introduce Contrast Your Neighbours (CoNe) - a simple +yet practical learning framework for supervised image classification. +Specifically, in CoNe, each sample is not only supervised by its class center +but also directly employs the features of its similar neighbors as anchors to +generate more adaptive and refined targets. Moreover, to further boost the +performance, we propose "distributional consistency" as a more informative +regularization to enable similar instances to have a similar probability +distribution. Extensive experimental results demonstrate that CoNe achieves +state-of-the-art performance across different benchmark datasets, network +architectures, and settings. Notably, even without a complicated training +recipe, our CoNe achieves 80.8\% Top-1 accuracy on ImageNet with ResNet-50, +which surpasses the recent Timm training recipe (80.4\%). Code and pre-trained +models are available at +https://github.com/mingkai-zheng/CoNe. + +
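A minimal sketch of a "distributional consistency" style regularizer, assuming the logits of a similar neighbour are already available; how neighbours are selected and weighted is not shown here and is an assumption, not the paper's exact loss.

```python
import torch
import torch.nn.functional as F

def distributional_consistency(logits, neighbour_logits, tau=1.0):
    """Encourage a sample's class distribution to stay close (KL) to its neighbour's."""
    p = F.log_softmax(logits / tau, dim=-1)          # sample's predicted log-probabilities
    q = F.softmax(neighbour_logits / tau, dim=-1)    # neighbour's predicted probabilities
    return F.kl_div(p, q, reduction="batchmean")
```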
+
+
+
+
+ + ☆ WanJuan: A Comprehensive Multimodal Dataset for Advancing English and + Chinese Large Models + + +
+ The rise in popularity of ChatGPT and GPT-4 has significantly accelerated the +development of large models, leading to the creation of numerous impressive +large language models (LLMs) and multimodal large language models (MLLMs). These +cutting-edge models owe their remarkable performance to high-quality data. +However, the details of the training data used in leading paradigms are often +kept confidential. This lack of transparency, coupled with the scarcity of +open-source data, impedes further developments within the community. As a +response, this paper presents "Wan Juan", a large-scale multimodal dataset +composed of both Chinese and English data, collected from a wide range of web +sources. The dataset incorporates text, image-text, and video modalities, with +a total volume exceeding 2TB. It was utilized in the training of InternLM, a +model that demonstrated significant advantages in multi-dimensional evaluations +when compared to models of a similar scale. All data can be accessed at +https://opendatalab.org.cn/WanJuan1.0. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Boosting Adversarial Attack with Similar Target + + +
+ Deep neural networks are vulnerable to adversarial examples, posing a threat +to the models' applications and raising security concerns. An intriguing +property of adversarial examples is their strong transferability. Several +methods have been proposed to enhance transferability, including ensemble +attacks which have demonstrated their efficacy. However, prior approaches +simply average logits, probabilities, or losses for model ensembling, lacking a +comprehensive analysis of how and why model ensembling significantly improves +transferability. In this paper, we propose a similar targeted attack method +named Similar Target~(ST). By promoting cosine similarity between the gradients +of each model, our method regularizes the optimization direction to +simultaneously attack all surrogate models. This strategy has been proven to +enhance generalization ability. Experimental results on ImageNet validate the +effectiveness of our approach in improving adversarial transferability. Our +method outperforms state-of-the-art attackers on 18 discriminative classifiers +and adversarially trained models. + +
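A hedged sketch of the core idea: average the surrogate losses and add a term that rewards cosine similarity between the surrogates' input gradients, so one optimization direction attacks all models at once. The model list, loss choice, and weighting `lam` are assumptions, not the authors' exact formulation.

```python
import torch
import torch.nn.functional as F

def st_attack_loss(models, x, y, lam=1.0):
    """Mean cross-entropy over surrogates plus pairwise gradient cosine alignment."""
    x = x.clone().detach().requires_grad_(True)
    losses, grads = [], []
    for m in models:
        loss = F.cross_entropy(m(x), y)
        # create_graph=True so the alignment term can itself be backpropagated
        g, = torch.autograd.grad(loss, x, create_graph=True)
        losses.append(loss)
        grads.append(g.flatten(1))
    sims = [F.cosine_similarity(grads[i], grads[j], dim=1).mean()
            for i in range(len(grads)) for j in range(i + 1, len(grads))]
    return torch.stack(losses).mean() + lam * torch.stack(sims).mean()
```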
+
+
+
+
+ + ☆ Patch Is Not All You Need + + +
+ Vision Transformers have achieved great success in computer visions, +delivering exceptional performance across various tasks. However, their +inherent reliance on sequential input enforces the manual partitioning of +images into patch sequences, which disrupts the image's inherent structural and +semantic continuity. To handle this, we propose a novel Pattern Transformer +(Patternformer) to adaptively convert images to pattern sequences for +Transformer input. Specifically, we employ the Convolutional Neural Network to +extract various patterns from the input image, with each channel representing a +unique pattern that is fed into the succeeding Transformer as a visual token. +By enabling the network to optimize these patterns, each pattern concentrates +on its local region of interest, thereby preserving its intrinsic structural +and semantic information. Only employing the vanilla ResNet and Transformer, we +have accomplished state-of-the-art performance on CIFAR-10 and CIFAR-100, and +have achieved competitive results on ImageNet. + +
+
+
+
+
+ + ☆ Test-time augmentation-based active learning and self-training for + label-efficient segmentation MICCAI + + +
+ Deep learning techniques depend on large datasets whose annotation is +time-consuming. To reduce annotation burden, the self-training (ST) and +active-learning (AL) methods have been developed as well as methods that +combine them in an iterative fashion. However, it remains unclear when each +method is the most useful, and when it is advantageous to combine them. In this +paper, we propose a new method that combines ST with AL using Test-Time +Augmentations (TTA). First, TTA is performed on an initial teacher network. +Then, cases for annotation are selected based on the lowest estimated Dice +score. Cases with high estimated scores are used as soft pseudo-labels for ST. +The selected annotated cases are trained with existing annotated cases and ST +cases with border slices annotations. We demonstrate the method on MRI fetal +body and placenta segmentation tasks with different data variability +characteristics. Our results indicate that ST is highly effective for both +tasks, boosting performance for in-distribution (ID) and out-of-distribution +(OOD) data. However, while self-training improved the performance of +single-sequence fetal body segmentation when combined with AL, it slightly +deteriorated performance of multi-sequence placenta segmentation on ID data. AL +was helpful for the high variability placenta data, but did not improve upon +random selection for the single-sequence body data. For fetal body segmentation +sequence transfer, combining AL with ST following ST iteration yielded a Dice +of 0.961 with only 6 original scans and 2 new sequence scans. Results using +only 15 high-variability placenta cases were similar to those using 50 cases. +Code is available at: https://github.com/Bella31/TTA-quality-estimation-ST-AL + +
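A small sketch of the TTA-based quality estimate: agreement (Dice) among predictions obtained under different test-time augmentations serves as a proxy for segmentation quality, with the lowest-scoring cases sent to annotation and the highest-scoring ones kept as soft pseudo-labels. The helper below assumes binary masks have already been predicted and inverse-transformed to a common frame.

```python
import numpy as np

def dice(a, b, eps=1e-7):
    # Dice overlap between two binary masks.
    inter = np.logical_and(a, b).sum()
    return (2.0 * inter + eps) / (a.sum() + b.sum() + eps)

def estimated_dice(tta_masks):
    """Mean pairwise Dice among masks predicted under different test-time augmentations."""
    scores = [dice(tta_masks[i], tta_masks[j])
              for i in range(len(tta_masks)) for j in range(i + 1, len(tta_masks))]
    return float(np.mean(scores))
```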
+
+ comment: Accepted to MICCAI MILLanD workshop 2023 +
+
+
+
+
+ + ☆ Backdooring Textual Inversion for Concept Censorship + + +
+ Recent years have witnessed success in AIGC (AI Generated Content). People +can make use of a pre-trained diffusion model to generate images of high +quality or freely modify existing pictures with only prompts in natural +language. More excitingly, the emerging personalization techniques make it +feasible to create specific-desired images with only a few images as +references. However, this induces severe threats if such advanced techniques +are misused by malicious users, such as spreading fake news or defaming +individual reputations. Thus, it is necessary to regulate personalization +models (i.e., concept censorship) for their development and advancement. + In this paper, we focus on the personalization technique dubbed Textual +Inversion (TI), which is becoming prevalent for its lightweight nature and +excellent performance. TI crafts the word embedding that contains detailed +information about a specific object. Users can easily download the word +embedding from public websites like Civitai and add it to their own stable +diffusion model without fine-tuning for personalization. To achieve the concept +censorship of a TI model, we propose leveraging the backdoor technique for good +by injecting backdoors into the Textual Inversion embeddings. Briefly, we +select some sensitive words as triggers during the training of TI, which will +be censored for normal use. In the subsequent generation stage, if the triggers +are combined with personalized embeddings as final prompts, the model will +output a pre-defined target image rather than images including the desired +malicious concept. + To demonstrate the effectiveness of our approach, we conduct extensive +experiments on Stable Diffusion, a prevailing open-sourced text-to-image model. +Our code, data, and results are available at +https://concept-censorship.github.io. + +
+
+
+
+
+ + ☆ Rethinking Person Re-identification from a Projection-on-Prototypes + Perspective + + +
+ Person Re-IDentification (Re-ID) as a retrieval task, has achieved tremendous +development over the past decade. Existing state-of-the-art methods follow an +analogous framework to first extract features from the input images and then +categorize them with a classifier. However, since there is no identity overlap +between training and testing sets, the classifier is often discarded during +inference. Only the extracted features are used for person retrieval via +distance metrics. In this paper, we rethink the role of the classifier in +person Re-ID, and advocate a new perspective to conceive the classifier as a +projection from image features to class prototypes. These prototypes are +exactly the learned parameters of the classifier. In this light, we describe +the identity of input images as similarities to all prototypes, which are then +utilized as more discriminative features to perform person Re-ID. We thereby +propose a new baseline ProNet, which innovatively reserves the function of the +classifier at the inference stage. To facilitate the learning of class +prototypes, both triplet loss and identity classification loss are applied to +features that undergo the projection by the classifier. An improved version of +ProNet++ is presented by further incorporating multi-granularity designs. +Experiments on four benchmarks demonstrate that our proposed ProNet is simple +yet effective, and significantly beats previous baselines. ProNet++ also +achieves competitive or even better results than transformer-based competitors. + +
+
+
+
+
+ + ☆ Color Prompting for Data-Free Continual Unsupervised Domain Adaptive + Person Re-Identification + + +
+ Unsupervised domain adaptive person re-identification (Re-ID) methods +alleviate the burden of data annotation through generating pseudo supervision +messages. However, real-world Re-ID systems, with continuously accumulating +data streams, simultaneously demand more robust adaptation and anti-forgetting +capabilities. Methods based on image rehearsal address the forgetting issue +with limited extra storage but carry the risk of privacy leakage. In this work, +we propose a Color Prompting (CoP) method for data-free continual unsupervised +domain adaptive person Re-ID. Specifically, we employ a lightweight prompter +network to fit the color distribution of the current task together with Re-ID +training. Then for the incoming new tasks, the learned color distribution +serves as color style transfer guidance to transfer the images into past +styles. CoP achieves accurate color style recovery for past tasks with adequate +data diversity, leading to superior anti-forgetting effects compared with image +rehearsal methods. Moreover, CoP demonstrates strong generalization performance +for fast adaptation into new domains, given only a small amount of unlabeled +images. Extensive experiments demonstrate that after the continual training +pipeline the proposed CoP achieves 6.7% and 8.1% average rank-1 improvements +over the replay method on seen and unseen domains, respectively. The source +code for this work is publicly available at +https://github.com/vimar-gu/ColorPromptReID. + +
+
+
+
+
+ + ☆ Sampling From Autoencoders' Latent Space via Quantization And + Probability Mass Function Concepts + + +
+ In this study, we focus on sampling from the latent space of generative +models built upon autoencoders so that the reconstructed samples are lifelike +images. To do so, we introduce a novel post-training sampling algorithm rooted +in the concept of probability mass functions, coupled with a quantization +process. Our proposed algorithm establishes a vicinity around each latent +vector from the input data and then proceeds to draw samples from these defined +neighborhoods. This strategic approach ensures that the sampled latent vectors +predominantly inhabit high-probability regions, which, in turn, can be +effectively transformed into authentic real-world images. A noteworthy point of +comparison for our sampling algorithm is the sampling technique based on +Gaussian mixture models (GMM), owing to its inherent capability to represent +clusters. Remarkably, we manage to improve the time complexity from the +previous $\mathcal{O}(n\times d \times k \times i)$ associated with GMM +sampling to a much more streamlined $\mathcal{O}(n\times d)$, thereby resulting +in substantial speedup during runtime. Moreover, our experimental results, +gauged through the Fr\'echet inception distance (FID) for image generation, +underscore the superior performance of our sampling algorithm across a diverse +range of models and datasets. On the MNIST benchmark dataset, our approach +outperforms GMM sampling by yielding a noteworthy improvement of up to $0.89$ +in FID value. Furthermore, when it comes to generating images of faces and +ocular images, our approach showcases substantial enhancements with FID +improvements of $1.69$ and $0.87$ respectively, as compared to GMM sampling, as +evidenced on the CelebA and MOBIUS datasets. Lastly, we substantiate our +methodology's efficacy in estimating latent space distributions in contrast to +GMM sampling, particularly through the lens of the Wasserstein distance. + +
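The neighbourhood-sampling idea can be sketched as follows: quantize each training latent, then draw new latents uniformly inside the cell of a randomly chosen one, which keeps samples in high-probability regions at O(n x d) cost. The bin width and the downstream decoder are assumptions here, not the paper's exact procedure.

```python
import numpy as np

def sample_latents(train_latents, n_samples, bin_width=0.5, seed=0):
    """train_latents: (n, d) array of encoder outputs for the training set."""
    rng = np.random.default_rng(seed)
    centers = np.round(train_latents / bin_width) * bin_width      # O(n*d) quantization
    picks = rng.integers(0, len(centers), size=n_samples)          # choose occupied cells
    noise = rng.uniform(-bin_width / 2, bin_width / 2,
                        size=(n_samples, train_latents.shape[1]))
    return centers[picks] + noise  # samples stay inside high-probability cells
```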
+
+
+
+
+ + ☆ Vanishing Point Estimation in Uncalibrated Images with Prior Gravity + Direction ICCV 2023 + + +
+ We tackle the problem of estimating a Manhattan frame, i.e. three orthogonal +vanishing points, and the unknown focal length of the camera, leveraging a +prior vertical direction. The direction can come from an Inertial Measurement +Unit that is a standard component of recent consumer devices, e.g., +smartphones. We provide an exhaustive analysis of minimal line configurations +and derive two new 2-line solvers, one of which does not suffer from +singularities affecting existing solvers. Additionally, we design a new +non-minimal method, running on an arbitrary number of lines, to boost the +performance in local optimization. Combining all solvers in a hybrid robust +estimator, our method achieves increased accuracy even with a rough prior. +Experiments on synthetic and real-world datasets demonstrate the superior +accuracy of our method compared to the state of the art, while having +comparable runtimes. We further demonstrate the applicability of our solvers +for relative rotation estimation. The code is available at +https://github.com/cvg/VP-Estimation-with-Prior-Gravity. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Exploring Fine-Grained Representation and Recomposition for + Cloth-Changing Person Re-Identification + + +
+ Cloth-changing person Re-IDentification (Re-ID) is a particularly challenging +task, suffering from two limitations of inferior identity-relevant features and +limited training samples. Existing methods mainly leverage auxiliary +information to facilitate discriminative feature learning, including +soft-biometrics features of shapes and gaits, and additional labels of +clothing. However, these information may be unavailable in real-world +applications. In this paper, we propose a novel FIne-grained Representation and +Recomposition (FIRe$^{2}$) framework to tackle both limitations without any +auxiliary information. Specifically, we first design a Fine-grained Feature +Mining (FFM) module to separately cluster images of each person. Images with +similar so-called fine-grained attributes (e.g., clothes and viewpoints) are +encouraged to cluster together. An attribute-aware classification loss is +introduced to perform fine-grained learning based on cluster labels, which are +not shared among different people, promoting the model to learn +identity-relevant features. Furthermore, by taking full advantage of the +clustered fine-grained attributes, we present a Fine-grained Attribute +Recomposition (FAR) module to recompose image features with different +attributes in the latent space. It can significantly enhance representations +for robust feature learning. Extensive experiments demonstrate that FIRe$^{2}$ +can achieve state-of-the-art performance on five widely-used cloth-changing +person Re-ID benchmarks. + +
+
+
+
+
+ + ☆ Co-Speech Gesture Detection through Multi-phase Sequence Labeling + + +
+ Gestures are integral components of face-to-face communication. They unfold +over time, often following predictable movement phases of preparation, stroke, +and retraction. Yet, the prevalent approach to automatic gesture detection +treats the problem as binary classification, classifying a segment as either +containing a gesture or not, thus failing to capture its inherently sequential +and contextual nature. To address this, we introduce a novel framework that +reframes the task as a multi-phase sequence labeling problem rather than binary +classification. Our model processes sequences of skeletal movements over time +windows, uses Transformer encoders to learn contextual embeddings, and +leverages Conditional Random Fields to perform sequence labeling. We evaluate +our proposal on a large dataset of diverse co-speech gestures in task-oriented +face-to-face dialogues. The results consistently demonstrate that our method +significantly outperforms strong baseline models in detecting gesture strokes. +Furthermore, applying Transformer encoders to learn contextual embeddings from +movement sequences substantially improves gesture unit detection. These results +highlight our framework's capacity to capture the fine-grained dynamics of +co-speech gesture phases, paving the way for more nuanced and accurate gesture +detection and analysis. + +
+
+
+
+
+ + ☆ Visual Crowd Analysis: Open Research Problems + + +
+ Over the last decade, there has been a remarkable surge in interest in +automated crowd monitoring within the computer vision community. Modern +deep-learning approaches have made it possible to develop fully-automated +vision-based crowd-monitoring applications. However, despite the magnitude of +the issue at hand, the significant technological advancements, and the +consistent interest of the research community, there are still numerous +challenges that need to be overcome. In this article, we delve into six major +areas of visual crowd analysis, emphasizing the key developments in each of +these areas. We outline the crucial unresolved issues that must be tackled in +future works, in order to ensure that the field of automated crowd monitoring +continues to progress and thrive. Several surveys related to this topic have +been conducted in the past. Nonetheless, this article thoroughly examines and +presents a more intuitive categorization of works, while also depicting the +latest breakthroughs within the field, incorporating more recent studies +carried out within the last few years in a concise manner. By carefully +choosing prominent works with significant contributions in terms of novelty or +performance gains, this paper presents a more comprehensive exposition of +advancements in the current state-of-the-art. + +
+
+ comment: Accepted in AI Magazine published by Wiley Periodicals LLC on behalf + of the Association for the Advancement of Artificial Intelligence +
+
+
+
+
+ + ☆ Learning Clothing and Pose Invariant 3D Shape Representation for + Long-Term Person Re-Identification ICCV 2023 + + +
+ Long-Term Person Re-Identification (LT-ReID) has become increasingly crucial +in computer vision and biometrics. In this work, we aim to extend LT-ReID +beyond pedestrian recognition to include a wider range of real-world human +activities while still accounting for cloth-changing scenarios over large time +gaps. This setting poses additional challenges due to the geometric +misalignment and appearance ambiguity caused by the diversity of human pose and +clothing. To address these challenges, we propose a new approach 3DInvarReID +for (i) disentangling identity from non-identity components (pose, clothing +shape, and texture) of 3D clothed humans, and (ii) reconstructing accurate 3D +clothed body shapes and learning discriminative features of naked body shapes +for person ReID in a joint manner. To better evaluate our study of LT-ReID, we +collect a real-world dataset called CCDA, which contains a wide variety of +human activities and clothing changes. Experimentally, we show the superior +performance of our approach for person ReID. + +
+
+ comment: 10 pages, 7 figures, accepted by ICCV 2023 +
+
+
+
+
+ + ☆ EVE: Efficient zero-shot text-based Video Editing with Depth Map + Guidance and Temporal Consistency Constraints + + +
+ Motivated by the superior performance of image diffusion models, more and +more researchers strive to extend these models to the text-based video editing +task. Nevertheless, current video editing tasks mainly suffer from the dilemma +between the high fine-tuning cost and the limited generation capacity. Compared +with images, we conjecture that videos necessitate more constraints to preserve +the temporal consistency during editing. Towards this end, we propose EVE, a +robust and efficient zero-shot video editing method. Under the guidance of +depth maps and temporal consistency constraints, EVE derives satisfactory video +editing results with an affordable computational and time cost. Moreover, +recognizing the absence of a publicly available video editing dataset for fair +comparisons, we construct a new benchmark ZVE-50 dataset. Through comprehensive +experimentation, we validate that EVE could achieve a satisfactory trade-off +between performance and efficiency. We will release our dataset and codebase to +facilitate future researchers. + +
+
+
+
+
+ + ☆ bbOCR: An Open-source Multi-domain OCR Pipeline for Bengali Documents + + +
+ Despite the existence of numerous Optical Character Recognition (OCR) tools, +the lack of comprehensive open-source systems hampers the progress of document +digitization in various low resource languages, including Bengali. Low-resource +languages, especially those with an alphasyllabary writing system, suffer from +the lack of large-scale datasets for various document OCR components such as +word-level OCR, document layout extraction, and distortion correction; which +are available as individual modules in high-resource languages. In this paper, +we introduce Bengali.AI-BRACU-OCR (bbOCR): an open-source scalable document OCR +system that can reconstruct Bengali documents into a structured searchable +digitized format that leverages a novel Bengali text recognition model and two +novel synthetic datasets. We present extensive component-level and system-level +evaluation: both use a novel diversified evaluation dataset and comprehensive +evaluation metrics. Our extensive evaluation suggests that our proposed +solution is preferable over the current state-of-the-art Bengali OCR systems. +The source codes and datasets are available here: +https://bengaliai.github.io/bbocr. + +
+
+
+
+
+ + ☆ SCULPT: Shape-Conditioned Unpaired Learning of Pose-dependent Clothed + and Textured Human Meshes + + +
+ We present SCULPT, a novel 3D generative model for clothed and textured 3D +meshes of humans. Specifically, we devise a deep neural network that learns to +represent the geometry and appearance distribution of clothed human bodies. +Training such a model is challenging, as datasets of textured 3D meshes for +humans are limited in size and accessibility. Our key observation is that there +exist medium-sized 3D scan datasets like CAPE, as well as large-scale 2D image +datasets of clothed humans, and that multiple appearances can be mapped to a single +geometry. To effectively learn from the two data modalities, we propose an +unpaired learning procedure for pose-dependent clothed and textured human +meshes. Specifically, we learn a pose-dependent geometry space from 3D scan +data. We represent this as per-vertex displacements w.r.t. the SMPL model. +Next, we train a geometry-conditioned texture generator in an unsupervised way +using the 2D image data. We use intermediate activations of the learned +geometry model to condition our texture generator. To alleviate entanglement +between pose and clothing type, and pose and clothing appearance, we condition +both the texture and geometry generators with attribute labels such as clothing +types for the geometry, and clothing colors for the texture generator. We +automatically generated these conditioning labels for the 2D images based on +the visual question answering models BLIP and CLIP. We validate our method on +the SCULPT dataset, and compare to state-of-the-art 3D generative models for +clothed human bodies. We will release the codebase for research purposes. + +
+
+
+
+
+ + ☆ Automated Identification of Failure Cases in Organ at Risk Segmentation + Using Distance Metrics: A Study on CT Data + + +
+ Automated organ at risk (OAR) segmentation is crucial for radiation therapy +planning in CT scans, but the generated contours by automated models can be +inaccurate, potentially leading to treatment planning issues. The reasons for +these inaccuracies could be varied, such as unclear organ boundaries or +inaccurate ground truth due to annotation errors. To improve the model's +performance, it is necessary to identify these failure cases during the +training process and to correct them with some potential post-processing +techniques. However, this process can be time-consuming, as traditionally it +requires manual inspection of the predicted output. This paper proposes a +method to automatically identify failure cases by setting a threshold for the +combination of Dice and Hausdorff distances. This approach reduces the +time-consuming task of visually inspecting predicted outputs, allowing for +faster identification of failure case candidates. The method was evaluated on +20 cases of six different organs in CT images from clinical expert curated +datasets. By setting the thresholds for the Dice and Hausdorff distances, the +study was able to differentiate between various states of failure cases and +evaluate over 12 cases visually. This thresholding approach could be extended +to other organs, leading to faster identification of failure cases and thereby +improving the quality of radiation therapy planning. + +
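A minimal sketch of the thresholding step, assuming the Dice score and Hausdorff distance have already been computed for each case; the threshold values below are placeholders, whereas the study tunes them per organ.

```python
def flag_failures(cases, dice_thr=0.8, hd_thr_mm=10.0):
    """cases: iterable of (case_id, dice, hausdorff_mm) tuples; returns failure candidates."""
    return [cid for cid, d, hd in cases if d < dice_thr or hd > hd_thr_mm]

# Example: only the cases with low overlap or a large boundary error are flagged.
print(flag_failures([("a", 0.92, 4.1), ("b", 0.71, 6.3), ("c", 0.88, 15.2)]))  # ['b', 'c']
```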
+
+ comment: 11 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ Foundation Model-oriented Robustness: Robust Image Model Evaluation with + Pretrained Models + + +
+ Machine learning has demonstrated remarkable performance over finite +datasets, yet whether the scores over the fixed benchmarks can sufficiently +indicate the model's performance in the real world is still in discussion. In +reality, an ideal robust model will probably behave similarly to the oracle +(e.g., the human users), thus a good evaluation protocol is probably to +evaluate the models' behaviors in comparison to the oracle. In this paper, we +introduce a new robustness measurement that directly measures the image +classification model's performance compared with a surrogate oracle (i.e., a +foundation model). Besides, we design a simple method that can accomplish the +evaluation beyond the scope of the benchmarks. Our method extends the image +datasets with new samples that are sufficiently perturbed to be distinct from +the ones in the original sets, but are still bounded within the same +image-label structure the original test image represents, constrained by a +foundation model pretrained with a large amount of samples. As a result, our +new method will offer us a new way to evaluate the models' robustness +performance, free of limitations of fixed benchmarks or constrained +perturbations, although scoped by the power of the oracle. In addition to the +evaluation results, we also leverage our generated data to understand the +behaviors of the model and our new evaluation strategies. + +
+
+
+
+
+ + ☆ PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from + Gait + + +
+ Psychological trait estimation from external factors such as movement and +appearance is a challenging and long-standing problem in psychology, and is +principally based on the psychological theory of embodiment. To date, attempts +to tackle this problem have utilized private small-scale datasets with +intrusive body-attached sensors. Potential applications of an automated system +for psychological trait estimation include estimation of occupational fatigue +and psychology, and marketing and advertisement. In this work, we propose PsyMo +(Psychological traits from Motion), a novel, multi-purpose and multi-modal +dataset for exploring psychological cues manifested in walking patterns. We +gathered walking sequences from 312 subjects in 7 different walking variations +and 6 camera angles. In conjunction with walking sequences, participants filled +in 6 psychological questionnaires, totalling 17 psychometric attributes related +to personality, self-esteem, fatigue, aggressiveness and mental health. We +propose two evaluation protocols for psychological trait estimation. Alongside +the estimation of self-reported psychological traits from gait, the dataset can +be used as a drop-in replacement to benchmark methods for gait recognition. We +anonymize all cues related to the identity of the subjects and publicly release +only silhouettes, 2D / 3D human skeletons and 3D SMPL human meshes. + +
+
+
+
+
+ + ☆ Polarimetric Information for Multi-Modal 6D Pose Estimation of + Photometrically Challenging Objects with Limited Data ICCV 2023 + + +
+ 6D pose estimation pipelines that rely on RGB-only or RGB-D data show +limitations for photometrically challenging objects with e.g. textureless +surfaces, reflections or transparency. A supervised learning-based method +utilising complementary polarisation information as input modality is proposed +to overcome such limitations. This supervised approach is then extended to a +self-supervised paradigm by leveraging physical characteristics of polarised +light, thus eliminating the need for annotated real data. The methods achieve +significant advancements in pose estimation by leveraging geometric information +from polarised light and incorporating shape priors and invertible physical +constraints. + +
+
+ comment: Accepted at ICCV 2023 TRICKY Workshop +
+
+
+
+
+ + ☆ GaitPT: Skeletons Are All You Need For Gait Recognition + + +
+ The analysis of patterns of walking is an important area of research that has +numerous applications in security, healthcare, sports and human-computer +interaction. Lately, walking patterns have been regarded as a unique +fingerprinting method for automatic person identification at a distance. In +this work, we propose a novel gait recognition architecture called Gait Pyramid +Transformer (GaitPT) that leverages pose estimation skeletons to capture unique +walking patterns, without relying on appearance information. GaitPT adopts a +hierarchical transformer architecture that effectively extracts both spatial +and temporal features of movement in an anatomically consistent manner, guided +by the structure of the human skeleton. Our results show that GaitPT achieves +state-of-the-art performance compared to other skeleton-based gait recognition +works, in both controlled and in-the-wild scenarios. GaitPT obtains 82.6% +average accuracy on CASIA-B, surpassing other works by a margin of 6%. +Moreover, it obtains 52.16% Rank-1 accuracy on GREW, outperforming both +skeleton-based and appearance-based approaches. + +
+
+
+
+
+ + ☆ Multi-Modal Dataset Acquisition for Photometrically Challenging Object ICCV 2023 + + +
+ This paper addresses the limitations of current datasets for 3D vision tasks +in terms of accuracy, size, realism, and suitable imaging modalities for +photometrically challenging objects. We propose a novel annotation and +acquisition pipeline that enhances existing 3D perception and 6D object pose +datasets. Our approach integrates robotic forward-kinematics, external infrared +trackers, and improved calibration and annotation procedures. We present a +multi-modal sensor rig, mounted on a robotic end-effector, and demonstrate how +it is integrated into the creation of highly accurate datasets. Additionally, +we introduce a freehand procedure for wider viewpoint coverage. Both approaches +yield high-quality 3D data with accurate object and camera pose annotations. +Our methods overcome the limitations of existing datasets and provide valuable +resources for 3D vision research. + +
+
+ comment: Accepted at ICCV 2023 TRICKY Workshop +
+
+
+
+
+ + ☆ Ultrafast and Ultralight Network-Based Intelligent System for Real-time + Diagnosis of Ear diseases in Any Devices + + +
+ Traditional ear disease diagnosis heavily depends on experienced specialists +and specialized equipment, frequently resulting in misdiagnoses, treatment +delays, and financial burdens for some patients. Utilizing deep learning models +for efficient ear disease diagnosis has proven effective and affordable. +However, existing research overlooked model inference speed and parameter size +required for deployment. To tackle these challenges, we constructed a +large-scale dataset comprising eight ear disease categories and normal ear +canal samples from two hospitals. Inspired by ShuffleNetV2, we developed +Best-EarNet, an ultrafast and ultralight network enabling real-time ear disease +diagnosis. Best-EarNet incorporates the novel Local-Global Spatial Feature +Fusion Module which can capture global and local spatial information +simultaneously and guide the network to focus on crucial regions within feature +maps at various levels, mitigating low accuracy issues. Moreover, our network +uses multiple auxiliary classification heads for efficient parameter +optimization. With 0.77M parameters, Best-EarNet achieves an average frames per +second of 80 on CPU. Employing transfer learning and five-fold cross-validation +with 22,581 images from Hospital-1, the model achieves an impressive 95.23% +accuracy. External testing on 1,652 images from Hospital-2 validates its +performance, yielding 92.14% accuracy. Compared to state-of-the-art networks, +Best-EarNet establishes a new state-of-the-art (SOTA) in practical +applications. Most importantly, we developed an intelligent diagnosis system +called Ear Keeper, which can be deployed on common electronic devices. By +manipulating a compact electronic otoscope, users can perform comprehensive +scanning and diagnosis of the ear canal using real-time video. This study +provides a novel paradigm for ear endoscopy and other medical endoscopic image +recognition applications. + +
+
+ comment: This manuscript has been submitted to Neural Networks +
+
+
+
+
+ + ☆ FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly + + +
+ While text-3D editing has made significant strides in leveraging score +distillation sampling, emerging approaches still fall short in delivering +separable, precise and consistent outcomes that are vital to content creation. +In response, we introduce FocalDreamer, a framework that merges base shape with +editable parts according to text prompts for fine-grained editing within +desired regions. Specifically, equipped with geometry union and dual-path +rendering, FocalDreamer assembles independent 3D parts into a complete object, +tailored for convenient instance reuse and part-wise control. We propose +geometric focal loss and style consistency regularization, which encourage +focal fusion and congruent overall appearance. Furthermore, FocalDreamer +generates high-fidelity geometry and PBR textures which are compatible with +widely-used graphics engines. Extensive experiments have highlighted the +superior editing capabilities of FocalDreamer in both quantitative and +qualitative evaluations. + +
+
+ comment: Project website: https://fantasia3d.github.io +
+
+
+
+
+ + ☆ BackTrack: Robust template update via Backward Tracking of candidate + template + + +
+ Variations of target appearance such as deformations, illumination variance, +occlusion, etc., are the major challenges of visual object tracking that +negatively impact the performance of a tracker. An effective method to tackle +these challenges is template update, which updates the template to reflect the +change of appearance in the target object during tracking. However, with +template updates, inadequate quality of new templates or inappropriate timing +of updates may induce a model drift problem, which severely degrades the +tracking performance. Here, we propose BackTrack, a robust and reliable method +to quantify the confidence of the candidate template by backward tracking it on +the past frames. Based on the confidence score of candidates from BackTrack, we +can update the template with a reliable candidate at the right time while +rejecting unreliable candidates. BackTrack is a generic template update scheme +and is applicable to any template-based trackers. Extensive experiments on +various tracking benchmarks verify the effectiveness of BackTrack over existing +template update algorithms, as it achieves SOTA performance on various tracking +benchmarks. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ A step towards understanding why classification helps regression ICCV-2023 + + +
+ A number of computer vision deep regression approaches report improved +results when adding a classification loss to the regression loss. Here, we +explore why this is useful in practice and when it is beneficial. To do so, we +start from precisely controlled dataset variations and data samplings and find +that the effect of adding a classification loss is the most pronounced for +regression with imbalanced data. We explain these empirical findings by +formalizing the relation between the balanced and imbalanced regression losses. +Finally, we show that our findings hold on two real imbalanced image datasets +for depth estimation (NYUD2-DIR), and age estimation (IMDB-WIKI-DIR), and on +the problem of imbalanced video progress prediction (Breakfast). Our main +takeaway is: for a regression task, if the data sampling is imbalanced, then +add a classification loss. + +
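The takeaway translates into a very small change to the training objective. A hedged sketch, assuming the continuous target is discretized into equal-width bins and a separate classification head provides logits; the bin count and loss weight are illustrative, not values from the paper.

```python
import torch
import torch.nn.functional as F

def reg_plus_cls_loss(pred_value, pred_logits, target, n_bins=10, lo=0.0, hi=1.0, w=0.1):
    """L1 regression loss plus cross-entropy on the binned target."""
    reg = F.l1_loss(pred_value, target)
    bins = torch.clamp(((target - lo) / (hi - lo) * n_bins).long(), 0, n_bins - 1)
    cls = F.cross_entropy(pred_logits, bins)
    return reg + w * cls
```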
+
+ comment: Accepted at ICCV-2023 +
+
+
+
+
+ + ☆ Improving the Transferability of Adversarial Examples with Arbitrary + Style Transfer + + +
+ Deep neural networks are vulnerable to adversarial examples crafted by +applying human-imperceptible perturbations on clean inputs. Although many +attack methods can achieve high success rates in the white-box setting, they +also exhibit weak transferability in the black-box setting. Recently, various +methods have been proposed to improve adversarial transferability, in which the +input transformation is one of the most effective methods. In this work, we +notice that existing input transformation-based works mainly adopt the +transformed data in the same domain for augmentation. Inspired by domain +generalization, we aim to further improve the transferability using the data +augmented from different domains. Specifically, a style transfer network can +alter the distribution of low-level visual features in an image while +preserving semantic content for humans. Hence, we propose a novel attack method +named Style Transfer Method (STM) that utilizes a proposed arbitrary style +transfer network to transform the images into different domains. To avoid +inconsistent semantic information of stylized images for the classification +network, we fine-tune the style transfer network and mix up the generated +images added by random noise with the original images to maintain semantic +consistency and boost input diversity. Extensive experimental results on the +ImageNet-compatible dataset show that our proposed method can significantly +improve the adversarial transferability on either normally trained models or +adversarially trained models than state-of-the-art input transformation-based +attacks. Code is available at: https://github.com/Zhijin-Ge/STM. + +
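A rough sketch of the input transformation described above: blend a stylized copy with the original image and add small random noise before computing attack gradients. The mixing ratio and noise scale are illustrative assumptions; producing the stylized copy (the arbitrary style transfer network) is outside this snippet.

```python
import torch

def stm_transform(x, x_stylized, mix=0.5, noise_std=0.03):
    """Mix the stylized image with the original and perturb it slightly."""
    mixed = mix * x_stylized + (1.0 - mix) * x
    return torch.clamp(mixed + noise_std * torch.randn_like(x), 0.0, 1.0)
```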
+
+ comment: 10 pages, 2 figures, accepted by the 31st ACM International + Conference on Multimedia (MM '23) +
+
+
+
+
+ + ☆ Image-free Classifier Injection for Zero-Shot Classification ICCV 2023 + + +
+ Zero-shot learning models achieve remarkable results on image classification +for samples from classes that were not seen during training. However, such +models must be trained from scratch with specialised methods: therefore, access +to a training dataset is required when the need for zero-shot classification +arises. In this paper, we aim to equip pre-trained models with zero-shot +classification capabilities without the use of image data. We achieve this with +our proposed Image-free Classifier Injection with Semantics (ICIS) that injects +classifiers for new, unseen classes into pre-trained classification models in a +post-hoc fashion without relying on image data. Instead, the existing +classifier weights and simple class-wise descriptors, such as class names or +attributes, are used. ICIS has two encoder-decoder networks that learn to +reconstruct classifier weights from descriptors (and vice versa), exploiting +(cross-)reconstruction and cosine losses to regularise the decoding process. +Notably, ICIS can be cheaply trained and applied directly on top of pre-trained +classification models. Experiments on benchmark ZSL datasets show that ICIS +produces unseen classifier weights that achieve strong (generalised) zero-shot +classification performance. Code is available at +https://github.com/ExplainableML/ImageFreeZSL . + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ CHORD: Category-level Hand-held Object Reconstruction via Shape + Deformation ICCV 2023 + + +
+ In daily life, humans utilize hands to manipulate objects. Modeling the shape +of objects that are manipulated by the hand is essential for AI to comprehend +daily tasks and to learn manipulation skills. However, previous approaches have +encountered difficulties in reconstructing the precise shapes of hand-held +objects, primarily owing to a deficiency in prior shape knowledge and +inadequate data for training. As illustrated, given a particular type of tool, +such as a mug, despite its infinite variations in shape and appearance, humans +have a limited number of 'effective' modes and poses for its manipulation. This +can be attributed to the fact that humans have mastered the shape prior of the +'mug' category, and can quickly establish the corresponding relations between +different mug instances and the prior, such as where the rim and handle are +located. In light of this, we propose a new method, CHORD, for Category-level +Hand-held Object Reconstruction via shape Deformation. CHORD deforms a +categorical shape prior for reconstructing the intra-class objects. To ensure +accurate reconstruction, we empower CHORD with three types of awareness: +appearance, shape, and interacting pose. In addition, we have constructed a new +dataset, COMIC, of category-level hand-object interaction. COMIC contains a +rich array of object instances, materials, hand interactions, and viewing +directions. Extensive evaluation shows that CHORD outperforms state-of-the-art +approaches in both quantitative and qualitative measures. Code, model, and +datasets are available at https://kailinli.github.io/CHORD. + +
+
+ comment: To be presented at ICCV 2023, Paris +
+
+
+
+
+ + ☆ Self-Feedback DETR for Temporal Action Detection ICCV 2023 + + +
+ Temporal Action Detection (TAD) is challenging but fundamental for real-world +video applications. Recently, DETR-based models have been devised for TAD but +have not performed well yet. In this paper, we point out the problem in the +self-attention of DETR for TAD; the attention modules focus on a few key +elements, called temporal collapse problem. It degrades the capability of the +encoder and decoder since their self-attention modules play no role. To solve +the problem, we propose a novel framework, Self-DETR, which utilizes +cross-attention maps of the decoder to reactivate self-attention modules. We +recover the relationship between encoder features by simple matrix +multiplication of the cross-attention map and its transpose. Likewise, we also +get the information within decoder queries. By guiding collapsed self-attention +maps with the guidance map calculated, we settle down the temporal collapse of +self-attention modules in the encoder and decoder. Our extensive experiments +demonstrate that Self-DETR resolves the temporal collapse problem by keeping +high diversity of attention over all layers. + +
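The reactivation step can be illustrated in a few lines: for a decoder cross-attention map A over (queries x encoder tokens), A^T A relates encoder tokens to each other and A A^T relates queries to each other, and the row-normalized products act as guidance for the collapsed self-attention maps. The shapes and normalization below are assumptions for illustration.

```python
import torch

A = torch.softmax(torch.randn(30, 100), dim=-1)   # toy cross-attention map (queries x tokens)
encoder_guidance = A.t() @ A                       # (tokens, tokens) relations
decoder_guidance = A @ A.t()                       # (queries, queries) relations
# Normalize rows so each guidance map behaves like an attention distribution.
encoder_guidance = encoder_guidance / encoder_guidance.sum(-1, keepdim=True)
decoder_guidance = decoder_guidance / decoder_guidance.sum(-1, keepdim=True)
```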
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ RT-MonoDepth: Real-time Monocular Depth Estimation on Embedded Systems + + +
+ Depth sensing is a crucial function of unmanned aerial vehicles and +autonomous vehicles. Due to the small size and simple structure of monocular +cameras, there has been a growing interest in depth estimation from a single +RGB image. However, state-of-the-art monocular CNN-based depth estimation +methods using fairly complex deep neural networks are too slow for real-time +inference on embedded platforms. This paper addresses the problem of real-time +depth estimation on embedded systems. We propose two efficient and lightweight +encoder-decoder network architectures, RT-MonoDepth and RT-MonoDepth-S, to +reduce computational complexity and latency. Our methodologies demonstrate that +it is possible to achieve similar accuracy as prior state-of-the-art works on +depth estimation at a faster inference speed. Our proposed networks, +RT-MonoDepth and RT-MonoDepth-S, run at 18.4 and 30.5 FPS on NVIDIA Jetson Nano +and 253.0 and 364.1 FPS on NVIDIA Jetson AGX Orin on a single RGB image of +resolution 640$\times$192, and achieve relative state-of-the-art accuracy on +the KITTI dataset. To the best of the authors' knowledge, this paper achieves +the best accuracy and fastest inference speed compared with existing fast +monocular depth estimation methods. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Seeing the Intangible: Surveying Automatic High-Level Visual + Understanding from Still Images + + +
+ The field of Computer Vision (CV) was born with the single grand goal of +complete image understanding: providing a complete semantic interpretation of +an input image. What exactly this goal entails is not immediately +straightforward, but theoretical hierarchies of visual understanding point +towards a top level of full semantics, within which sits the most complex and +subjective information humans can detect from visual data. In particular, +non-concrete concepts including emotions, social values and ideologies seem to +be protagonists of this "high-level" visual semantic understanding. While such +"abstract concepts" are critical tools for image management and retrieval, +their automatic recognition is still a challenge, exactly because they rest at +the top of the "semantic pyramid": the well-known semantic gap problem is +worsened given their lack of unique perceptual referents, and their reliance on +more unspecific features than concrete concepts. Given that there seems to be +very scarce explicit work within CV on the task of abstract social concept +(ASC) detection, and that many recent works seem to discuss similar +non-concrete entities by using different terminology, in this survey we provide +a systematic review of CV work that explicitly or implicitly approaches the +problem of abstract (specifically social) concept detection from still images. +Specifically, this survey performs and provides: (1) A study and clustering of +high level visual understanding semantic elements from a multidisciplinary +perspective (computer science, visual studies, and cognitive perspectives); (2) +A study and clustering of high level visual understanding computer vision tasks +dealing with the identified semantic elements, so as to identify current CV +work that implicitly deals with AC detection. + +
+
+
+
+
+ + ☆ Spatial Transform Decoupling for Oriented Object Detection + + +
+ Vision Transformers (ViTs) have achieved remarkable success in computer +vision tasks. However, their potential in rotation-sensitive scenarios has not +been fully explored, and this limitation may be inherently attributed to the +lack of spatial invariance in the data-forwarding process. In this study, we +present a novel approach, termed Spatial Transform Decoupling (STD), providing +a simple-yet-effective solution for oriented object detection with ViTs. Built +upon stacked ViT blocks, STD utilizes separate network branches to predict the +position, size, and angle of bounding boxes, effectively harnessing the spatial +transform potential of ViTs in a divide-and-conquer fashion. Moreover, by +aggregating cascaded activation masks (CAMs) computed upon the regressed +parameters, STD gradually enhances features within regions of interest (RoIs), +which complements the self-attention mechanism. Without bells and whistles, STD +achieves state-of-the-art performance on the benchmark datasets including +DOTA-v1.0 (82.24% mAP) and HRSC2016 (98.55% mAP), which demonstrates the +effectiveness of the proposed method. Source code is available at +https://github.com/yuhongtian17/Spatial-Transform-Decoupling. + +
+
+
+
+
+ + ☆ Local Spherical Harmonics Improve Skeleton-Based Hand Action Recognition + + +
+ Hand action recognition is essential. Communication, human-robot +interactions, and gesture control are dependent on it. Skeleton-based action +recognition traditionally includes hands, which belong to the classes which +remain challenging to correctly recognize to date. We propose a method +specifically designed for hand action recognition which uses relative angular +embeddings and local Spherical Harmonics to create novel hand representations. +The use of Spherical Harmonics creates rotation-invariant representations which +make hand action recognition even more robust against inter-subject differences +and viewpoint changes. We conduct extensive experiments on the hand joints in +the First-Person Hand Action Benchmark with RGB-D Videos and 3D Hand Pose +Annotations, and on the NTU RGB+D 120 dataset, demonstrating the benefit of +using Local Spherical Harmonics Representations. Our code is available at +https://github.com/KathPra/LSHR_LSHT. + +
+
+
+
+
+ + ☆ Improving Diversity in Zero-Shot GAN Adaptation with Semantic Variations ICCV 2023 + + +
+ Training deep generative models usually requires a large amount of data. To +alleviate the data collection cost, the task of zero-shot GAN adaptation aims +to reuse well-trained generators to synthesize images of an unseen target +domain without any further training samples. Due to the data absence, the +textual description of the target domain and the vision-language models, e.g., +CLIP, are utilized to effectively guide the generator. However, with only a +single representative text feature instead of real images, the synthesized +images gradually lose diversity as the model is optimized, which is also known +as mode collapse. To tackle the problem, we propose a novel method to find +semantic variations of the target text in the CLIP space. Specifically, we +explore diverse semantic variations based on the informative text feature of +the target domain while regularizing the uncontrolled deviation of the semantic +information. With the obtained variations, we design a novel directional moment +loss that matches the first and second moments of image and text direction +distributions. Moreover, we introduce elastic weight consolidation and a +relation consistency loss to effectively preserve valuable content information +from the source domain, e.g., appearances. Through extensive experiments, we +demonstrate the efficacy of the proposed methods in ensuring sample diversity +in various scenarios of zero-shot GAN adaptation. We also conduct ablation +studies to validate the effect of each proposed component. Notably, our model +achieves a new state-of-the-art on zero-shot GAN adaptation in terms of both +diversity and quality. + +
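A hedged sketch of a directional moment loss in this spirit: match the mean and per-dimension variance of CLIP-space image-edit directions to those of the sampled text directions. The inputs are assumed to be precomputed, unit-normalized direction vectors; this is a simplified stand-in, not the paper's exact loss.

```python
import torch

def directional_moment_loss(img_dirs, txt_dirs):
    """img_dirs, txt_dirs: (n, d) tensors of CLIP-space direction vectors."""
    m1 = (img_dirs.mean(0) - txt_dirs.mean(0)).pow(2).sum()   # first-moment match
    m2 = (img_dirs.var(0) - txt_dirs.var(0)).pow(2).sum()     # second-moment match
    return m1 + m2
```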
+
+ comment: Accepted to ICCV 2023 (poster) +
+
+
+
+
+ + ☆ Learning Weakly Convex Regularizers for Convergent Image-Reconstruction + Algorithms + + +
+ We propose to learn non-convex regularizers with a prescribed upper bound on +their weak-convexity modulus. Such regularizers give rise to variational +denoisers that minimize a convex energy. They rely on few parameters (less than +15,000) and offer a signal-processing interpretation as they mimic handcrafted +sparsity-promoting regularizers. Through numerical experiments, we show that +such denoisers outperform convex-regularization methods as well as the popular +BM3D denoiser. Additionally, the learned regularizer can be deployed to solve +inverse problems with iterative schemes that provably converge. For both CT and +MRI reconstruction, the regularizer generalizes well and offers an excellent +tradeoff between performance, number of parameters, guarantees, and +interpretability when compared to other data-driven approaches. + +
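For intuition, a variational denoiser of this kind minimizes a quadratic data term plus the learned regularizer; when the regularizer's weak-convexity modulus is bounded by one, the overall objective remains convex and plain gradient descent converges. A minimal sketch, with `reg_grad` standing in for the learned regularizer's gradient:

```python
# Minimal sketch, assuming `reg_grad(x)` returns the gradient of the learned regularizer R.
import torch

def variational_denoise(y: torch.Tensor, reg_grad, step: float = 0.5, iters: int = 200):
    """Gradient descent on 0.5 * ||x - y||^2 + R(x); convex overall if R is 1-weakly convex."""
    x = y.clone()
    for _ in range(iters):
        x = x - step * ((x - y) + reg_grad(x))  # data-term gradient plus regularizer gradient
    return x
```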
+
+
+
+
+ + ☆ Joint learning of images and videos with a single Vision Transformer + + +
+ In this study, we propose a method for the joint learning of images and
+videos using a single model. Images and videos are usually trained with
+separate models. We instead propose a method in which a Vision Transformer,
+IV-ViT, takes a batch of images as input together with a set of video frames
+that are aggregated temporally by late fusion. Experimental results on two
+image datasets and two action recognition datasets are presented.
+
+
+ comment: MVA2023 (18th International Conference on Machine Vision + Applications), Hamamatsu, Japan, 23-25 July 2023 +
+
+
+
+
+ + ☆ SRFormer: Empowering Regression-Based Text Detection Transformer with + Segmentation + + +
+ Existing techniques for text detection can be broadly classified into two +primary groups: segmentation-based methods and regression-based methods. +Segmentation models offer enhanced robustness to font variations but require +intricate post-processing, leading to high computational overhead. +Regression-based methods undertake instance-aware prediction but face +limitations in robustness and data efficiency due to their reliance on +high-level representations. In our academic pursuit, we propose SRFormer, a +unified DETR-based model with amalgamated Segmentation and Regression, aiming +at the synergistic harnessing of the inherent robustness in segmentation +representations, along with the straightforward post-processing of +instance-level regression. Our empirical analysis indicates that favorable +segmentation predictions can be obtained at the initial decoder layers. In +light of this, we constrain the incorporation of segmentation branches to the +first few decoder layers and employ progressive regression refinement in +subsequent layers, achieving performance gains while minimizing additional +computational load from the mask. Furthermore, we propose a Mask-informed Query +Enhancement module. We take the segmentation result as a natural soft-ROI to +pool and extract robust pixel representations, which are then employed to +enhance and diversify instance queries. Extensive experimentation across +multiple benchmarks has yielded compelling findings, highlighting our method's +exceptional robustness, superior training and data efficiency, as well as its +state-of-the-art performance. + +
+
+
+
+
+ + ☆ LightDepth: Single-View Depth Self-Supervision from Illumination Decline + + +
+ Single-view depth estimation can be remarkably effective if there is enough
+ground-truth depth data for supervised training. However, there are scenarios,
+especially in medicine in the case of endoscopies, where such data cannot be
+obtained. In such cases, multi-view self-supervision and synthetic-to-real
+transfer serve as alternative approaches, although with a considerable
+performance reduction compared to the supervised case. Instead, we propose a
+single-view self-supervised method that achieves a performance similar to the
+supervised case. In some medical devices, such as endoscopes, the camera and
+light sources are co-located at a small distance from the target surfaces.
+Thus, we can exploit the fact that, for any given albedo and surface
+orientation, pixel brightness is inversely proportional to the square of the
+distance to the surface, providing a strong single-view self-supervisory
+signal. In our experiments, our self-supervised models deliver accuracies
+comparable to those of fully supervised ones, while being applicable without
+depth ground-truth data.
+
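The self-supervisory cue can be expressed very compactly: with the light co-located with the camera, observed brightness should fall off with the inverse square of the predicted depth. The sketch below only illustrates that cue (constant albedo/shading and a single global gain are simplifying assumptions), not the authors' pipeline.

```python
# Illustrative sketch of an inverse-square photometric self-supervision term.
import torch

def lightdepth_photometric_loss(pred_depth, image_intensity, eps=1e-6):
    """pred_depth, image_intensity: (B, 1, H, W) tensors."""
    predicted_brightness = 1.0 / (pred_depth.clamp(min=eps) ** 2)
    # normalize both maps so the comparison is robust to unknown global gain/exposure
    predicted_brightness = predicted_brightness / (predicted_brightness.mean(dim=(2, 3), keepdim=True) + eps)
    observed = image_intensity / (image_intensity.mean(dim=(2, 3), keepdim=True) + eps)
    return torch.abs(predicted_brightness - observed).mean()
```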
+
+
+
+
+ + ☆ Dataset Quantization + + +
+ State-of-the-art deep neural networks are trained with large amounts +(millions or even billions) of data. The expensive computation and memory costs +make it difficult to train them on limited hardware resources, especially for +recent popular large language models (LLM) and computer vision models (CV). +Recent popular dataset distillation methods are thus developed, aiming to +reduce the number of training samples via synthesizing small-scale datasets via +gradient matching. However, as the gradient calculation is coupled with the +specific network architecture, the synthesized dataset is biased and performs +poorly when used for training unseen architectures. To address these +limitations, we present dataset quantization (DQ), a new framework to compress +large-scale datasets into small subsets which can be used for training any +neural network architectures. Extensive experiments demonstrate that DQ is able +to generate condensed small datasets for training unseen network architectures +with state-of-the-art compression ratios for lossless model training. To the +best of our knowledge, DQ is the first method that can successfully distill +large-scale datasets such as ImageNet-1k with a state-of-the-art compression +ratio. Notably, with 60% data from ImageNet and 20% data from Alpaca's +instruction tuning data, the models can be trained with negligible or no +performance drop for both vision tasks (including classification, semantic +segmentation, and object detection) as well as language tasks (including +instruction tuning tasks such as BBH and DROP). + +
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information
+from multiple views of a shared context. Recent works intuitively apply
+contrastive learning to different views in a pairwise manner, which still has
+several limitations: view-specific noise is not filtered when learning
+view-shared representations; fake negative pairs, in which the negative terms
+actually belong to the same class as the positive, are treated the same as real
+negative pairs; and measuring the similarities between terms uniformly can
+interfere with optimization. Importantly, few works study the theoretical
+framework of generalized self-supervised multi-view learning, especially for
+more than two views. To this end, we rethink the existing multi-view learning
+paradigm from the perspective of information theory and then propose a novel
+information-theoretical framework for generalized multi-view learning. Guided
+by it, we build a multi-view coding method with a three-tier progressive
+architecture, namely Information theory-guided hierarchical Progressive
+Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the
+distribution between views to reduce view-specific noise. In the set-tier, IPMC
+constructs self-adjusted contrasting pools, which are adaptively modified by a
+view filter. Lastly, in the instance-tier, we adopt a designed unified loss to
+learn representations and reduce the gradient interference. Theoretically and
+empirically, we demonstrate the superiority of IPMC over state-of-the-art
+methods.
+
+
comment: Accepted by the journal Neural Networks (Elsevier), 2023. arXiv admin
  note: substantial text overlap with arXiv:2109.02344
+
+
+
+
+ + ☆ PHE-SICH-CT-IDS: A Benchmark CT Image Dataset for Evaluation Semantic + Segmentation, Object Detection and Radiomic Feature Extraction of + Perihematomal Edema in Spontaneous Intracerebral Hemorrhage + + +
+ Intracerebral hemorrhage is one of the diseases with the highest mortality
+and poorest prognosis worldwide. Spontaneous intracerebral hemorrhage (SICH)
+typically presents acutely, so prompt and expedited radiological examination is
+crucial for diagnosis, localization, and quantification of the hemorrhage.
+Early detection and accurate segmentation of perihematomal edema (PHE) play a
+critical role in guiding appropriate clinical intervention and enhancing
+patient prognosis. However, the progress and assessment of computer-aided
+diagnostic methods for PHE segmentation and detection face challenges due to
+the scarcity of publicly accessible brain CT image datasets. This study
+establishes a publicly available CT dataset named PHE-SICH-CT-IDS for
+perihematomal edema in spontaneous intracerebral hemorrhage. The dataset
+comprises 120 brain CT scans and 7,022 CT images, along with corresponding
+medical information of the patients. To demonstrate its effectiveness,
+classical algorithms for semantic segmentation, object detection, and radiomic
+feature extraction are evaluated. The experimental results confirm the
+suitability of PHE-SICH-CT-IDS for assessing the performance of segmentation,
+detection and radiomic feature extraction methods. To the best of our
+knowledge, this is the first publicly available dataset for PHE in SICH,
+comprising various data formats suitable for applications across diverse
+medical scenarios. We believe that PHE-SICH-CT-IDS will attract researchers to
+explore novel algorithms, providing valuable support for clinicians and
+patients in the clinical setting. PHE-SICH-CT-IDS is freely published for
+non-commercial purposes at:
+https://figshare.com/articles/dataset/PHE-SICH-CT-IDS/23957937.
+
+
+
+
+
+ + ☆ QD-BEV : Quantization-aware View-guided Distillation for Multi-view 3D + Object Detection ICCV 2023 + + +
+ Multi-view 3D detection based on BEV (bird-eye-view) has recently achieved +significant improvements. However, the huge memory consumption of +state-of-the-art models makes it hard to deploy them on vehicles, and the +non-trivial latency will affect the real-time perception of streaming +applications. Despite the wide application of quantization to lighten models, +we show in our paper that directly applying quantization in BEV tasks will 1) +make the training unstable, and 2) lead to intolerable performance degradation. +To solve these issues, our method QD-BEV enables a novel view-guided +distillation (VGD) objective, which can stabilize the quantization-aware +training (QAT) while enhancing the model performance by leveraging both image +features and BEV features. Our experiments show that QD-BEV achieves similar or +even better accuracy than previous methods with significant efficiency gains. +On the nuScenes datasets, the 4-bit weight and 6-bit activation quantized +QD-BEV-Tiny model achieves 37.2% NDS with only 15.8 MB model size, +outperforming BevFormer-Tiny by 1.8% with an 8x model compression. On the Small +and Base variants, QD-BEV models also perform superbly and achieve 47.9% NDS +(28.2 MB) and 50.9% NDS (32.9 MB), respectively. + +
+
comment: Accepted to ICCV 2023
+
+
+
+
+ + ☆ Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout + Analysis + + +
+ Understanding digital documents is like solving a puzzle, especially
+historical ones. Document Layout Analysis (DLA) helps with this puzzle by
+dividing documents into sections like paragraphs, images, and tables, which is
+crucial for machines to read and understand these documents. In the DL Sprint
+2.0 competition, we worked on understanding Bangla documents. We used the
+BaDLAD dataset, which contains a large number of examples, and trained a Mask
+R-CNN model for this task. We improved the model through step-by-step
+hyperparameter tuning and achieved a Dice score of 0.889. However, not
+everything went perfectly: a model trained on English documents did not
+transfer well to Bangla, showing that each language has its own challenges. Our
+solution for DL Sprint 2.0 is publicly available at
+https://www.kaggle.com/competitions/dlsprint2/discussion/432201 along with
+notebooks, weights, and an inference notebook.
+
+
comment: Contest paper for DL Sprint 2.0 (Link:
  https://www.kaggle.com/competitions/dlsprint2), Solution link:
  https://www.kaggle.com/competitions/dlsprint2/discussion/432201
+
+
+
+
+ + ☆ Frequency Compensated Diffusion Model for Real-scene Dehazing + + +
+ Due to distribution shift, deep learning based methods for image dehazing +suffer from performance degradation when applied to real-world hazy images. In +this paper, we consider a dehazing framework based on conditional diffusion +models for improved generalization to real haze. First, we find that optimizing +the training objective of diffusion models, i.e., Gaussian noise vectors, is +non-trivial. The spectral bias of deep networks hinders the higher frequency +modes in Gaussian vectors from being learned and hence impairs the +reconstruction of image details. To tackle this issue, we design a network +unit, named Frequency Compensation block (FCB), with a bank of filters that +jointly emphasize the mid-to-high frequencies of an input signal. We +demonstrate that diffusion models with FCB achieve significant gains in both +perceptual and distortion metrics. Second, to further boost the generalization +performance, we propose a novel data synthesis pipeline, HazeAug, to augment +haze in terms of degree and diversity. Within the framework, a solid baseline +for blind dehazing is set up where models are trained on synthetic hazy-clean +pairs, and directly generalize to real data. Extensive evaluations show that +the proposed dehazing diffusion model significantly outperforms +state-of-the-art methods on real-world images. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ An Examination of the Compositionality of Large Generative + Vision-Language Models + + +
+ With the success of Large Language Models (LLMs), a surge of Generative +Vision-Language Models (GVLMs) have been constructed via multimodal instruction +tuning. The tuning recipe substantially deviates from the common contrastive +vision-language learning. However, the performance of GVLMs in multimodal +compositional reasoning remains largely unexplored, as existing evaluation +metrics and benchmarks focus predominantly on assessing contrastive models like +CLIP. In this paper, we examine the potential evaluation metrics to assess the +GVLMs and hypothesize generative score methods are suitable for evaluating +compositionality. In addition, current benchmarks tend to prioritize syntactic +correctness over semantics. The presence of morphological bias in these +benchmarks can be exploited by GVLMs, leading to ineffective evaluations. To +combat this, we define a MorphoBias Score to quantify the morphological bias +and propose a novel LLM-based strategy to calibrate the bias. Moreover, a +challenging task is added to evaluate the robustness of GVLMs against inherent +inclination toward syntactic correctness. We include the calibrated dataset and +the task into a new benchmark, namely MOrphologicall De-biased Benchmark +(MODE). Our study provides the first unbiased benchmark for the +compositionality of GVLMs, facilitating future research in this direction. We +will release our code and datasets. + +
+
+
+
+
+ + ☆ Semantic Graph Representation Learning for Handwritten Mathematical + Expression Recognition + + +
+ Handwritten mathematical expression recognition (HMER) has attracted
+extensive attention recently. However, current methods cannot explicitly model
+the interactions between different symbols, and may therefore fail when faced
+with similar symbols. To alleviate this issue, we propose a simple but
+efficient method to enhance semantic interaction learning (SIL). Specifically,
+we first construct a semantic graph based on statistical symbol co-occurrence
+probabilities. Then we design a semantic-aware module (SAM), which projects the
+visual and classification features into a semantic space. The cosine distance
+between different projected vectors indicates the correlation between symbols,
+and jointly optimizing HMER and SIL explicitly enhances the model's
+understanding of symbol relationships. In addition, SAM can be easily plugged
+into existing attention-based models for HMER and consistently brings
+improvements. Extensive experiments on public benchmark datasets demonstrate
+that our proposed module can effectively enhance recognition performance. Our
+method achieves better recognition performance than prior arts on both the
+CROHME and HME100K datasets.
+
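As a rough sketch of the statistical co-occurrence graph mentioned above, the function below estimates how often pairs of symbols appear in the same expression; such probabilities could then serve as soft targets for the cosine similarities of projected symbol features. This is an illustration only; the paper's exact construction may differ.

```python
# Hypothetical sketch: symbol co-occurrence statistics from training label sequences.
import numpy as np

def cooccurrence_matrix(label_sequences, num_symbols):
    """label_sequences: iterable of lists of symbol ids, one list per expression."""
    counts = np.zeros((num_symbols, num_symbols), dtype=np.float64)
    for seq in label_sequences:
        for i in set(seq):
            for j in set(seq):
                if i != j:
                    counts[i, j] += 1.0
    row_sums = counts.sum(axis=1, keepdims=True)
    return counts / np.maximum(row_sums, 1.0)  # roughly P(symbol j co-occurs | symbol i)
```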
+
+ comment: 12 Pages +
+
+
+
+
+ + ☆ SynDrone -- Multi-modal UAV Dataset for Urban Scenarios ICCV + + +
+ The development of computer vision algorithms for Unmanned Aerial Vehicles +(UAVs) imagery heavily relies on the availability of annotated high-resolution +aerial data. However, the scarcity of large-scale real datasets with +pixel-level annotations poses a significant challenge to researchers as the +limited number of images in existing datasets hinders the effectiveness of deep +learning models that require a large amount of training data. In this paper, we +propose a multimodal synthetic dataset containing both images and 3D data taken +at multiple flying heights to address these limitations. In addition to +object-level annotations, the provided data also include pixel-level labeling +in 28 classes, enabling exploration of the potential advantages in tasks like +semantic segmentation. In total, our dataset contains 72k labeled samples that +allow for effective training of deep architectures showing promising results in +synthetic-to-real adaptation. The dataset will be made publicly available to +support the development of novel computer vision methods targeting UAV +applications. + +
+
+ comment: Accepted at ICCV Workshops, downloadable dataset with CC-BY license, + 8 pages, 4 figures, 8 tables +
+
+
+
+
+ + ☆ Texture Generation on 3D Meshes with Point-UV Diffusion ICCV 2023 + + +
+ In this work, we focus on synthesizing high-quality textures on 3D meshes. We +present Point-UV diffusion, a coarse-to-fine pipeline that marries the +denoising diffusion model with UV mapping to generate 3D consistent and +high-quality texture images in UV space. We start with introducing a point +diffusion model to synthesize low-frequency texture components with our +tailored style guidance to tackle the biased color distribution. The derived +coarse texture offers global consistency and serves as a condition for the +subsequent UV diffusion stage, aiding in regularizing the model to generate a +3D consistent UV texture image. Then, a UV diffusion model with hybrid +conditions is developed to enhance the texture fidelity in the 2D UV space. Our +method can process meshes of any genus, generating diversified, +geometry-compatible, and high-fidelity textures. Code is available at +https://cvmi-lab.github.io/Point-UV-Diffusion + +
+
+ comment: Accepted to ICCV 2023, Oral +
+
+
+
+
+ + ☆ Enhancing Medical Image Segmentation: Optimizing Cross-Entropy Weights + and Post-Processing with Autoencoders ICCV + + +
+ The task of medical image segmentation presents unique challenges, +necessitating both localized and holistic semantic understanding to accurately +delineate areas of interest, such as critical tissues or aberrant features. +This complexity is heightened in medical image segmentation due to the high +degree of inter-class similarities, intra-class variations, and possible image +obfuscation. The segmentation task further diversifies when considering the +study of histopathology slides for autoimmune diseases like dermatomyositis. +The analysis of cell inflammation and interaction in these cases has been less +studied due to constraints in data acquisition pipelines. Despite the +progressive strides in medical science, we lack a comprehensive collection of +autoimmune diseases. As autoimmune diseases globally escalate in prevalence and +exhibit associations with COVID-19, their study becomes increasingly essential. +While there is existing research that integrates artificial intelligence in the +analysis of various autoimmune diseases, the exploration of dermatomyositis +remains relatively underrepresented. In this paper, we present a deep-learning +approach tailored for Medical image segmentation. Our proposed method +outperforms the current state-of-the-art techniques by an average of 12.26% for +U-Net and 12.04% for U-Net++ across the ResNet family of encoders on the +dermatomyositis dataset. Furthermore, we probe the importance of optimizing +loss function weights and benchmark our methodology on three challenging +medical image segmentation tasks + +
+
+ comment: Accepted at ICCV CVAMD 2023 +
+
+
+
+
+ + ☆ ADNet: Lane Shape Prediction via Anchor Decomposition ICCV2023 + + +
+ In this paper, we revisit the limitations of anchor-based lane detection
+methods, which have predominantly focused on fixed anchors that stem from the
+edges of the image, disregarding their versatility and quality. To overcome the
+inflexibility of anchors, we decompose them into learning the heat map of
+starting points and their associated directions. This decomposition removes the
+limitations on the starting point of anchors, making our algorithm adaptable to
+different lane types in various datasets. To enhance the quality of anchors, we
+introduce the Large Kernel Attention (LKA) for Feature Pyramid Network (FPN).
+This significantly increases the receptive field, which is crucial for
+capturing sufficient context, as lane lines typically run throughout the entire
+image. We have named our proposed system the Anchor Decomposition Network
+(ADNet). Additionally, we propose the General Lane IoU (GLIoU) loss, which
+significantly improves the performance of ADNet in complex scenarios.
+Experimental results on three widely used lane detection benchmarks, VIL-100,
+CULane, and TuSimple, demonstrate that our approach outperforms the
+state-of-the-art methods on VIL-100 and exhibits competitive accuracy on CULane
+and TuSimple. Code and models will be released on
+https://github.com/Sephirex-X/ADNet.
+
+
+ comment: ICCV2023 accepted +
+
+
+
+
+ + ☆ STEERER: Resolving Scale Variations for Counting and Localization via + Selective Inheritance Learning ICCV2023 + + +
+ Scale variation is a deep-rooted problem in object counting, which has not +been effectively addressed by existing scale-aware algorithms. An important +factor is that they typically involve cooperative learning across +multi-resolutions, which could be suboptimal for learning the most +discriminative features from each scale. In this paper, we propose a novel +method termed STEERER (\textbf{S}elec\textbf{T}iv\textbf{E} +inh\textbf{ER}itance l\textbf{E}a\textbf{R}ning) that addresses the issue of +scale variations in object counting. STEERER selects the most suitable scale +for patch objects to boost feature extraction and only inherits discriminative +features from lower to higher resolution progressively. The main insights of +STEERER are a dedicated Feature Selection and Inheritance Adaptor (FSIA), which +selectively forwards scale-customized features at each scale, and a Masked +Selection and Inheritance Loss (MSIL) that helps to achieve high-quality +density maps across all scales. Our experimental results on nine datasets with +counting and localization tasks demonstrate the unprecedented scale +generalization ability of STEERER. Code is available at +\url{https://github.com/taohan10200/STEERER}. + +
+
+ comment: Accepted by ICCV2023, 9 pages +
+
+
+
+
+ + ☆ Privacy-Preserving Face Recognition Using Random Frequency Components ICCV 2023 + + +
+ The ubiquitous use of face recognition has sparked increasing privacy +concerns, as unauthorized access to sensitive face images could compromise the +information of individuals. This paper presents an in-depth study of the +privacy protection of face images' visual information and against recovery. +Drawing on the perceptual disparity between humans and models, we propose to +conceal visual information by pruning human-perceivable low-frequency +components. For impeding recovery, we first elucidate the seeming paradox +between reducing model-exploitable information and retaining high recognition +accuracy. Based on recent theoretical insights and our observation on model +attention, we propose a solution to the dilemma, by advocating for the training +and inference of recognition models on randomly selected frequency components. +We distill our findings into a novel privacy-preserving face recognition +method, PartialFace. Extensive experiments demonstrate that PartialFace +effectively balances privacy protection goals and recognition accuracy. Code is +available at: https://github.com/Tencent/TFace. + +
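A toy sketch of the two ingredients described above, pruning low-frequency components and operating on randomly selected frequency components, might look as follows. The cut-off, sampling scheme, and function name are assumptions for illustration and do not reproduce the released PartialFace code.

```python
# Illustrative sketch only: DCT, drop the perceptually dominant low band, then keep a
# random subset of the remaining coefficients for the recognition model to consume.
import numpy as np
from scipy.fft import dctn

def randomized_frequency_features(img, keep_ratio=0.5, low_cut=4, rng=None):
    """img: (H, W) grayscale array. Returns a DCT map with low frequencies zeroed and a
    random subset of the remaining coefficients retained."""
    rng = np.random.default_rng() if rng is None else rng
    coeffs = dctn(img, norm="ortho")
    coeffs[:low_cut, :low_cut] = 0.0                 # remove human-perceivable low frequencies
    mask = rng.random(coeffs.shape) < keep_ratio     # randomly selected frequency components
    return coeffs * mask
```

A recognition model would then be trained and run on such randomized frequency maps instead of RGB pixels, which is the intuition behind the privacy/accuracy trade-off discussed above.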
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ DOMINO++: Domain-aware Loss Regularization for Deep Learning + Generalizability MICCAI + + +
+ Out-of-distribution (OOD) generalization poses a serious challenge for modern +deep learning (DL). OOD data consists of test data that is significantly +different from the model's training data. DL models that perform well on +in-domain test data could struggle on OOD data. Overcoming this discrepancy is +essential to the reliable deployment of DL. Proper model calibration decreases +the number of spurious connections that are made between model features and +class outputs. Hence, calibrated DL can improve OOD generalization by only +learning features that are truly indicative of the respective classes. Previous +work proposed domain-aware model calibration (DOMINO) to improve DL +calibration, but it lacks designs for model generalizability to OOD data. In +this work, we propose DOMINO++, a dual-guidance and dynamic domain-aware loss +regularization focused on OOD generalizability. DOMINO++ integrates +expert-guided and data-guided knowledge in its regularization. Unlike DOMINO +which imposed a fixed scaling and regularization rate, DOMINO++ designs a +dynamic scaling factor and an adaptive regularization rate. Comprehensive +evaluations compare DOMINO++ with DOMINO and the baseline model for head tissue +segmentation from magnetic resonance images (MRIs) on OOD data. The OOD data +consists of synthetic noisy and rotated datasets, as well as real data using a +different MRI scanner from a separate site. DOMINO++'s superior performance +demonstrates its potential to improve the trustworthy deployment of DL on real +clinical data. + +
+
+ comment: 12 pages, 5 figures, 5 tables, Accepted by the International + Conference on Medical Image Computing and Computer Assisted Intervention + (MICCAI) 2023 +
+
+
+
+
+ + ☆ COCA: Classifier-Oriented Calibration for Source-Free Universal Domain + Adaptation via Textual Prototype + + +
+ Universal Domain Adaptation (UniDA) aims to distinguish common and private +classes between the source and target domains where domain shift exists. +Recently, due to more stringent data restrictions, researchers have introduced +Source-Free UniDA (SF-UniDA) in more realistic scenarios. SF-UniDA methods +eliminate the need for direct access to source samples when performing +adaptation to the target domain. However, existing SF-UniDA methods still +require an extensive quantity of labeled source samples to train a source +model, resulting in significant labeling costs. To tackle this issue, we +present a novel Classifier-Oriented Calibration (COCA) method. This method, +which leverages textual prototypes, is formulated for the source model based on +few-shot learning. Specifically, we propose studying few-shot learning, usually +explored for closed-set scenarios, to identify common and domain-private +classes despite a significant domain shift between source and target domains. +Essentially, we present a novel paradigm based on the vision-language model to +learn SF-UniDA and hugely reduce the labeling costs on the source domain. +Experimental results demonstrate that our approach outperforms state-of-the-art +UniDA and SF-UniDA models. + +
+
+
+
+
+ + ☆ CVFC: Attention-Based Cross-View Feature Consistency for Weakly + Supervised Semantic Segmentation of Pathology Images + + +
+ Histopathology image segmentation is the gold standard for diagnosing cancer
+and can indicate cancer prognosis. However, histopathology image segmentation
+requires high-quality masks, so many studies now use image-level labels to
+achieve pixel-level segmentation and reduce the need for fine-grained
+annotation. To solve this problem, we propose CVFC, an attention-based
+cross-view feature consistency end-to-end pseudo-mask generation framework.
+Specifically, CVFC is a three-branch joint framework composed of two ResNet38
+branches and one ResNet50 branch. Each independent branch integrates
+multi-scale feature maps to generate a class activation map (CAM), and
+down-sampling and expansion are used to adjust the size of the CAM in each
+branch. The middle branch projects the feature matrix into query and key
+feature spaces and generates a feature-space perception matrix through a
+connection layer and inner product to adjust and refine the CAM of each branch.
+Finally, a feature consistency loss and a feature cross loss are used to
+optimize the parameters of CVFC in a co-training mode. In extensive
+experiments, an IoU of 0.7122 and an fwIoU of 0.7018 are obtained on the
+WSSS4LUAD dataset, outperforming HistoSegNet, SEAM, C-CAM, WSSS-Tissue, and
+OEEM.
+
+
+ comment: Submitted to BIBM2023 +
+
+
+
+
+ + ☆ Explore and Tell: Embodied Visual Captioning in 3D Environments ICCV 2023 + + +
+ While current visual captioning models have achieved impressive performance, +they often assume that the image is well-captured and provides a complete view +of the scene. In real-world scenarios, however, a single image may not offer a +good viewpoint, hindering fine-grained scene understanding. To overcome this +limitation, we propose a novel task called Embodied Captioning, which equips +visual captioning models with navigation capabilities, enabling them to +actively explore the scene and reduce visual ambiguity from suboptimal +viewpoints. Specifically, starting at a random viewpoint, an agent must +navigate the environment to gather information from different viewpoints and +generate a comprehensive paragraph describing all objects in the scene. To +support this task, we build the ET-Cap dataset with Kubric simulator, +consisting of 10K 3D scenes with cluttered objects and three annotated +paragraphs per scene. We propose a Cascade Embodied Captioning model (CaBOT), +which comprises of a navigator and a captioner, to tackle this task. The +navigator predicts which actions to take in the environment, while the +captioner generates a paragraph description based on the whole navigation +trajectory. Extensive experiments demonstrate that our model outperforms other +carefully designed baselines. Our dataset, codes and models are available at +https://aim3-ruc.github.io/ExploreAndTell. + +
+
+ comment: 12 pages; 10 figures; ICCV 2023 +
+
+
+
+
+ + ☆ LDCSF: Local depth convolution-based Swim framework for classifying + multi-label histopathology images + + +
+ Histopathological images are the gold standard for diagnosing liver cancer.
+However, the accuracy of fully digital diagnosis in computational pathology
+needs to be improved. In this paper, to address the multi-label nature and low
+classification accuracy of histopathology images, we propose a local depth
+convolution-based Swim framework (LDCSF) to classify multi-label histopathology
+images. To provide diagnostic results for local fields of view, the LDCSF model
+consists of a Swin transformer module, a local depth convolution (LDC) module,
+a feature reconstruction (FR) module, and a ResNet module. The Swin transformer
+module reduces the amount of computation generated by the attention mechanism
+by limiting attention to each window. The LDC then reconstructs the attention
+map and performs convolution operations over multiple channels, passing the
+resulting feature map to the next layer. The FR module takes the corresponding
+weight coefficient vectors obtained from the channels and computes their dot
+product with the original feature map vector matrix to generate representative
+feature maps. Finally, a residual network performs the final classification. As
+a result, the classification accuracy of LDCSF for interstitial area, necrosis,
+non-tumor and tumor reaches 0.9460, 0.9960, 0.9808, and 0.9847, respectively.
+We then use the multi-label classification results to calculate the
+tumor-to-stromal ratio, which lays the foundation for analyzing the
+microenvironment of liver cancer histopathological images. We also release a
+multi-label histopathology image dataset of liver cancer; our code and data are
+available at https://github.com/panliangrui/LSF.
+
+
+ comment: Submitted to BIBM2023 +
+
+
+
+
+ + ☆ When Prompt-based Incremental Learning Does Not Meet Strong Pretraining ICCV 2023 + + +
+ Incremental learning aims to overcome catastrophic forgetting when learning
+deep networks from sequential tasks. With impressive learning efficiency and
+performance, prompt-based methods adopt a fixed backbone to sequential tasks by
+learning task-specific prompts. However, existing prompt-based methods heavily
+rely on strong pretraining (typically trained on ImageNet-21k), and we find
+that their models could be trapped if the potential gap between the pretraining
+task and unknown future tasks is large. In this work, we develop a learnable
+Adaptive Prompt Generator (APG). The key is to unify the prompt retrieval and
+prompt learning processes into a learnable prompt generator. Hence, the whole
+prompting process can be optimized to reduce the negative effects of the gap
+between tasks effectively. To make our APG avoid learning ineffective
+knowledge, we maintain a knowledge pool to regularize APG with the feature
+distribution of each class. Extensive experiments show that our method
+significantly outperforms advanced methods in exemplar-free incremental
+learning without (strong) pretraining. Besides, under strong pretraining, our
+method also has comparable performance to existing prompt-based models, showing
+that our method can still benefit from pretraining. Codes can be found at
+https://github.com/TOM-tym/APG
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ X-VoE: Measuring eXplanatory Violation of Expectation in Physical Events ICCV 2023 + + +
+ Intuitive physics is pivotal for human understanding of the physical world, +enabling prediction and interpretation of events even in infancy. Nonetheless, +replicating this level of intuitive physics in artificial intelligence (AI) +remains a formidable challenge. This study introduces X-VoE, a comprehensive +benchmark dataset, to assess AI agents' grasp of intuitive physics. Built on +the developmental psychology-rooted Violation of Expectation (VoE) paradigm, +X-VoE establishes a higher bar for the explanatory capacities of intuitive +physics models. Each VoE scenario within X-VoE encompasses three distinct +settings, probing models' comprehension of events and their underlying +explanations. Beyond model evaluation, we present an explanation-based learning +system that captures physics dynamics and infers occluded object states solely +from visual sequences, without explicit occlusion labels. Experimental outcomes +highlight our model's alignment with human commonsense when tested against +X-VoE. A remarkable feature is our model's ability to visually expound VoE +events by reconstructing concealed scenes. Concluding, we discuss the findings' +implications and outline future research directions. Through X-VoE, we catalyze +the advancement of AI endowed with human-like intuitive physics capabilities. + +
+
+ comment: 19 pages, 16 figures, selected for an Oral presentation at ICCV 2023. + Project link: https://pku.ai/publication/intuitive2023iccv/ +
+
+
+
+
+ + ☆ Efficient Joint Optimization of Layer-Adaptive Weight Pruning in Deep + Neural Networks + + +
+ In this paper, we propose a novel layer-adaptive weight-pruning approach for +Deep Neural Networks (DNNs) that addresses the challenge of optimizing the +output distortion minimization while adhering to a target pruning ratio +constraint. Our approach takes into account the collective influence of all +layers to design a layer-adaptive pruning scheme. We discover and utilize a +very important additivity property of output distortion caused by pruning +weights on multiple layers. This property enables us to formulate the pruning +as a combinatorial optimization problem and efficiently solve it through +dynamic programming. By decomposing the problem into sub-problems, we achieve +linear time complexity, making our optimization algorithm fast and feasible to +run on CPUs. Our extensive experiments demonstrate the superiority of our +approach over existing methods on the ImageNet and CIFAR-10 datasets. On +CIFAR-10, our method achieves remarkable improvements, outperforming others by +up to 1.0% for ResNet-32, 0.5% for VGG-16, and 0.7% for DenseNet-121 in terms +of top-1 accuracy. On ImageNet, we achieve up to 4.7% and 4.6% higher top-1 +accuracy compared to other methods for VGG-16 and ResNet-50, respectively. +These results highlight the effectiveness and practicality of our approach for +enhancing DNN performance through layer-adaptive weight pruning. Code will be +available on https://github.com/Akimoto-Cris/RD_VIT_PRUNE. + +
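The additivity property makes the allocation problem look like a knapsack: each layer contributes a distortion that depends only on how much of it is kept, and the per-layer choices must respect a global budget. The dynamic program below is a simple pseudo-polynomial illustration of that formulation (it does not reproduce the paper's linear-time decomposition), with the per-layer distortion table assumed to be precomputed.

```python
# Illustrative knapsack-style DP over layers; distortion[i][k] is the (assumed given)
# output distortion if layer i keeps k budget units, with k = 0 always allowed.
def allocate_pruning(distortion, budget):
    """Returns (minimal total distortion, per-layer keep counts) under the global budget."""
    INF = float("inf")
    num_layers = len(distortion)
    dp = [0.0] + [INF] * budget          # dp[b]: best total distortion using exactly b units
    back = []
    for i in range(num_layers):
        new_dp = [INF] * (budget + 1)
        bk = [-1] * (budget + 1)         # which k was chosen for layer i to reach each state
        for b in range(budget + 1):
            if dp[b] == INF:
                continue
            for k, d in enumerate(distortion[i]):
                if b + k <= budget and dp[b] + d < new_dp[b + k]:
                    new_dp[b + k] = dp[b] + d
                    bk[b + k] = k
        dp = new_dp
        back.append(bk)
    best_b = min(range(budget + 1), key=lambda b: dp[b])
    keeps, b = [], best_b
    for i in reversed(range(num_layers)):
        k = back[i][b]
        keeps.append(k)
        b -= k
    return dp[best_b], list(reversed(keeps))
```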
+
+
+
+
+ + ☆ UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D + Representation for 3D Perception in Autonomous Driving + + +
+ Masked Autoencoders (MAE) play a pivotal role in learning potent +representations, delivering outstanding results across various 3D perception +tasks essential for autonomous driving. In real-world driving scenarios, it's +commonplace to deploy multiple sensors for comprehensive environment +perception. While integrating multi-modal features from these sensors can +produce rich and powerful features, there is a noticeable gap in MAE methods +addressing this integration. This research delves into multi-modal Masked +Autoencoders tailored for a unified representation space in autonomous driving, +aiming to pioneer a more efficient fusion of two distinct modalities. To +intricately marry the semantics inherent in images with the geometric +intricacies of LiDAR point clouds, the UniM$^2$AE is proposed. This model +stands as a potent yet straightforward, multi-modal self-supervised +pre-training framework, mainly consisting of two designs. First, it projects +the features from both modalities into a cohesive 3D volume space, ingeniously +expanded from the bird's eye view (BEV) to include the height dimension. The +extension makes it possible to back-project the informative features, obtained +by fusing features from both modalities, into their native modalities to +reconstruct the multiple masked inputs. Second, the Multi-modal 3D Interactive +Module (MMIM) is invoked to facilitate the efficient inter-modal interaction +during the interaction process. Extensive experiments conducted on the nuScenes +Dataset attest to the efficacy of UniM$^2$AE, indicating enhancements in 3D +object detection and BEV map segmentation by 1.2\%(NDS) and 6.5\% (mIoU), +respectively. Code is available at https://github.com/hollow-503/UniM2AE. + +
+
+ comment: Code available at https://github.com/hollow-503/UniM2AE +
+
+
+
+
+ + ☆ The Change You Want to See (Now in 3D) + + +
+ The goal of this paper is to detect what has changed, if anything, between +two "in the wild" images of the same 3D scene acquired from different camera +positions and at different temporal instances. The open-set nature of this +problem, occlusions/dis-occlusions due to the shift in viewpoint, and the lack +of suitable training datasets, presents substantial challenges in devising a +solution. + To address this problem, we contribute a change detection model that is +trained entirely on synthetic data and is class-agnostic, yet it is performant +out-of-the-box on real world images without requiring fine-tuning. Our solution +entails a "register and difference" approach that leverages self-supervised +frozen embeddings and feature differences, which allows the model to generalise +to a wide variety of scenes and domains. The model is able to operate directly +on two RGB images, without requiring access to ground truth camera intrinsics, +extrinsics, depth maps, point clouds, or additional before-after images. +Finally, we collect and release a new evaluation dataset consisting of +real-world image pairs with human-annotated differences and demonstrate the +efficacy of our method. The code, datasets and pre-trained model can be found +at: https://github.com/ragavsachdeva/CYWS-3D + +
+
+
+
+
+ + ☆ In-Rack Test Tube Pose Estimation Using RGB-D Data + + +
+ Accurate robotic manipulation of test tubes in biology and medical industries +is becoming increasingly important to address workforce shortages and improve +worker safety. The detection and localization of test tubes are essential for +the robots to successfully manipulate test tubes. In this paper, we present a +framework to detect and estimate poses for the in-rack test tubes using color +and depth data. The methodology involves the utilization of a YOLO object +detector to effectively classify and localize both the test tubes and the tube +racks within the provided image data. Subsequently, the pose of the tube rack +is estimated through point cloud registration techniques. During the process of +estimating the poses of the test tubes, we capitalize on constraints derived +from the arrangement of rack slots. By employing an optimization-based +algorithm, we effectively evaluate and refine the pose of the test tubes. This +strategic approach ensures the robustness of pose estimation, even when +confronted with noisy and incomplete point cloud data. + +
+
+ comment: Submit to IEEE ROBIO 2023 +
+
+
+
+
+ + ☆ Turning a CLIP Model into a Scene Text Spotter + + +
+ We exploit the potential of the large-scale Contrastive Language-Image +Pretraining (CLIP) model to enhance scene text detection and spotting tasks, +transforming it into a robust backbone, FastTCM-CR50. This backbone utilizes +visual prompt learning and cross-attention in CLIP to extract image and +text-based prior knowledge. Using predefined and learnable prompts, +FastTCM-CR50 introduces an instance-language matching process to enhance the +synergy between image and text embeddings, thereby refining text regions. Our +Bimodal Similarity Matching (BSM) module facilitates dynamic language prompt +generation, enabling offline computations and improving performance. +FastTCM-CR50 offers several advantages: 1) It can enhance existing text +detectors and spotters, improving performance by an average of 1.7% and 1.5%, +respectively. 2) It outperforms the previous TCM-CR50 backbone, yielding an +average improvement of 0.2% and 0.56% in text detection and spotting tasks, +along with a 48.5% increase in inference speed. 3) It showcases robust few-shot +training capabilities. Utilizing only 10% of the supervised data, FastTCM-CR50 +improves performance by an average of 26.5% and 5.5% for text detection and +spotting tasks, respectively. 4) It consistently enhances performance on +out-of-distribution text detection and spotting datasets, particularly the +NightTime-ArT subset from ICDAR2019-ArT and the DOTA dataset for oriented +object detection. The code is available at https://github.com/wenwenyu/TCM. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2302.14338 +
+
+
+
+
+ + ☆ Simple Baselines for Interactive Video Retrieval with Questions and + Answers ICCV 2023 + + +
+ To date, the majority of video retrieval systems have been optimized for a +"single-shot" scenario in which the user submits a query in isolation, ignoring +previous interactions with the system. Recently, there has been renewed +interest in interactive systems to enhance retrieval, but existing approaches +are complex and deliver limited gains in performance. In this work, we revisit +this topic and propose several simple yet effective baselines for interactive +video retrieval via question-answering. We employ a VideoQA model to simulate +user interactions and show that this enables the productive study of the +interactive retrieval task without access to ground truth dialogue data. +Experiments on MSR-VTT, MSVD, and AVSD show that our framework using +question-based interaction significantly improves the performance of text-based +video retrieval systems. + +
+
+ comment: ICCV 2023, project page: + https://github.com/kevinliang888/IVR-QA-baselines +
+
+
+
+
+ + ☆ Long-Term Prediction of Natural Video Sequences with Robust Video + Predictors + + +
+ Predicting high-dimensional video sequences is a curiously difficult problem.
+The number of possible futures for a given video sequence grows exponentially
+over time due to uncertainty. This is especially evident when trying to predict
+complicated natural video scenes from a limited snapshot of the world. The
+inherent uncertainty accumulates the further into the future one predicts,
+making long-term prediction very difficult. In this work, we introduce a number
+of improvements to existing work that aid in creating Robust Video Predictors
+(RoViPs). We show that with a combination of deep perceptual and
+uncertainty-based reconstruction losses we are able to create high-quality
+short-term predictions. Attention-based skip connections are utilised to allow
+for long-range spatial movement of input features to further improve
+performance. Finally, we show that by simply making the predictor robust to its
+own prediction errors, it is possible to produce very long, realistic natural
+video sequences using an iterated single-step prediction task.
+
+
+
+
+
+ + ☆ Audio-Visual Class-Incremental Learning ICCV 2023 + + +
+ In this paper, we introduce audio-visual class-incremental learning, a +class-incremental learning scenario for audio-visual video recognition. We +demonstrate that joint audio-visual modeling can improve class-incremental +learning, but current methods fail to preserve semantic similarity between +audio and visual features as incremental step grows. Furthermore, we observe +that audio-visual correlations learned in previous tasks can be forgotten as +incremental steps progress, leading to poor performance. To overcome these +challenges, we propose AV-CIL, which incorporates Dual-Audio-Visual Similarity +Constraint (D-AVSC) to maintain both instance-aware and class-aware semantic +similarity between audio-visual modalities and Visual Attention Distillation +(VAD) to retain previously learned audio-guided visual attentive ability. We +create three audio-visual class-incremental datasets, AVE-Class-Incremental +(AVE-CI), Kinetics-Sounds-Class-Incremental (K-S-CI), and +VGGSound100-Class-Incremental (VS100-CI) based on the AVE, Kinetics-Sounds, and +VGGSound datasets, respectively. Our experiments on AVE-CI, K-S-CI, and +VS100-CI demonstrate that AV-CIL significantly outperforms existing +class-incremental learning methods in audio-visual class-incremental learning. +Code and data are available at: https://github.com/weiguoPian/AV-CIL_ICCV2023. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ TeD-SPAD: Temporal Distinctiveness for Self-supervised + Privacy-preservation for video Anomaly Detection ICCV 2023 + + +
+ Video anomaly detection (VAD) without human monitoring is a complex computer +vision task that can have a positive impact on society if implemented +successfully. While recent advances have made significant progress in solving +this task, most existing approaches overlook a critical real-world concern: +privacy. With the increasing popularity of artificial intelligence +technologies, it becomes crucial to implement proper AI ethics into their +development. Privacy leakage in VAD allows models to pick up and amplify +unnecessary biases related to people's personal information, which may lead to +undesirable decision making. In this paper, we propose TeD-SPAD, a +privacy-aware video anomaly detection framework that destroys visual private +information in a self-supervised manner. In particular, we propose the use of a +temporally-distinct triplet loss to promote temporally discriminative features, +which complements current weakly-supervised VAD methods. Using TeD-SPAD, we +achieve a positive trade-off between privacy protection and utility anomaly +detection performance on three popular weakly supervised VAD datasets: +UCF-Crime, XD-Violence, and ShanghaiTech. Our proposed anonymization model +reduces private attribute prediction by 32.25% while only reducing frame-level +ROC AUC on the UCF-Crime anomaly detection dataset by 3.69%. Project Page: +https://joefioresi718.github.io/TeD-SPAD_webpage/ + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Temporal-Distributed Backdoor Attack Against Video Based Action + Recognition + + +
+ Deep neural networks (DNNs) have achieved tremendous success in various +applications including video action recognition, yet remain vulnerable to +backdoor attacks (Trojans). The backdoor-compromised model will mis-classify to +the target class chosen by the attacker when a test instance (from a non-target +class) is embedded with a specific trigger, while maintaining high accuracy on +attack-free instances. Although there are extensive studies on backdoor attacks +against image data, the susceptibility of video-based systems under backdoor +attacks remains largely unexplored. Current studies are direct extensions of +approaches proposed for image data, e.g., the triggers are +\textbf{independently} embedded within the frames, which tend to be detectable +by existing defenses. In this paper, we introduce a \textit{simple} yet +\textit{effective} backdoor attack against video data. Our proposed attack, +adding perturbations in a transformed domain, plants an \textbf{imperceptible, +temporally distributed} trigger across the video frames, and is shown to be +resilient to existing defensive strategies. The effectiveness of the proposed +attack is demonstrated by extensive experiments with various well-known models +on two video recognition benchmarks, UCF101 and HMDB51, and a sign language +recognition benchmark, Greek Sign Language (GSL) dataset. We delve into the +impact of several influential factors on our proposed attack and identify an +intriguing effect termed "collateral damage" through extensive studies. + +
+
+
+
+
+ + ☆ MetaGCD: Learning to Continually Learn in Generalized Category Discovery ICCV2023 + + +
+ In this paper, we consider a real-world scenario where a model that is +trained on pre-defined classes continually encounters unlabeled data that +contains both known and novel classes. The goal is to continually discover +novel classes while maintaining the performance in known classes. We name the +setting Continual Generalized Category Discovery (C-GCD). Existing methods for +novel class discovery cannot directly handle the C-GCD setting due to some +unrealistic assumptions, such as the unlabeled data only containing novel +classes. Furthermore, they fail to discover novel classes in a continual +fashion. In this work, we lift all these assumptions and propose an approach, +called MetaGCD, to learn how to incrementally discover with less forgetting. +Our proposed method uses a meta-learning framework and leverages the offline +labeled data to simulate the testing incremental learning process. A +meta-objective is defined to revolve around two conflicting learning objectives +to achieve novel class discovery without forgetting. Furthermore, a soft +neighborhood-based contrastive network is proposed to discriminate uncorrelated +images while attracting correlated images. We build strong baselines and +conduct extensive experiments on three widely used benchmarks to demonstrate +the superiority of our method. + +
+
+ comment: This paper has been accepted by ICCV2023 +
+
+
+
+
+ + ☆ UnLoc: A Unified Framework for Video Localization Tasks ICCV 2023 + + +
+ While large-scale image-text pretrained models such as CLIP have been used
+for multiple video-level tasks on trimmed videos, their use for temporal
+localization in untrimmed videos is still a relatively unexplored task. We
+design a new approach for this called UnLoc, which uses pretrained image and
+text towers, and feeds tokens to a video-text fusion model. The outputs of the
+fusion module are then used to construct a feature pyramid in which each level
+connects to a head to predict a per-frame relevancy score and start/end time
+displacements. Unlike previous works, our architecture enables Moment
+Retrieval, Temporal Localization, and Action Segmentation with a single-stage
+model, without the need for action proposals, motion-based pretrained features
+or representation masking. Unlike specialized models, we achieve
+state-of-the-art results on all three different localization tasks with a
+unified approach. Code will be available at:
+\url{https://github.com/google-research/scenic}.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Beyond Discriminative Regions: Saliency Maps as Alternatives to CAMs for + Weakly Supervised Semantic Segmentation + + +
+ In recent years, several Weakly Supervised Semantic Segmentation (WS3) +methods have been proposed that use class activation maps (CAMs) generated by a +classifier to produce pseudo-ground truths for training segmentation models. +While CAMs are good at highlighting discriminative regions (DR) of an image, +they are known to disregard regions of the object that do not contribute to the +classifier's prediction, termed non-discriminative regions (NDR). In contrast, +attribution methods such as saliency maps provide an alternative approach for +assigning a score to every pixel based on its contribution to the +classification prediction. This paper provides a comprehensive comparison +between saliencies and CAMs for WS3. Our study includes multiple perspectives +on understanding their similarities and dissimilarities. Moreover, we provide +new evaluation metrics that perform a comprehensive assessment of WS3 +performance of alternative methods w.r.t. CAMs. We demonstrate the +effectiveness of saliencies in addressing the limitation of CAMs through our +empirical studies on benchmark datasets. Furthermore, we propose random +cropping as a stochastic aggregation technique that improves the performance of +saliency, making it a strong alternative to CAM for WS3. + +
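The random-cropping aggregation idea can be sketched in a few lines: compute saliency on several random crops and average the per-pixel scores where crops overlap. The crop ratio, number of crops, and the `saliency_fn` interface below are illustrative assumptions rather than the paper's exact procedure.

```python
# Hypothetical sketch of stochastic aggregation of saliency maps over random crops.
import torch

def aggregate_saliency(image, saliency_fn, num_crops=8, crop=0.8):
    """image: (C, H, W) tensor; saliency_fn(crop_image) -> (h, w) saliency tensor."""
    _, H, W = image.shape
    acc = torch.zeros(H, W)
    cnt = torch.zeros(H, W)
    ch, cw = int(H * crop), int(W * crop)
    for _ in range(num_crops):
        top = torch.randint(0, H - ch + 1, (1,)).item()
        left = torch.randint(0, W - cw + 1, (1,)).item()
        sal = saliency_fn(image[:, top:top + ch, left:left + cw])
        acc[top:top + ch, left:left + cw] += sal
        cnt[top:top + ch, left:left + cw] += 1
    return acc / cnt.clamp(min=1)  # averaged saliency, usable as a WS3 pseudo ground truth
```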
+
+ comment: 24 pages, 13 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ YOLOBench: Benchmarking Efficient Object Detectors on Embedded Systems + + +
+ We present YOLOBench, a benchmark comprised of 550+ YOLO-based object +detection models on 4 different datasets and 4 different embedded hardware +platforms (x86 CPU, ARM CPU, Nvidia GPU, NPU). We collect accuracy and latency +numbers for a variety of YOLO-based one-stage detectors at different model +scales by performing a fair, controlled comparison of these detectors with a +fixed training environment (code and training hyperparameters). +Pareto-optimality analysis of the collected data reveals that, if modern +detection heads and training techniques are incorporated into the learning +process, multiple architectures of the YOLO series achieve a good +accuracy-latency trade-off, including older models like YOLOv3 and YOLOv4. We +also evaluate training-free accuracy estimators used in neural architecture +search on YOLOBench and demonstrate that, while most state-of-the-art zero-cost +accuracy estimators are outperformed by a simple baseline like MAC count, some +of them can be effectively used to predict Pareto-optimal detection models. We +showcase that by using a zero-cost proxy to identify a YOLO architecture +competitive against a state-of-the-art YOLOv8 model on a Raspberry Pi 4 CPU. +The code and data are available at +https://github.com/Deeplite/deeplite-torch-zoo + +
+
+
+
+
+ + ♻ ☆ A Gated Attention Transformer for Multi-Person Pose Tracking ICCV + + +
+ Multi-person pose tracking is an important element for many applications and +requires to estimate the human poses of all persons in a video and to track +them over time. The association of poses across frames remains an open research +problem, in particular for online tracking methods, due to motion blur, crowded +scenes and occlusions. To tackle the association challenge, we propose a Gated +Attention Transformer. The core aspect of our model is the gating mechanism +that automatically adapts the impact of appearance embeddings and embeddings +based on temporal pose similarity in the attention layers. In order to +re-identify persons that have been occluded, we incorporate a pose-conditioned +re-identification network that provides initial embeddings and allows to match +persons even if the number of visible joints differ between frames. We further +propose a matching layer based on gated attention for pose-to-track association +and duplicate removal. We evaluate our approach on PoseTrack 2018 and +PoseTrack21. + +
+
+ comment: Accepted to ICCVW23 +
+
+
+
+
+ + ♻ ☆ Multi-Directional Subspace Editing in Style-Space + + +
+ This paper describes a new technique for finding disentangled semantic +directions in the latent space of StyleGAN. Our method identifies meaningful +orthogonal subspaces that allow editing of one human face attribute, while +minimizing undesired changes in other attributes. Our model is capable of +editing a single attribute in multiple directions, resulting in a range of +possible generated images. We compare our scheme with three state-of-the-art +models and show that our method outperforms them in terms of face editing and +disentanglement capabilities. Additionally, we suggest quantitative measures +for evaluating attribute separation and disentanglement, and exhibit the +superiority of our model with respect to those measures. + +
+
+
+
+
+ + ♻ ☆ Rethinking Data Distillation: Do Not Overlook Calibration ICCV 2023 + + +
+ Neural networks trained on distilled data often produce over-confident output +and require correction by calibration methods. Existing calibration methods +such as temperature scaling and mixup work well for networks trained on +original large-scale data. However, we find that these methods fail to +calibrate networks trained on data distilled from large source datasets. In +this paper, we show that distilled data lead to networks that are not +calibratable due to (i) a more concentrated distribution of the maximum logits +and (ii) the loss of information that is semantically meaningful but unrelated +to classification tasks. To address this problem, we propose Masked Temperature +Scaling (MTS) and Masked Distillation Training (MDT) which mitigate the +limitations of distilled data and achieve better calibration results while +maintaining the efficiency of dataset distillation. + +
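+ For reference, the standard temperature scaling that the abstract builds on
+can be fit as below on held-out logits; this is a sketch of the baseline only
+(the masked variants MTS/MDT are not reproduced), and the logits/labels are
+random stand-ins.
+
+import torch
+
+def fit_temperature(logits, labels, steps=200, lr=0.01):
+    # Optimize a single temperature T (parameterized as exp(log_t)) to
+    # minimize cross-entropy on a validation set.
+    log_t = torch.zeros(1, requires_grad=True)
+    opt = torch.optim.Adam([log_t], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss = torch.nn.functional.cross_entropy(logits / log_t.exp(), labels)
+        loss.backward()
+        opt.step()
+    return log_t.exp().item()
+
+T = fit_temperature(torch.randn(1000, 10) * 5, torch.randint(0, 10, (1000,)))
+print("fitted temperature:", T)
+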
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ WeditGAN: Few-shot Image Generation via Latent Space Relocation + + +
+ In few-shot image generation, directly training GAN models on just a handful
+of images faces the risk of overfitting. A popular solution is to transfer the
+models pretrained on large source domains to small target ones. In this work,
+we introduce WeditGAN, which realizes model transfer by editing the
+intermediate latent codes $w$ in StyleGANs with learned constant offsets
+($\Delta w$), discovering and constructing target latent spaces via simply
+relocating the distribution of source latent spaces. The established one-to-one
+mapping between latent spaces can naturally prevent mode collapse and
+overfitting. Besides, we also propose variants of WeditGAN to further enhance
+the relocation process by regularizing the direction or finetuning the
+intensity of $\Delta w$. Experiments on a collection of widely used
+source/target datasets manifest the capability of WeditGAN in generating
+realistic and diverse images, demonstrating that this simple approach is highly
+effective in the research area of few-shot image generation.
+
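+ A toy sketch of the latent-relocation idea: learn a single constant offset
+delta_w added to every source latent code. The "generator" below is a
+placeholder MLP rather than StyleGAN, and the loss is purely illustrative.
+
+import torch
+import torch.nn as nn
+
+w_dim = 512
+generator = nn.Sequential(nn.Linear(w_dim, 1024), nn.ReLU(),
+                          nn.Linear(1024, 3 * 32 * 32))   # stand-in generator
+delta_w = nn.Parameter(torch.zeros(1, w_dim))             # the learned constant offset
+opt = torch.optim.Adam([delta_w], lr=1e-3)                 # generator kept frozen here
+
+for _ in range(10):
+    w = torch.randn(8, w_dim)                              # source-domain latents
+    fake = generator(w + delta_w)                          # relocated target latents
+    target = torch.randn(8, 3 * 32 * 32)                   # stand-in target-domain signal
+    loss = (fake - target).pow(2).mean()
+    opt.zero_grad(); loss.backward(); opt.step()
+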
+
+ comment: under review, see supplementary material for updates of this version +
+
+
+
+
+ + ♻ ☆ One-Vote Veto: Semi-Supervised Learning for Low-Shot Glaucoma Diagnosis + + +
+ Convolutional neural networks (CNNs) are a promising technique for automated +glaucoma diagnosis from images of the fundus, and these images are routinely +acquired as part of an ophthalmic exam. Nevertheless, CNNs typically require a +large amount of well-labeled data for training, which may not be available in +many biomedical image classification applications, especially when diseases are +rare and where labeling by experts is costly. This article makes two +contributions to address this issue: (1) It extends the conventional Siamese +network and introduces a training method for low-shot learning when labeled +data are limited and imbalanced, and (2) it introduces a novel semi-supervised +learning strategy that uses additional unlabeled training data to achieve +greater accuracy. Our proposed multi-task Siamese network (MTSN) can employ any +backbone CNN, and we demonstrate with four backbone CNNs that its accuracy with +limited training data approaches the accuracy of backbone CNNs trained with a +dataset that is 50 times larger. We also introduce One-Vote Veto (OVV) +self-training, a semi-supervised learning strategy that is designed +specifically for MTSNs. By taking both self-predictions and contrastive +predictions of the unlabeled training data into account, OVV self-training +provides additional pseudo labels for fine-tuning a pre-trained MTSN. Using a +large (imbalanced) dataset with 66,715 fundus photographs acquired over 15 +years, extensive experimental results demonstrate the effectiveness of low-shot +learning with MTSN and semi-supervised learning with OVV self-training. Three +additional, smaller clinical datasets of fundus images acquired under different +conditions (cameras, instruments, locations, populations) are used to +demonstrate the generalizability of the proposed methods. + +
+
+ comment: accepted by IEEE Transactions on Medical Imaging (T-MI). DOI: + 10.1109/TMI.2023.3307689 +
+
+
+
+
+ + ♻ ☆ SatlasPretrain: A Large-Scale Dataset for Remote Sensing Image + Understanding ICCV 2023 + + +
+ Remote sensing images are useful for a wide variety of planet monitoring +applications, from tracking deforestation to tackling illegal fishing. The +Earth is extremely diverse -- the amount of potential tasks in remote sensing +images is massive, and the sizes of features range from several kilometers to +just tens of centimeters. However, creating generalizable computer vision +methods is a challenge in part due to the lack of a large-scale dataset that +captures these diverse features for many tasks. In this paper, we present +SatlasPretrain, a remote sensing dataset that is large in both breadth and +scale, combining Sentinel-2 and NAIP images with 302M labels under 137 +categories and seven label types. We evaluate eight baselines and a proposed +method on SatlasPretrain, and find that there is substantial room for +improvement in addressing research challenges specific to remote sensing, +including processing image time series that consist of images from very +different types of sensors, and taking advantage of long-range spatial context. +Moreover, we find that pre-training on SatlasPretrain substantially improves +performance on downstream tasks, increasing average accuracy by 18% over +ImageNet and 6% over the next best baseline. The dataset, pre-trained model +weights, and code are available at https://satlas-pretrain.allen.ai/. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Sound Localization from Motion: Jointly Learning Sound Direction and + Camera Rotation ICCV 2023 + + +
+ The images and sounds that we perceive undergo subtle but geometrically +consistent changes as we rotate our heads. In this paper, we use these cues to +solve a problem we call Sound Localization from Motion (SLfM): jointly +estimating camera rotation and localizing sound sources. We learn to solve +these tasks solely through self-supervision. A visual model predicts camera +rotation from a pair of images, while an audio model predicts the direction of +sound sources from binaural sounds. We train these models to generate +predictions that agree with one another. At test time, the models can be +deployed independently. To obtain a feature representation that is well-suited +to solving this challenging problem, we also propose a method for learning an +audio-visual representation through cross-view binauralization: estimating +binaural sound from one view, given images and sound from another. Our model +can successfully estimate accurate rotations on both real and synthetic scenes, +and localize sound sources with accuracy competitive with state-of-the-art +self-supervised approaches. Project site: https://ificl.github.io/SLfM/ + +
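+ A loose illustration of the agreement objective described above: the rotation
+predicted from an image pair should explain the change in sound direction
+predicted from the corresponding binaural audio. The networks, shapes, and the
+sign convention below are all made-up stand-ins, not the SLfM models.
+
+import torch
+import torch.nn as nn
+
+visual_net = nn.Sequential(nn.Flatten(), nn.Linear(2 * 3 * 64 * 64, 1))  # -> rotation angle
+audio_net = nn.Sequential(nn.Flatten(), nn.Linear(2 * 1024, 1))          # -> source azimuth
+
+imgs = torch.randn(4, 2, 3, 64, 64)      # image pairs before/after rotation
+audio = torch.randn(4, 2, 2, 1024)       # binaural clips at the two poses
+
+rot = visual_net(imgs.flatten(1))                        # predicted head rotation
+az0 = audio_net(audio[:, 0].flatten(1))                  # azimuth at pose 0
+az1 = audio_net(audio[:, 1].flatten(1))                  # azimuth at pose 1
+agreement_loss = ((az1 - az0) + rot).pow(2).mean()       # assumed sign convention
+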
+
+ comment: ICCV 2023. Project site: https://ificl.github.io/SLfM/ +
+
+
+
+
+ + ♻ ☆ Self-supervised Hypergraphs for Learning Multiple World Interpretations ICCV 2023 + + +
+ We present a method for learning multiple scene representations given a small +labeled set, by exploiting the relationships between such representations in +the form of a multi-task hypergraph. We also show how we can use the hypergraph +to improve a powerful pretrained VisTransformer model without any additional +labeled data. In our hypergraph, each node is an interpretation layer (e.g., +depth or segmentation) of the scene. Within each hyperedge, one or several +input nodes predict the layer at the output node. Thus, each node could be an +input node in some hyperedges and an output node in others. In this way, +multiple paths can reach the same node, to form ensembles from which we obtain +robust pseudolabels, which allow self-supervised learning in the hypergraph. We +test different ensemble models and different types of hyperedges and show +superior performance to other multi-task graph models in the field. We also +introduce Dronescapes, a large video dataset captured with UAVs in different +complex real-world scenes, with multiple representations, suitable for +multi-task learning. + +
+
+ comment: Accepted in ICCV 2023 Workshops +
+
+
+
+
+ + ♻ ☆ Deep Person Generation: A Survey from the Perspective of Face, Pose and + Cloth Synthesis + + +
+ Deep person generation has attracted extensive research attention due to its
+wide applications in virtual agents, video conferencing, online shopping and
+art/movie production. With the advancement of deep learning, visual appearances
+(face, pose, cloth) of a person image can be easily generated or manipulated on
+demand. In this survey, we first summarize the scope of person generation, and
+then systematically review recent progress and technical trends in deep person
+generation, covering three major tasks: talking-head generation (face),
+pose-guided person generation (pose) and garment-oriented person generation
+(cloth). More than two hundred papers are covered for a thorough overview, and
+the milestone works are highlighted to mark the major technical breakthroughs.
+Based on these fundamental tasks, a number of applications are investigated,
+e.g., virtual fitting, digital human, generative data augmentation. We hope
+this survey could shed some light on the future prospects of deep person
+generation, and provide a helpful foundation for full-fledged applications
+toward digital humans.
+
+
+
+
+
+ + ♻ ☆ Real-time Multi-person Eyeblink Detection in the Wild for Untrimmed + Video CVPR 2023 + + +
+ Real-time eyeblink detection in the wild can widely serve for fatigue
+detection, face anti-spoofing, emotion analysis, etc. The existing research
+efforts generally focus on single-person cases towards trimmed video. However,
+the multi-person scenario within untrimmed videos is also important for
+practical applications, yet it has not received adequate attention. To address
+this, we shed light on this research field for the first time with essential
+contributions on dataset, theory, and practices. In particular, a large-scale
+dataset termed MPEblink that involves 686 untrimmed videos with 8748 eyeblink
+events is proposed under multi-person conditions. The samples are captured from
+unconstrained films to reveal "in the wild" characteristics. Meanwhile, a
+real-time multi-person eyeblink detection method is also proposed. Different
+from the existing counterparts, our method runs in a one-stage spatio-temporal
+way with end-to-end learning capacity. Specifically, it simultaneously
+addresses the sub-tasks of face detection, face tracking, and human
+instance-level eyeblink detection. This paradigm holds two main advantages:
+(1) eyeblink features can be facilitated via the face's global context (e.g.,
+head pose and illumination condition) with joint optimization and interaction,
+and (2) addressing these sub-tasks in parallel instead of in a sequential
+manner saves time remarkably, meeting the real-time running requirement.
+Experiments on MPEblink verify the essential challenges of real-time
+multi-person eyeblink detection in the wild for untrimmed video. Our method
+also outperforms existing approaches by large margins while maintaining a high
+inference speed.
+
+
+ comment: Accepted by CVPR 2023 +
+
+
+
+
+ + ♻ ☆ Vox-E: Text-guided Voxel Editing of 3D Objects + + +
+ Large scale text-guided diffusion models have garnered significant attention +due to their ability to synthesize diverse images that convey complex visual +concepts. This generative power has more recently been leveraged to perform +text-to-3D synthesis. In this work, we present a technique that harnesses the +power of latent diffusion models for editing existing 3D objects. Our method +takes oriented 2D images of a 3D object as input and learns a grid-based +volumetric representation of it. To guide the volumetric representation to +conform to a target text prompt, we follow unconditional text-to-3D methods and +optimize a Score Distillation Sampling (SDS) loss. However, we observe that +combining this diffusion-guided loss with an image-based regularization loss +that encourages the representation not to deviate too strongly from the input +object is challenging, as it requires achieving two conflicting goals while +viewing only structure-and-appearance coupled 2D projections. Thus, we +introduce a novel volumetric regularization loss that operates directly in 3D +space, utilizing the explicit nature of our 3D representation to enforce +correlation between the global structure of the original and edited object. +Furthermore, we present a technique that optimizes cross-attention volumetric +grids to refine the spatial extent of the edits. Extensive experiments and +comparisons demonstrate the effectiveness of our approach in creating a myriad +of edits which cannot be achieved by prior works. + +
+
+ comment: Project webpage: https://tau-vailab.github.io/Vox-E/ +
+
+
+
+
+ + ♻ ☆ Hierarchical Integration Diffusion Model for Realistic Image Deblurring + + +
+ Diffusion models (DMs) have recently been introduced in image deblurring and +exhibited promising performance, particularly in terms of details +reconstruction. However, the diffusion model requires a large number of +inference iterations to recover the clean image from pure Gaussian noise, which +consumes massive computational resources. Moreover, the distribution +synthesized by the diffusion model is often misaligned with the target results, +leading to restrictions in distortion-based metrics. To address the above +issues, we propose the Hierarchical Integration Diffusion Model (HI-Diff), for +realistic image deblurring. Specifically, we perform the DM in a highly +compacted latent space to generate the prior feature for the deblurring +process. The deblurring process is implemented by a regression-based method to +obtain better distortion accuracy. Meanwhile, the highly compact latent space +ensures the efficiency of the DM. Furthermore, we design the hierarchical +integration module to fuse the prior into the regression-based model from +multiple scales, enabling better generalization in complex blurry scenarios. +Comprehensive experiments on synthetic and real-world blur datasets demonstrate +that our HI-Diff outperforms state-of-the-art methods. Code and trained models +are available at https://github.com/zhengchen1999/HI-Diff. + +
+
+ comment: Code is available at https://github.com/zhengchen1999/HI-Diff +
+
+
+
+
+ + ♻ ☆ BoxDiff: Text-to-Image Synthesis with Training-Free Box-Constrained + Diffusion ICCV 2023 + + +
+ Recent text-to-image diffusion models have demonstrated an astonishing +capacity to generate high-quality images. However, researchers mainly studied +the way of synthesizing images with only text prompts. While some works have +explored using other modalities as conditions, considerable paired data, e.g., +box/mask-image pairs, and fine-tuning time are required for nurturing models. +As such paired data is time-consuming and labor-intensive to acquire and +restricted to a closed set, this potentially becomes the bottleneck for +applications in an open world. This paper focuses on the simplest form of +user-provided conditions, e.g., box or scribble. To mitigate the aforementioned +problem, we propose a training-free method to control objects and contexts in +the synthesized images adhering to the given spatial conditions. Specifically, +three spatial constraints, i.e., Inner-Box, Outer-Box, and Corner Constraints, +are designed and seamlessly integrated into the denoising step of diffusion +models, requiring no additional training and massive annotated layout data. +Extensive experimental results demonstrate that the proposed constraints can +control what and where to present in the images while retaining the ability of +Diffusion models to synthesize with high fidelity and diverse concept coverage. +The code is publicly available at https://github.com/showlab/BoxDiff. + +
+
+ comment: Accepted by ICCV 2023. Code is available at: + https://github.com/showlab/BoxDiff +
+
+
+
+
+ + ♻ ☆ SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input + Images ICCV 2023 + + +
+ Recent advances in Neural Radiance Fields (NeRFs) treat the problem of novel +view synthesis as Sparse Radiance Field (SRF) optimization using sparse voxels +for efficient and fast rendering (plenoxels,InstantNGP). In order to leverage +machine learning and adoption of SRFs as a 3D representation, we present SPARF, +a large-scale ShapeNet-based synthetic dataset for novel view synthesis +consisting of $\sim$ 17 million images rendered from nearly 40,000 shapes at +high resolution (400 X 400 pixels). The dataset is orders of magnitude larger +than existing synthetic datasets for novel view synthesis and includes more +than one million 3D-optimized radiance fields with multiple voxel resolutions. +Furthermore, we propose a novel pipeline (SuRFNet) that learns to generate +sparse voxel radiance fields from only few views. This is done by using the +densely collected SPARF dataset and 3D sparse convolutions. SuRFNet employs +partial SRFs from few/one images and a specialized SRF loss to learn to +generate high-quality sparse voxel radiance fields that can be rendered from +novel views. Our approach achieves state-of-the-art results in the task of +unconstrained novel view synthesis based on few views on ShapeNet as compared +to recent baselines. The SPARF dataset is made public with the code and models +on the project website https://abdullahamdi.com/sparf/ . + +
+
+ comment: published at ICCV 2023 workshop proceedings +
+
+
+
+
+ + ♻ ☆ Tube-Link: A Flexible Cross Tube Framework for Universal Video + Segmentation ICCV-2023 + + +
+ Video segmentation aims to segment and track every pixel in diverse scenarios
+accurately. In this paper, we present Tube-Link, a versatile framework that
+addresses multiple core tasks of video segmentation with a unified
+architecture. Our framework is a near-online approach that takes a short
+subclip as input and outputs the corresponding spatial-temporal tube masks. To
+enhance the modeling of cross-tube relationships, we propose an effective way
+to perform tube-level linking via attention along the queries. In addition, we
+introduce temporal contrastive learning to learn instance-wise discriminative
+features for tube-level association. Our approach offers flexibility and
+efficiency for both short and long video inputs, as the length of each subclip
+can be varied according to the needs of datasets or scenarios. Tube-Link
+outperforms existing specialized architectures by a significant margin on five
+video segmentation datasets. Specifically, it achieves almost 13% relative
+improvements on VIPSeg and 4% improvements on KITTI-STEP over the strong
+baseline Video K-Net. When using a ResNet50 backbone on Youtube-VIS-2019 and
+2021, Tube-Link boosts IDOL by 3% and 4%, respectively.
+
+
+ comment: ICCV-2023, Project page: https://github.com/lxtGH/Tube-Link (fix + typos and errors, update the results) +
+
+
+
+
+ + ♻ ☆ Learning Support and Trivial Prototypes for Interpretable Image + Classification ICCV 2023 + + +
+ Prototypical part network (ProtoPNet) methods have been designed to achieve +interpretable classification by associating predictions with a set of training +prototypes, which we refer to as trivial prototypes because they are trained to +lie far from the classification boundary in the feature space. Note that it is +possible to make an analogy between ProtoPNet and support vector machine (SVM) +given that the classification from both methods relies on computing similarity +with a set of training points (i.e., trivial prototypes in ProtoPNet, and +support vectors in SVM). However, while trivial prototypes are located far from +the classification boundary, support vectors are located close to this +boundary, and we argue that this discrepancy with the well-established SVM +theory can result in ProtoPNet models with inferior classification accuracy. In +this paper, we aim to improve the classification of ProtoPNet with a new method +to learn support prototypes that lie near the classification boundary in the +feature space, as suggested by the SVM theory. In addition, we target the +improvement of classification results with a new model, named ST-ProtoPNet, +which exploits our support prototypes and the trivial prototypes to provide +more effective classification. Experimental results on CUB-200-2011, Stanford +Cars, and Stanford Dogs datasets demonstrate that ST-ProtoPNet achieves +state-of-the-art classification accuracy and interpretability results. We also +show that the proposed support prototypes tend to be better localised in the +object of interest rather than in the background region. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Approximated Prompt Tuning for Vision-Language Pre-trained Models + + +
+ Prompt tuning is a parameter-efficient way to deploy large-scale pre-trained +models to downstream tasks by adding task-specific tokens. In terms of +vision-language pre-trained (VLP) models, prompt tuning often requires a large +number of learnable tokens to bridge the gap between the pre-training and +downstream tasks, which greatly exacerbates the already high computational +overhead. In this paper, we revisit the principle of prompt tuning for +Transformer-based VLP models, and reveal that the impact of soft prompt tokens +can be actually approximated via independent information diffusion steps, +thereby avoiding the expensive global attention modeling and reducing the +computational complexity to a large extent. Based on this finding, we propose a +novel Approximated Prompt Tuning (APT) approach towards efficient VL transfer +learning. To validate APT, we apply it to two representative VLP models, namely +ViLT and METER, and conduct extensive experiments on a bunch of downstream +tasks. Meanwhile, the generalization of APT is also validated on CLIP for image +classification and StableDiffusion for text-to-image generation. The +experimental results not only show the superior performance gains and +computation efficiency of APT against the conventional prompt tuning methods, +e.g., +7.01% accuracy and -82.30% additional computation overhead on METER, but +also confirm its merits over other parameter-efficient transfer learning +approaches. + +
+
+
+
+
+ + ♻ ☆ Fusing Structural and Functional Connectivities using Disentangled VAE + for Detecting MCI + + +
+ Brain network analysis is a useful approach to studying human brain disorders +because it can distinguish patients from healthy people by detecting abnormal +connections. Due to the complementary information from multiple modal +neuroimages, multimodal fusion technology has a lot of potential for improving +prediction performance. However, effective fusion of multimodal medical images +to achieve complementarity is still a challenging problem. In this paper, a +novel hierarchical structural-functional connectivity fusing (HSCF) model is +proposed to construct brain structural-functional connectivity matrices and +predict abnormal brain connections based on functional magnetic resonance +imaging (fMRI) and diffusion tensor imaging (DTI). Specifically, the prior +knowledge is incorporated into the separators for disentangling each modality +of information by the graph convolutional networks (GCN). And a disentangled +cosine distance loss is devised to ensure the disentanglement's effectiveness. +Moreover, the hierarchical representation fusion module is designed to +effectively maximize the combination of relevant and effective features between +modalities, which makes the generated structural-functional connectivity more +robust and discriminative in the cognitive disease analysis. Results from a +wide range of tests performed on the public Alzheimer's Disease Neuroimaging +Initiative (ADNI) database show that the proposed model performs better than +competing approaches in terms of classification evaluation. In general, the +proposed HSCF model is a promising model for generating brain +structural-functional connectivities and identifying abnormal brain connections +as cognitive disease progresses. + +
+
+ comment: 4 figures +
+
+
+
+
+ + ♻ ☆ CrossMap Transformer: A Crossmodal Masked Path Transformer Using Double + Back-Translation for Vision-and-Language Navigation + + +
+ Navigation guided by natural language instructions is particularly suitable
+for Domestic Service Robots that interact naturally with users. This task
+involves the prediction of a sequence of actions that leads to a specified
+destination given a natural language navigation instruction. The task thus
+requires the understanding of instructions, such as ``Walk out of the bathroom
+and wait on the stairs that are on the right''. Visual and Language Navigation
+remains challenging, notably because it requires the exploration of the
+environment and the accurate following of a path specified by the
+instructions, which in turn requires modeling the relationship between language
+and vision. To address this, we propose the CrossMap Transformer network, which
+encodes the linguistic and visual features to sequentially generate a path. The
+CrossMap Transformer is tied to a Transformer-based speaker that generates
+navigation instructions. The two networks share common latent features, for
+mutual enhancement through a double back translation model: generated paths are
+translated into instructions while generated instructions are translated into
+paths. The experimental results show the benefits of our approach in terms of
+instruction understanding and instruction generation.
+
+
+ comment: 8 pages, 5 figures, 5 tables. Submitted to IEEE Robotics and + Automation Letters +
+
+
+
+
+ + ♻ ☆ Multi-scale Target-Aware Framework for Constrained Image Splicing + Detection and Localization + + +
+ Constrained image splicing detection and localization (CISDL) is a +fundamental task of multimedia forensics, which detects splicing operation +between two suspected images and localizes the spliced region on both images. +Recent works regard it as a deep matching problem and have made significant +progress. However, existing frameworks typically perform feature extraction and +correlation matching as separate processes, which may hinder the model's +ability to learn discriminative features for matching and can be susceptible to +interference from ambiguous background pixels. In this work, we propose a +multi-scale target-aware framework to couple feature extraction and correlation +matching in a unified pipeline. In contrast to previous methods, we design a +target-aware attention mechanism that jointly learns features and performs +correlation matching between the probe and donor images. Our approach can +effectively promote the collaborative learning of related patches, and perform +mutual promotion of feature learning and correlation matching. Additionally, in +order to handle scale transformations, we introduce a multi-scale projection +method, which can be readily integrated into our target-aware framework that +enables the attention process to be conducted between tokens containing +information of varying scales. Our experiments demonstrate that our model, +which uses a unified pipeline, outperforms state-of-the-art methods on several +benchmark datasets and is robust against scale transformations. + +
+
+ comment: accepted by ACMMM2023 +
+
+
+
+
+ + ♻ ☆ Contrastive Learning for Lane Detection via Cross-Similarity + + +
+ Detecting road lanes is challenging due to intricate markings vulnerable to
+unfavorable conditions. Lane markings have strong shape priors, but their
+visibility is easily compromised. Factors like lighting, weather, vehicles,
+pedestrians, and aging colors challenge the detection. A large amount of data
+is required to train a lane detection approach that can withstand natural
+variations caused by low visibility, because numerous lane shapes and natural
+variations exist. Our solution, Contrastive Learning for Lane Detection via
+cross-similarity (CLLD), is a self-supervised learning method that tackles this
+challenge by enhancing lane detection models' resilience to real-world
+conditions that cause low lane visibility. CLLD is a novel multitask
+contrastive learning approach that trains lane detection models to detect lane
+markings even in low-visibility situations by integrating local feature
+contrastive learning (CL) with our newly proposed cross-similarity operation.
+Local feature CL focuses on extracting features for small image parts, which is
+necessary to localize lane segments, while cross-similarity captures global
+features to detect obscured lane segments using their surroundings. We enhance
+cross-similarity by randomly masking parts of input images for augmentation.
+Evaluated on benchmark datasets, CLLD outperforms state-of-the-art contrastive
+learning methods, especially in visibility-impairing conditions like shadows.
+Compared to supervised learning, CLLD excels in scenarios like shadows and
+crowded scenes.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Transferable Attack for Semantic Segmentation + + +
+ We analyze the performance of semantic segmentation models w.r.t. adversarial
+attacks, and observe that the adversarial examples generated from a source
+model fail to attack the target models, i.e., conventional attack methods, such
+as PGD and FGSM, do not transfer well to target models, making it necessary to
+study transferable attacks, especially transferable attacks for semantic
+segmentation. We identify two main factors for achieving a transferable attack.
+Firstly, the attack should come with effective data augmentation and
+translation-invariant features to deal with unseen models. Secondly, stabilized
+optimization strategies are needed to find the optimal attack direction. Based
+on the above observations, we propose an ensemble attack for semantic
+segmentation that achieves more effective attacks with higher transferability.
+The source code and experimental results are publicly available via our project
+page: https://github.com/anucvers/TASS.
+
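+ A compact sketch of the ensemble idea: average the segmentation loss over
+several source models before each PGD-style gradient step. The models, budget,
+and loss below are generic stand-ins, and the paper's augmentation and
+translation-invariance components are omitted.
+
+import torch
+
+def ensemble_pgd(models, image, label, eps=8/255, alpha=2/255, steps=10):
+    adv = image.clone()
+    for _ in range(steps):
+        adv.requires_grad_(True)
+        loss = sum(torch.nn.functional.cross_entropy(m(adv), label)
+                   for m in models) / len(models)
+        grad = torch.autograd.grad(loss, adv)[0]
+        adv = (adv.detach() + alpha * grad.sign()).clamp(image - eps, image + eps).clamp(0, 1)
+    return adv
+
+models = [torch.nn.Conv2d(3, 21, 1) for _ in range(3)]   # stand-in source models
+x, y = torch.rand(1, 3, 64, 64), torch.randint(0, 21, (1, 64, 64))
+x_adv = ensemble_pgd(models, x, y)
+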
+
+ comment: Source code is available at: https://github.com/anucvers/TASS +
+
+
+
+
+ + ♻ ☆ Suspected Object Matters: Rethinking Model's Prediction for One-stage + Visual Grounding ACM MM 23 + + +
+ Recently, one-stage visual grounders attract high attention due to their
+comparable accuracy but significantly higher efficiency than two-stage
+grounders. However, inter-object relation modeling has not been well studied
+for one-stage grounders. Inter-object relationship modeling, though important,
+is not necessarily performed among all objects, as only part of them are
+related to the text query and may confuse the model. We call these objects
+suspected objects. However, exploring their relationships in the one-stage
+paradigm is non-trivial because: First, no object proposals are available as
+the basis on which to select suspected objects and perform relationship
+modeling. Second, suspected objects are more confusing than others, as they may
+share similar semantics, be entangled with certain relationships, etc., and
+thereby more easily mislead the model's prediction. Toward this end, we propose
+a Suspected Object Transformation mechanism (SOT), which can be seamlessly
+integrated into existing CNN and Transformer-based one-stage visual grounders
+to encourage the target object selection among the suspected ones. Suspected
+objects are dynamically discovered from a learned activation map adapted to the
+model's current discrimination ability during training. Afterward, on top of
+suspected objects, a Keyword-Aware Discrimination module (KAD) and an
+Exploration by Random Connection strategy (ERC) are concurrently proposed to
+help the model rethink its initial prediction. On the one hand, KAD leverages
+keywords that contribute highly to suspected object discrimination. On the
+other hand, ERC allows the model to seek the correct object instead of being
+trapped in a situation that always exploits the current false prediction.
+Extensive experiments demonstrate the effectiveness of our proposed method.
+
+
+ comment: Accepted to ACM MM 23 +
+
+
+
+
+ + ♻ ☆ Automatic Classification of Blood Cell Images Using Convolutional Neural + Network + + +
+ Human blood primarily comprises plasma, red blood cells, white blood cells,
+and platelets. It plays a vital role in transporting nutrients to different
+organs and carries essential health-related information about the human body.
+Blood cells are utilized to defend the body against diverse infections,
+including fungi, viruses, and bacteria. Hence, blood analysis can help
+physicians assess an individual's physiological condition. Blood cells have
+been sub-classified into eight groups: neutrophils, eosinophils, basophils,
+lymphocytes, monocytes, immature granulocytes (promyelocytes, myelocytes, and
+metamyelocytes), erythroblasts, and platelets or thrombocytes, on the basis of
+their nucleus, shape, and cytoplasm. Traditionally, pathologists and
+hematologists in laboratories have examined these blood cells using a
+microscope before manually classifying them. The manual approach is slower and
+more prone to human error. Therefore, it is essential to automate this process.
+In our paper, transfer learning with pre-trained CNN models (VGG16, VGG19,
+ResNet-50, ResNet-101, ResNet-152, InceptionV3, MobileNetV2, and DenseNet-20)
+is applied to the PBC dataset's normal DIB. The overall accuracy achieved with
+these models lies between 91.375% and 94.72%. Hence, inspired by these
+pre-trained architectures, a model has been proposed to automatically classify
+the ten types of blood cells with increased accuracy. A novel CNN-based
+framework has been presented to improve accuracy. The proposed CNN model has
+been tested on the PBC dataset's normal DIB. The outcomes of the experiments
+demonstrate that our CNN-based framework designed for blood cell classification
+attains an accuracy of 99.91% on the PBC dataset. Our proposed convolutional
+neural network model performs competitively when compared to earlier results
+reported in the literature.
+
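+ A minimal transfer-learning sketch in the spirit of the pre-trained baselines
+above: reuse an ImageNet-pretrained backbone and replace its head with a
+blood-cell classifier. The class count is a placeholder, this is not the
+paper's architecture, and the weights enum assumes torchvision >= 0.13.
+
+import torch
+import torch.nn as nn
+from torchvision import models
+
+NUM_CLASSES = 8                                # placeholder, e.g. the eight PBC groups
+backbone = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
+for p in backbone.parameters():
+    p.requires_grad = False                    # freeze the pretrained features
+backbone.fc = nn.Linear(backbone.fc.in_features, NUM_CLASSES)   # trainable head
+
+logits = backbone(torch.randn(4, 3, 224, 224))                   # (4, NUM_CLASSES)
+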
+
+ comment: 15 +
+
+
+
+
+ + ♻ ☆ Conditioning Generative Latent Optimization to solve Imaging Inverse + Problems + + +
+ Computed Tomography (CT) is a prominent example of Imaging Inverse Problem +(IIP), highlighting the unrivalled performances of data-driven methods in +degraded measurements setups like sparse X-ray projections. Although a +significant proportion of deep learning approaches benefit from large +supervised datasets to directly map experimental measurements to medical scans, +they cannot generalize to unknown acquisition setups. In contrast, fully +unsupervised techniques, most notably using score-based generative models, have +recently demonstrated similar or better performances compared to supervised +approaches to solve IIPs while being flexible at test time regarding the +imaging setup. However, their use cases are limited by two factors: (a) they +need considerable amounts of training data to have good generalization +properties and (b) they require a backward operator, like +Filtered-Back-Projection in the case of CT, to condition the learned prior +distribution of medical scans to experimental measurements. To overcome these +issues, we propose an unsupervised conditional approach to the Generative +Latent Optimization framework (cGLO), in which the parameters of a decoder +network are initialized on an unsupervised dataset. The decoder is then used +for reconstruction purposes, by performing Generative Latent Optimization with +a loss function directly comparing simulated measurements from proposed +reconstructions to experimental measurements. The resulting approach, tested on +sparse-view CT using multiple training dataset sizes, demonstrates better +reconstruction quality compared to state-of-the-art score-based strategies in +most data regimes and shows an increasing performance advantage for smaller +training datasets and reduced projection angles. Furthermore, cGLO does not +require any backward operator and could expand use cases even to non-linear +IIPs. + +
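+ A toy sketch of the conditioned Generative Latent Optimization loop described
+above: optimize a latent code so that simulated measurements of the decoded
+image match the observed ones. A random matrix stands in for the CT forward
+projector, and the decoder is a placeholder MLP assumed to be pretrained on an
+unsupervised dataset; none of this is the authors' code.
+
+import torch
+import torch.nn as nn
+
+img_size, z_dim, n_meas = 32 * 32, 64, 200
+decoder = nn.Sequential(nn.Linear(z_dim, 512), nn.ReLU(), nn.Linear(512, img_size))
+A = torch.randn(n_meas, img_size) / img_size ** 0.5      # stand-in forward operator
+y = A @ torch.rand(img_size)                              # "experimental" measurements
+
+z = torch.zeros(z_dim, requires_grad=True)
+opt = torch.optim.Adam([z], lr=1e-2)
+for _ in range(500):
+    opt.zero_grad()
+    recon = decoder(z)
+    loss = (A @ recon - y).pow(2).mean()                   # no backward operator needed
+    loss.backward()
+    opt.step()
+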
+
+ comment: 20 pages, 9 figures; typos corrected
+
+
+
+
+ + ♻ ☆ OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text + Documents + + +
+ Large multimodal models trained on natural documents, which interleave images +and text, outperform models trained on image-text pairs on various multimodal +benchmarks. However, the datasets used to train these models have not been +released, and the collection process has not been fully specified. We introduce +the OBELICS dataset, an open web-scale filtered dataset of interleaved +image-text documents comprising 141 million web pages extracted from Common +Crawl, 353 million associated images, and 115 billion text tokens. We describe +the dataset creation process, present comprehensive filtering rules, and +provide an analysis of the dataset's content. To show the viability of OBELICS, +we train vision and language models of 9 and 80 billion parameters named +IDEFICS, and obtain competitive performance on different multimodal benchmarks. +We release our dataset, models and code. + +
+
+
+
+
+ + ♻ ☆ Classification of White Blood Cells Using Machine and Deep Learning + Models: A Systematic Review + + +
+ Machine learning (ML) and deep learning (DL) models have been employed to
+significantly improve analyses of medical imagery, with these approaches used
+to enhance the accuracy of prediction and classification. Model predictions and
+classifications assist diagnoses of various cancers and tumors. This review
+presents an in-depth analysis of modern techniques applied within the domain of
+medical image analysis for white blood cell classification. The methodologies
+that use blood smear images, magnetic resonance imaging (MRI), X-rays, and
+similar medical imaging domains are identified and discussed, with a detailed
+analysis of ML/DL techniques applied to the classification of white blood cells
+(WBCs) representing the primary focus of the review. The data utilized in this
+research has been extracted from a collection of 136 primary papers that were
+published between the years 2006 and 2023. The most widely used techniques and
+best-performing white blood cell classification methods are identified. While
+the use of ML and DL for white blood cell classification has concurrently
+increased and improved in recent years, significant challenges remain: 1)
+availability of appropriate datasets remains the primary challenge, and may be
+resolved using data augmentation techniques; 2) medical training of researchers
+is recommended to improve current understanding of white blood cell structure
+and subsequent selection of appropriate classification models; 3) advanced DL
+networks including Generative Adversarial Networks, R-CNN, Fast R-CNN, and
+Faster R-CNN will likely be increasingly employed to supplement or replace
+current techniques.
+
+
+
+
+
+ + ♻ ☆ Concept Evolution in Deep Learning Training: A Unified Interpretation + Framework and Discoveries CIKM'23 + + +
+ We present ConceptEvo, a unified interpretation framework for deep neural +networks (DNNs) that reveals the inception and evolution of learned concepts +during training. Our work addresses a critical gap in DNN interpretation +research, as existing methods primarily focus on post-training interpretation. +ConceptEvo introduces two novel technical contributions: (1) an algorithm that +generates a unified semantic space, enabling side-by-side comparison of +different models during training, and (2) an algorithm that discovers and +quantifies important concept evolutions for class predictions. Through a +large-scale human evaluation and quantitative experiments, we demonstrate that +ConceptEvo successfully identifies concept evolutions across different models, +which are not only comprehensible to humans but also crucial for class +predictions. ConceptEvo is applicable to both modern DNN architectures, such as +ConvNeXt, and classic DNNs, such as VGGs and InceptionV3. + +
+
+ comment: Accepted at CIKM'23 +
+
+
+
+
+ + ♻ ☆ ZeroPose: CAD-Model-based Zero-Shot Pose Estimation + + +
+ In this paper, we present a CAD model-based zero-shot pose estimation
+pipeline called ZeroPose. Existing pose estimation methods still require
+expensive training when applied to an unseen object, which greatly hinders
+their scalability in practical industrial applications. In contrast, the
+proposed method enables the accurate estimation of pose parameters for
+previously unseen objects without the need for training. Specifically, we
+design a two-step pipeline consisting of CAD model-based zero-shot instance
+segmentation and a zero-shot pose estimator. For the first step, we provide a
+simple but effective way to leverage CAD models and the visual foundation
+models SAM and ImageBind to segment the unseen object of interest at the
+instance level. For the second step, based on the intensive geometric
+information in the CAD model of the rigid object, we propose a lightweight
+hierarchical geometric structure matching mechanism that achieves zero-shot
+pose estimation. Extensive experimental results on the seven core datasets of
+the BOP challenge show that the proposed zero-shot instance segmentation method
+achieves comparable performance with supervised MaskRCNN, and the zero-shot
+pose estimation results outperform the SOTA pose estimators with better
+efficiency.
+
+
+
+
+
+ + ♻ ☆ Synthesizing Diverse Human Motions in 3D Indoor Scenes + + +
+ We present a novel method for populating 3D indoor scenes with virtual humans +that can navigate in the environment and interact with objects in a realistic +manner. Existing approaches rely on training sequences that contain captured +human motions and the 3D scenes they interact with. However, such interaction +data are costly, difficult to capture, and can hardly cover all plausible +human-scene interactions in complex environments. To address these challenges, +we propose a reinforcement learning-based approach that enables virtual humans +to navigate in 3D scenes and interact with objects realistically and +autonomously, driven by learned motion control policies. The motion control +policies employ latent motion action spaces, which correspond to realistic +motion primitives and are learned from large-scale motion capture data using a +powerful generative motion model. For navigation in a 3D environment, we +propose a scene-aware policy with novel state and reward designs for collision +avoidance. Combined with navigation mesh-based path-finding algorithms to +generate intermediate waypoints, our approach enables the synthesis of diverse +human motions navigating in 3D indoor scenes and avoiding obstacles. To +generate fine-grained human-object interactions, we carefully curate +interaction goal guidance using a marker-based body representation and leverage +features based on the signed distance field (SDF) to encode human-scene +proximity relations. Our method can synthesize realistic and diverse +human-object interactions (e.g.,~sitting on a chair and then getting up) even +for out-of-distribution test scenarios with different object shapes, +orientations, starting body positions, and poses. Experimental results +demonstrate that our approach outperforms state-of-the-art methods in terms of +both motion naturalness and diversity. Code and video results are available at: +https://zkf1997.github.io/DIMOS. + +
+
+
+
+
+ + ♻ ☆ One-shot Implicit Animatable Avatars with Model-based Priors ICCV 2023 + + +
+ Existing neural rendering methods for creating human avatars typically either +require dense input signals such as video or multi-view images, or leverage a +learned prior from large-scale specific 3D human datasets such that +reconstruction can be performed with sparse-view inputs. Most of these methods +fail to achieve realistic reconstruction when only a single image is available. +To enable the data-efficient creation of realistic animatable 3D humans, we +propose ELICIT, a novel method for learning human-specific neural radiance +fields from a single image. Inspired by the fact that humans can effortlessly +estimate the body geometry and imagine full-body clothing from a single image, +we leverage two priors in ELICIT: 3D geometry prior and visual semantic prior. +Specifically, ELICIT utilizes the 3D body shape geometry prior from a skinned +vertex-based template model (i.e., SMPL) and implements the visual clothing +semantic prior with the CLIP-based pretrained models. Both priors are used to +jointly guide the optimization for creating plausible content in the invisible +areas. Taking advantage of the CLIP models, ELICIT can use text descriptions to +generate text-conditioned unseen regions. In order to further improve visual +details, we propose a segmentation-based sampling strategy that locally refines +different parts of the avatar. Comprehensive evaluations on multiple popular +benchmarks, including ZJU-MoCAP, Human3.6M, and DeepFashion, show that ELICIT +has outperformed strong baseline methods of avatar creation when only a single +image is available. The code is public for research purposes at +https://huangyangyi.github.io/ELICIT/. + +
+
+ comment: To appear at ICCV 2023. Project website: + https://huangyangyi.github.io/ELICIT/ +
+
+
+
+
+ + ♻ ☆ Review helps learn better: Temporal Supervised Knowledge Distillation AAAI 2024 + + +
+ Reviewing plays an important role when learning knowledge. Knowledge acquired
+at a certain time point may be strongly reinforced with the help of previous
+experience, so the knowledge-growing procedure should show a strong
+relationship along the temporal dimension. In our research, we find that during
+network training, the evolution of feature maps follows a temporal sequence
+property. Proper temporal supervision may further improve network training
+performance. Inspired by this observation, we propose Temporal Supervised
+Knowledge Distillation (TSKD). Specifically, we extract the spatiotemporal
+features of the student in different training phases with a convolutional Long
+Short-Term Memory network (Conv-LSTM). Then, we train the student net through a
+dynamic target, rather than static teacher network features. This process
+realizes the refinement of old knowledge in the student network, and utilizes
+it to assist current learning. Extensive experiments verify the effectiveness
+and advantages of our method over existing knowledge distillation methods,
+including various network architectures and different tasks (image
+classification and object detection).
+
+
+ comment: Under review in AAAI 2024 +
+
+
+
+
+ + ♻ ☆ DeepCut: Unsupervised Segmentation using Graph Neural Networks + Clustering + + +
+ Image segmentation is a fundamental task in computer vision. Data annotation +for training supervised methods can be labor-intensive, motivating unsupervised +methods. Current approaches often rely on extracting deep features from +pre-trained networks to construct a graph, and classical clustering methods +like k-means and normalized-cuts are then applied as a post-processing step. +However, this approach reduces the high-dimensional information encoded in the +features to pair-wise scalar affinities. To address this limitation, this study +introduces a lightweight Graph Neural Network (GNN) to replace classical +clustering methods while optimizing for the same clustering objective function. +Unlike existing methods, our GNN takes both the pair-wise affinities between +local image features and the raw features as input. This direct connection +between the raw features and the clustering objective enables us to implicitly +perform classification of the clusters between different graphs, resulting in +part semantic segmentation without the need for additional post-processing +steps. We demonstrate how classical clustering objectives can be formulated as +self-supervised loss functions for training an image segmentation GNN. +Furthermore, we employ the Correlation-Clustering (CC) objective to perform +clustering without defining the number of clusters, allowing for k-less +clustering. We apply the proposed method for object localization, segmentation, +and semantic part segmentation tasks, surpassing state-of-the-art performance +on multiple benchmarks. + +
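+ A small sketch of the core idea above: train a soft cluster-assignment head
+with a relaxed normalized-cut objective instead of running k-means/N-cut as
+post-processing. Features and affinities below are random stand-ins, an MLP
+stands in for the GNN, and the correlation-clustering variant is omitted.
+
+import torch
+import torch.nn as nn
+
+n, d, k = 100, 64, 4
+feats = torch.randn(n, d)
+W = torch.relu(feats @ feats.T)                       # pair-wise affinities
+deg = W.sum(dim=1)
+
+head = nn.Sequential(nn.Linear(d, 64), nn.ReLU(), nn.Linear(64, k))
+opt = torch.optim.Adam(head.parameters(), lr=1e-2)
+for _ in range(200):
+    S = torch.softmax(head(feats), dim=1)             # soft assignments (n, k)
+    assoc = torch.diag(S.T @ W @ S)                    # within-cluster association
+    degree = torch.diag(S.T @ (deg[:, None] * S))      # cluster volume
+    ncut_loss = k - (assoc / (degree + 1e-8)).sum()    # relaxed normalized cut
+    opt.zero_grad(); ncut_loss.backward(); opt.step()
+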
+
+
+
+
+ + ♻ ☆ V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by + Connecting Foundation Models + + +
+ Building artificial intelligence (AI) systems on top of a set of foundation +models (FMs) is becoming a new paradigm in AI research. Their representative +and generative abilities learnt from vast amounts of data can be easily adapted +and transferred to a wide range of downstream tasks without extra training from +scratch. However, leveraging FMs in cross-modal generation remains +under-researched when audio modality is involved. On the other hand, +automatically generating semantically-relevant sound from visual input is an +important problem in cross-modal generation studies. To solve this +vision-to-audio (V2A) generation problem, existing methods tend to design and +build complex systems from scratch using modestly sized datasets. In this +paper, we propose a lightweight solution to this problem by leveraging +foundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate +the domain gap between the latent space of the visual CLIP and the auditory +CLAP models. Then we propose a simple yet effective mapper mechanism +(V2A-Mapper) to bridge the domain gap by translating the visual input between +CLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained +audio generative FM AudioLDM is adopted to produce high-fidelity and +visually-aligned sound. Compared to previous approaches, our method only +requires a quick training of the V2A-Mapper. We further analyze and conduct +extensive experiments on the choice of the V2A-Mapper and show that a +generative mapper is better at fidelity and variability (FD) while a regression +mapper is slightly better at relevance (CS). Both objective and subjective +evaluation on two V2A datasets demonstrate the superiority of our proposed +method compared to current state-of-the-art approaches - trained with 86% fewer +parameters but achieving 53% and 19% improvement in FD and CS, respectively. + +
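+ A minimal regression-style mapper sketch: translate CLIP image embeddings
+into the CLAP embedding space, which could then condition a pretrained audio
+generator. Embedding sizes, the loss, and the paired (clip_emb, clap_emb) data
+are assumptions; the paper also studies a generative mapper not shown here.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+CLIP_DIM, CLAP_DIM = 768, 512                           # assumed dimensions
+mapper = nn.Sequential(nn.Linear(CLIP_DIM, 1024), nn.GELU(), nn.Linear(1024, CLAP_DIM))
+opt = torch.optim.AdamW(mapper.parameters(), lr=1e-4)
+
+for _ in range(100):
+    clip_emb = torch.randn(32, CLIP_DIM)                # stand-in for frozen CLIP image features
+    clap_emb = torch.randn(32, CLAP_DIM)                # stand-in for frozen CLAP audio features
+    pred = mapper(clip_emb)
+    loss = 1 - F.cosine_similarity(pred, clap_emb).mean()
+    opt.zero_grad(); loss.backward(); opt.step()
+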
+
+ comment: 13 pages, 10 figures. Demo page: https://v2a-mapper.github.io/ +
+
+
+
+
+ + ♻ ☆ Generalized Sum Pooling for Metric Learning ICCV + + +
+ A common architectural choice for deep metric learning is a convolutional +neural network followed by global average pooling (GAP). Albeit simple, GAP is +a highly effective way to aggregate information. One possible explanation for +the effectiveness of GAP is considering each feature vector as representing a +different semantic entity and GAP as a convex combination of them. Following +this perspective, we generalize GAP and propose a learnable generalized sum +pooling method (GSP). GSP improves GAP with two distinct abilities: i) the +ability to choose a subset of semantic entities, effectively learning to ignore +nuisance information, and ii) learning the weights corresponding to the +importance of each entity. Formally, we propose an entropy-smoothed optimal +transport problem and show that it is a strict generalization of GAP, i.e., a +specific realization of the problem gives back GAP. We show that this +optimization problem enjoys analytical gradients enabling us to use it as a +direct learnable replacement for GAP. We further propose a zero-shot loss to +ease the learning of GSP. We show the effectiveness of our method with +extensive evaluations on 4 popular metric learning benchmarks. Code is +available at: GSP-DML Framework + +
+
+ comment: Accepted as a conference paper at International Conference on + Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ Feature-Suppressed Contrast for Self-Supervised Food Pre-training ACM MM 2023 + + +
+ Most previous approaches for analyzing food images have relied on extensively
+annotated datasets, resulting in significant human labeling expenses due to the
+varied and intricate nature of such images. Inspired by the effectiveness of
+contrastive self-supervised methods in utilizing unlabelled data, we explore
+leveraging these techniques on unlabelled food images. In contrastive
+self-supervised methods, two views are randomly generated from an image by data
+augmentations. However, regarding food images, the two views tend to contain
+similar informative contents, causing large mutual information, which impedes
+the efficacy of contrastive self-supervised learning. To address this problem,
+we propose Feature Suppressed Contrast (FeaSC) to reduce mutual information
+between views. As the similar contents of the two views are salient or highly
+responsive in the feature map, the proposed FeaSC uses a response-aware scheme
+to localize salient features in an unsupervised manner. By suppressing some
+salient features in one view while leaving the other contrast view unchanged,
+the mutual information between the two views is reduced, thereby enhancing the
+effectiveness of contrastive learning for self-supervised food pre-training. As
+a plug-and-play module, the proposed method consistently improves BYOL and
+SimSiam by 1.70\% $\sim$ 6.69\% classification accuracy on four publicly
+available food recognition datasets. Superior results have also been achieved
+on downstream segmentation tasks, demonstrating the effectiveness of the
+proposed method.
+
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an + Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction ICCV + + +
+ Neural Radiance Fields (NeRFs) have revolutionized the field of novel view +synthesis, demonstrating remarkable performance. However, the modeling and +rendering of reflective objects remain challenging problems. Recent methods +have shown significant improvements over the baselines in handling reflective +scenes, albeit at the expense of efficiency. In this work, we aim to strike a +balance between efficiency and quality. To this end, we investigate an +implicit-explicit approach based on conventional volume rendering to enhance +the reconstruction quality and accelerate the training and rendering processes. +We adopt an efficient density-based grid representation and reparameterize the +reflected radiance in our pipeline. Our proposed reflection-aware approach +achieves a competitive quality efficiency trade-off compared to competing +methods. Based on our experimental results, we propose and discuss hypotheses +regarding the factors influencing the results of density-based methods for +reconstructing reflective objects. The source code is available at +https://github.com/gkouros/ref-dvgo. + +
+
+ comment: 5 pages, 4 figures, 3 tables, ICCV TRICKY 2023 Workshop +
+
+
+
+
+ + ♻ ☆ Residual Pattern Learning for Pixel-wise Out-of-Distribution Detection + in Semantic Segmentation ICCV'23 + + +
+ Semantic segmentation models classify pixels into a set of known +(``in-distribution'') visual classes. When deployed in an open world, the +reliability of these models depends on their ability not only to classify +in-distribution pixels but also to detect out-of-distribution (OoD) pixels. +Historically, the poor OoD detection performance of these models has motivated +the design of methods based on model re-training using synthetic training +images that include OoD visual objects. Although successful, these re-trained +methods have two issues: 1) their in-distribution segmentation accuracy may +drop during re-training, and 2) their OoD detection accuracy does not +generalise well to new contexts (e.g., country surroundings) outside the +training set (e.g., city surroundings). In this paper, we mitigate these issues +with: (i) a new residual pattern learning (RPL) module that assists the +segmentation model to detect OoD pixels without affecting the inlier +segmentation performance; and (ii) a novel context-robust contrastive learning +(CoroCL) that enforces RPL to robustly detect OoD pixels among various +contexts. Our approach improves by around 10\% FPR and 7\% AuPRC the previous +state-of-the-art in Fishyscapes, Segment-Me-If-You-Can, and RoadAnomaly +datasets. Our code is available at: https://github.com/yyliu01/RPL. + +
+
+ comment: The paper contains 16 pages and it is accepted by ICCV'23 +
+
+
+
+
+ + ♻ ☆ TMA: Temporal Motion Aggregation for Event-based Optical Flow ICCV2023 + + +
+ Event cameras have the ability to record continuous and detailed trajectories
+of objects with high temporal resolution, thereby providing intuitive motion
+cues for optical flow estimation. Nevertheless, most existing learning-based
+approaches for event optical flow estimation directly remould the paradigm of
+conventional images by representing the consecutive event stream as static
+frames, ignoring the inherent temporal continuity of event data. In this paper,
+we argue that temporal continuity is a vital element of event-based optical
+flow and propose a novel Temporal Motion Aggregation (TMA) approach to unlock
+its potential. Technically, TMA comprises three components: an event splitting
+strategy to incorporate intermediate motion information underlying the temporal
+context, a linear lookup strategy to align temporally fine-grained motion
+features, and a novel motion pattern aggregation module to emphasize consistent
+patterns for motion feature enhancement. By incorporating temporally
+fine-grained motion information, TMA can derive better flow estimates than
+existing methods at early stages, which not only enables TMA to obtain more
+accurate final predictions, but also greatly reduces the number of refinements
+required. Extensive experiments on the DSEC-Flow and MVSEC datasets verify the
+effectiveness and superiority of our TMA. Remarkably, compared to E-RAFT, TMA
+achieves a 6\% improvement in accuracy and a 40\% reduction in inference time
+on DSEC-Flow. Code will be available at \url{https://github.com/ispc-lab/TMA}.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in + Patients With Suspected Ischemic Stroke + + +
+ Precise and fast prediction methods for ischemic areas comprised of dead +tissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS) +patients are of significant clinical interest. They play an essential role in +improving diagnosis and treatment planning. Computed Tomography (CT) scan is +one of the primary modalities for early assessment in patients with suspected +AIS. CT Perfusion (CTP) is often used as a primary assessment to determine +stroke location, severity, and volume of ischemic lesions. Current automatic +segmentation methods for CTP mostly use already processed 3D parametric maps +conventionally used for clinical interpretation by radiologists as input. +Alternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time +input, where the spatial information over the volume is ignored. In addition, +these methods are only interested in segmenting core regions, while predicting +penumbra can be essential for treatment planning. This paper investigates +different methods to utilize the entire 4D CTP as input to fully exploit the +spatio-temporal information, leading us to propose a novel 4D convolution +layer. Our comprehensive experiments on a local dataset of 152 patients divided +into three groups show that our proposed models generate more precise results +than other methods explored. Adopting the proposed 4D mJ-Net, a Dice +Coefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core +areas, respectively. The code is available on +https://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git. + +
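+ PyTorch has no native Conv4d, so one common way to exploit the full 4D CTP
+signal (x, y, z, t) is to factorise the operation into a shared 3D convolution
+per time point followed by a 1D convolution across time. The sketch below only
+illustrates that general idea under these assumptions; it is not the paper's
+exact 4D mJ-Net layer, and the class name is hypothetical.
+
+    import torch
+    import torch.nn as nn
+
+    class FactorizedConv4d(nn.Module):
+        """Shared 3D conv per time step followed by a 1D conv across time."""
+
+        def __init__(self, in_ch, out_ch, k_spatial=3, k_time=3):
+            super().__init__()
+            self.spatial = nn.Conv3d(in_ch, out_ch, k_spatial, padding=k_spatial // 2)
+            self.temporal = nn.Conv1d(out_ch, out_ch, k_time, padding=k_time // 2)
+
+        def forward(self, x):                      # x: (B, C, T, D, H, W)
+            b, c, t, d, h, w = x.shape
+            y = x.transpose(1, 2).reshape(b * t, c, d, h, w)
+            y = self.spatial(y)                    # per-time-step 3D convolution
+            _, c2, d2, h2, w2 = y.shape
+            y = y.reshape(b, t, c2, d2, h2, w2).permute(0, 3, 4, 5, 2, 1)
+            y = y.reshape(-1, c2, t)
+            y = self.temporal(y)                   # mix information across time
+            y = y.reshape(b, d2, h2, w2, c2, t).permute(0, 4, 5, 1, 2, 3)
+            return y                               # (B, out_ch, T, D, H, W)
+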
+
+
+
+
+ + ♻ ☆ 3DPortraitGAN: Learning One-Quarter Headshot 3D GANs from a Single-View + Portrait Dataset with Diverse Body Poses + + +
+ 3D-aware face generators are typically trained on 2D real-life face image +datasets that primarily consist of near-frontal face data, and as such, they +are unable to construct one-quarter headshot 3D portraits with complete head, +neck, and shoulder geometry. Two reasons account for this issue: First, +existing facial recognition methods struggle with extracting facial data +captured from large camera angles or back views. Second, it is challenging to +learn a distribution of 3D portraits covering the one-quarter headshot region +from single-view data due to significant geometric deformation caused by +diverse body poses. To this end, we first create the dataset +360{\deg}-Portrait-HQ (360{\deg}PHQ for short) which consists of high-quality +single-view real portraits annotated with a variety of camera parameters (the +yaw angles span the entire 360{\deg} range) and body poses. We then propose +3DPortraitGAN, the first 3D-aware one-quarter headshot portrait generator that +learns a canonical 3D avatar distribution from the 360{\deg}PHQ dataset with +body pose self-learning. Our model can generate view-consistent portrait images +from all camera angles with a canonical one-quarter headshot 3D representation. +Our experiments show that the proposed framework can accurately predict +portrait body poses and generate view-consistent, realistic portrait images +with complete geometry from all camera angles. + +
+
+
+
+
+ + ♻ ☆ Transformer-based Detection of Microorganisms on High-Resolution Petri + Dish Images ICCV + + +
+ Many medical or pharmaceutical processes have strict guidelines regarding +continuous hygiene monitoring. This often involves the labor-intensive task of +manually counting microorganisms in Petri dishes by trained personnel. +Automation attempts often struggle due to major challenges: significant scaling +differences, low separation, low contrast, etc. To address these challenges, we +introduce AttnPAFPN, a high-resolution detection pipeline that leverages a +novel transformer variation, the efficient-global self-attention mechanism. Our +streamlined approach can be easily integrated in almost any multi-scale object +detection pipeline. In a comprehensive evaluation on the publicly available +AGAR dataset, we demonstrate the superior accuracy of our network over the +current state-of-the-art. In order to demonstrate the task-independent +performance of our approach, we perform further experiments on COCO and +LIVECell datasets. + +
+
+ comment: This paper has been accepted at IEEE International Conference on + Computer Vision Workshops (ICCV workshop), 2023 +
+
+
+
+
+ + ♻ ☆ AvatarCraft: Transforming Text into Neural Human Avatars with + Parameterized Shape and Pose Control ICCV 2023 + + +
+ Neural implicit fields are powerful for representing 3D scenes and generating +high-quality novel views, but it remains challenging to use such implicit +representations for creating a 3D human avatar with a specific identity and +artistic style that can be easily animated. Our proposed method, AvatarCraft, +addresses this challenge by using diffusion models to guide the learning of +geometry and texture for a neural avatar based on a single text prompt. We +carefully design the optimization framework of neural implicit fields, +including a coarse-to-fine multi-bounding box training strategy, shape +regularization, and diffusion-based constraints, to produce high-quality +geometry and texture. Additionally, we make the human avatar animatable by +deforming the neural implicit field with an explicit warping field that maps +the target human mesh to a template human mesh, both represented using +parametric human models. This simplifies animation and reshaping of the +generated avatar by controlling pose and shape parameters. Extensive +experiments on various text descriptions show that AvatarCraft is effective and +robust in creating human avatars and rendering novel views, poses, and shapes. +Our project page is: https://avatar-craft.github.io/. + +
+
+ comment: ICCV 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ Boosting Salient Object Detection with Transformer-based Asymmetric + Bilateral U-Net + + +
+ Existing salient object detection (SOD) methods mainly rely on U-shaped +convolution neural networks (CNNs) with skip connections to combine the global +contexts and local spatial details that are crucial for locating salient +objects and refining object details, respectively. Despite great successes, the +ability of CNNs in learning global contexts is limited. Recently, the vision +transformer has achieved revolutionary progress in computer vision owing to its +powerful modeling of global dependencies. However, directly applying the +transformer to SOD is suboptimal because the transformer lacks the ability to +learn local spatial representations. To this end, this paper explores the +combination of transformers and CNNs to learn both global and local +representations for SOD. We propose a transformer-based Asymmetric Bilateral +U-Net (ABiU-Net). The asymmetric bilateral encoder has a transformer path and a +lightweight CNN path, where the two paths communicate at each encoder stage to +learn complementary global contexts and local spatial details, respectively. +The asymmetric bilateral decoder also consists of two paths to process features +from the transformer and CNN encoder paths, with communication at each decoder +stage for decoding coarse salient object locations and fine-grained object +details, respectively. Such communication between the two encoder/decoder paths +enables AbiU-Net to learn complementary global and local representations, +taking advantage of the natural merits of transformers and CNNs, respectively. +Hence, ABiU-Net provides a new perspective for transformer-based SOD. Extensive +experiments demonstrate that ABiU-Net performs favorably against previous +state-of-the-art SOD methods. The code is available at +https://github.com/yuqiuyuqiu/ABiU-Net. + +
+
+ comment: Accepted by IEEE Transactions on Circuits and Systems for Video + Technology (TCSVT) +
+
+
+
+
+ + ♻ ☆ Adaptive Superpixel for Active Learning in Semantic Segmentation + + +
+ Learning semantic segmentation requires pixel-wise annotations, which can be +time-consuming and expensive. To reduce the annotation cost, we propose a +superpixel-based active learning (AL) framework, which collects a dominant +label per superpixel instead. To be specific, it consists of adaptive +superpixel and sieving mechanisms, fully dedicated to AL. At each round of AL, +we adaptively merge neighboring pixels of similar learned features into +superpixels. We then query a selected subset of these superpixels using an +acquisition function assuming no uniform superpixel size. This approach is more +efficient than existing methods, which rely only on innate features such as RGB +color and assume uniform superpixel sizes. Obtaining a dominant label per +superpixel drastically reduces annotators' burden as it requires fewer clicks. +However, it inevitably introduces noisy annotations due to mismatches between +superpixel and ground truth segmentation. To address this issue, we further +devise a sieving mechanism that identifies and excludes potentially noisy +annotations from learning. Our experiments on both Cityscapes and PASCAL VOC +datasets demonstrate the efficacy of adaptive superpixel and sieving +mechanisms. + +
+
+
+
+
+ + ♻ ☆ Decoupled Iterative Refinement Framework for Interacting Hands + Reconstruction from a Single RGB Image ICCV 2023 + + +
+ Reconstructing interacting hands from a single RGB image is a very
+challenging task. On the one hand, severe mutual occlusion and similar local
+appearance between two hands confuse the extraction of visual features,
+resulting in the misalignment of estimated hand meshes and the image. On the
+other hand, there are complex spatial relationships between interacting hands,
+which significantly enlarge the solution space of hand poses and increase the
+difficulty of network learning. In this paper, we propose a decoupled
+iterative refinement framework to achieve pixel-aligned hand reconstruction
+while efficiently modeling the spatial relationship between hands.
+Specifically, we define two feature spaces with different characteristics,
+namely the 2D visual feature space and the 3D joint feature space. First, we
+obtain joint-wise features from the visual feature map and utilize a graph
+convolution network and a transformer to perform intra- and inter-hand
+information interaction in the 3D joint feature space, respectively. Then, we
+project the joint features with global information back into the 2D visual
+feature space in an obfuscation-free manner and utilize 2D convolution for
+pixel-wise enhancement. By performing multiple alternate enhancements in the
+two feature spaces, our method can achieve an accurate and robust
+reconstruction of interacting hands. Our method outperforms all existing
+two-hand reconstruction methods by a large margin on the InterHand2.6M dataset.
+
+
+ comment: Accepted to ICCV 2023 (Oral) +
+
+
+
+
+ + ♻ ☆ Self-Reference Deep Adaptive Curve Estimation for Low-Light Image + Enhancement + + +
+ In this paper, we propose a 2-stage low-light image enhancement method called +Self-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage, +we present an intuitive, lightweight, fast, and unsupervised luminance +enhancement algorithm. The algorithm is based on a novel low-light enhancement +curve that can be used to locally boost image brightness. We also propose a new +loss function with a simplified physical model designed to preserve natural +images' color, structure, and fidelity. We use a vanilla CNN to map each pixel +through deep Adaptive Adjustment Curves (AAC) while preserving the local image +structure. Secondly, we introduce the corresponding denoising scheme to remove +the latent noise in the darkness. We approximately model the noise in the dark +and deploy a Denoising-Net to estimate and remove the noise after the first +stage. Exhaustive qualitative and quantitative analysis shows that our method +outperforms existing state-of-the-art algorithms on multiple real-world +datasets. + +
+
+
+
+
+ + ♻ ☆ Dataset Distillation Using Parameter Pruning + + +
+ In this study, we propose a novel dataset distillation method based on +parameter pruning. The proposed method can synthesize more robust distilled +datasets and improve distillation performance by pruning difficult-to-match +parameters during the distillation process. Experimental results on two +benchmark datasets show the superiority of the proposed method. + +
+
+ comment: Published as a journal paper at IEICE Trans. Fund +
+
+
+
+
+ + ♻ ☆ Agent-Centric Relation Graph for Object Visual Navigation + + +
+ Object visual navigation aims to steer an agent toward a target object based
+on visual observations. It is highly desirable to reasonably perceive the
+environment and accurately control the agent. In the navigation task, we
+introduce an Agent-Centric Relation Graph (ACRG) for learning the visual
+representation based on the relationships in the environment. ACRG is a highly
+effective structure that consists of two relationships, i.e., the horizontal
+relationship among objects and the distance relationship between the agent and
+objects. On the one hand, we design the Object Horizontal Relationship Graph
+(OHRG) that stores the relative horizontal locations among objects. On the
+other hand, we propose the Agent-Target Distance Relationship Graph (ATDRG)
+that enables the agent to perceive the distance between the target and
+objects. For ATDRG, we utilize image depth to obtain the target distance and
+employ the vertical location to capture the distance relationship among
+objects in the vertical direction. With the above graphs, the agent can
+perceive the environment and output navigation actions. Experimental results
+in the artificial environment AI2-THOR demonstrate that ACRG significantly
+outperforms other state-of-the-art methods in unseen testing environments.
+
+
+ comment: 16 pages, 13 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop + Visual Reasoning + + +
+ A promising trend is emerging of using large language models (LLMs) to
+generate code-like plans for complex inference tasks such as visual reasoning.
+This paradigm, known as LLM-based planning, provides flexibility in problem
+solving and offers better interpretability. However, current research is mostly
+limited to basic scenarios of simple questions that can be answered
+straightforwardly in a few inference steps. Planning for the more challenging
+multi-hop visual reasoning tasks remains under-explored. Specifically, under
+multi-hop reasoning situations, the trade-off between accuracy and the
+complexity of plan-searching becomes prominent. The prevailing algorithms
+either address the efficiency issue by employing fast one-stop generation or
+adopt a complex iterative generation method to improve accuracy. Both fail to
+balance the need for efficiency and performance. Drawing inspiration from the
+dual system of cognition in the human brain, the fast and the slow thinking
+processes, we propose a hierarchical plan-searching algorithm that integrates
+one-stop reasoning (fast) and the Tree-of-thought (slow). Our approach achieves
+strong performance while significantly reducing the number of inference steps.
+Moreover, we repurpose the PTR and the CLEVER datasets, developing a systematic
+framework for evaluating the performance and efficiency of LLM-based
+plan-search algorithms on reasoning tasks at different levels of difficulty.
+Extensive experiments demonstrate the superiority of our proposed algorithm in
+terms of performance and efficiency. The dataset and code will be released
+soon.
+
+
+ comment: 16 pages,1 figures, under review +
+
+
+
+
+ + ♻ ☆ Self-distillation Regularized Connectionist Temporal Classification Loss + for Text Recognition: A Simple Yet Effective Approach + + +
+ Text recognition methods are developing rapidly. Some advanced techniques,
+e.g., powerful modules, language models, and un- and semi-supervised learning
+schemes, continually push the performance on public benchmarks forward.
+However, the problem of how to better optimize a text recognition model from
+the perspective of loss functions is largely overlooked. CTC-based methods,
+widely used in practice due to their good balance between performance and
+inference speed, still grapple with accuracy degradation. This is because CTC
+loss emphasizes the optimization of the entire sequence target while
+neglecting to learn individual characters. We propose a self-distillation
+scheme for CTC-based models to address this issue. It incorporates a framewise
+regularization term in the CTC loss to emphasize individual supervision, and
+leverages the maximum a posteriori estimate of the latent alignment to solve
+the inconsistency problem that arises in distillation between CTC-based
+models. We refer to the regularized CTC loss as Distillation Connectionist
+Temporal Classification (DCTC) loss. DCTC loss is module-free, requiring no
+extra parameters, no additional inference lag, and no additional training data
+or phases. Extensive experiments on public benchmarks demonstrate that DCTC
+can boost text recognition model accuracy by up to 2.6%, without any of these
+drawbacks.
+
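+ At its core, the idea above adds a framewise term on top of the standard CTC
+objective. The sketch below is a rough, simplified stand-in, assuming PyTorch
+and using the model's own greedy alignment as framewise pseudo-labels; the
+actual DCTC loss derives them from the maximum a posteriori latent alignment
+and handles teacher-student inconsistency, which is not shown here.
+
+    import torch
+    import torch.nn.functional as F
+
+    def ctc_with_framewise_regularizer(log_probs, targets, input_lens,
+                                       target_lens, lam=0.1):
+        """log_probs: (T, B, V) log-softmax outputs of the recogniser."""
+        ctc = F.ctc_loss(log_probs, targets, input_lens, target_lens,
+                         blank=0, zero_infinity=True)
+        # Framewise pseudo-labels from the model's own most likely path
+        # (a greedy stand-in for the MAP latent alignment).
+        with torch.no_grad():
+            frame_labels = log_probs.argmax(dim=-1)          # (T, B)
+        framewise = F.nll_loss(log_probs.reshape(-1, log_probs.size(-1)),
+                               frame_labels.reshape(-1))
+        return ctc + lam * framewise
+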
+
+ comment: Ziyin Zhang and Ning Lu are co-first authors +
+
+
+
+
+ + ♻ ☆ SelfDocSeg: A Self-Supervised vision-based Approach towards Document + Segmentation ICDAR 2023 + + +
+ Document layout analysis is a well-known problem in the document research
+community and has been vastly explored, yielding a multitude of solutions
+ranging from text mining and recognition to graph-based representation, visual
+feature extraction, etc. However, most existing works have ignored the crucial
+fact of the scarcity of labeled data. With the growing reach of the internet
+into personal life, an enormous number of documents has become available in
+the public domain, making data annotation a tedious task. We address this
+challenge using self-supervision and, unlike the few existing self-supervised
+document segmentation approaches which use text mining and textual labels, we
+use a completely vision-based approach in pre-training without any
+ground-truth label or its derivative. Instead, we generate pseudo-layouts from
+the document images to pre-train an image encoder to learn the document object
+representation and localization in a self-supervised framework before
+fine-tuning it with an object detection model. We show that our pipeline sets
+a new benchmark in this context and performs on par with the existing methods
+and the supervised counterparts, if not better. The code is made publicly
+available at: https://github.com/MaitySubhajit/SelfDocSeg
+
+
+ comment: Accepted at The 17th International Conference on Document Analysis + and Recognition (ICDAR 2023) +
+
+
+
+
+ + ♻ ☆ PCT-CycleGAN: Paired Complementary Temporal Cycle-Consistent Adversarial + Networks for Radar-Based Precipitation Nowcasting CIKM 2023 + + +
+ Precipitation nowcasting methods have been elaborated over the centuries
+because rain has a crucial impact on human life. Beyond quantitative
+precipitation forecast (QPF) models and convolutional long short-term memory
+(ConvLSTM), various sophisticated methods such as the latest MetNet-2 are also
+emerging. In this paper, we propose paired complementary temporal
+cycle-consistent adversarial networks (PCT-CycleGAN) for radar-based
+precipitation nowcasting, inspired by cycle-consistent adversarial networks
+(CycleGAN), which show strong performance in image-to-image translation.
+PCT-CycleGAN generates temporal causality using two generator networks with
+forward and backward temporal dynamics in paired complementary cycles. Each
+generator network learns a huge number of one-to-one mappings over
+time-dependent radar-based precipitation data to approximate a mapping function
+representing the temporal dynamics in each direction. To create robust temporal
+causality between the paired complementary cycles, a novel connection loss is
+proposed, along with a torrential loss that covers exceptionally heavy rain
+events. The generator network that learns forward temporal dynamics in
+PCT-CycleGAN generates radar-based precipitation data 10 minutes ahead of the
+current time. It also provides reliable predictions of up to 2 hours with
+iterative forecasting. The superiority of PCT-CycleGAN is demonstrated through
+qualitative and quantitative comparisons with several previous methods.
+
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Environment-Invariant Curriculum Relation Learning for Fine-Grained + Scene Graph Generation ICCV2023 + + +
+ The scene graph generation (SGG) task is designed to identify predicates
+based on subject-object pairs. However, existing datasets generally include
+two imbalance cases: one is the class imbalance of the predicted predicates
+and the other is the context imbalance of the given subject-object pairs,
+which presents significant challenges for SGG. Most existing methods focus on
+the imbalance of the predicted predicates while ignoring the imbalance of the
+subject-object pairs, and thus cannot achieve satisfactory results. To address
+the two imbalance cases, we propose a novel Environment Invariant Curriculum
+Relation learning (EICR) method, which can be applied in a plug-and-play
+fashion to existing SGG methods. Concretely, to remove the imbalance of the
+subject-object pairs, we first construct different distribution environments
+for the subject-object pairs and learn a model invariant to the environment
+changes. Then, we construct a class-balanced curriculum learning strategy to
+balance the different environments and remove the predicate imbalance.
+Comprehensive experiments conducted on the VG and GQA datasets demonstrate
+that our EICR framework can serve as a general strategy for various SGG
+models and achieves significant improvements.
+
+
+ comment: ICCV2023. arXiv admin note: text overlap with arXiv:2203.11654 by + other authors +
+
+
+
+
+ + ♻ ☆ Unified Visual Relationship Detection with Vision and Language Models ICCV 2023 + + +
+ This work focuses on training a single visual relationship detector +predicting over the union of label spaces from multiple datasets. Merging +labels spanning different datasets could be challenging due to inconsistent +taxonomies. The issue is exacerbated in visual relationship detection when +second-order visual semantics are introduced between pairs of objects. To +address this challenge, we propose UniVRD, a novel bottom-up method for Unified +Visual Relationship Detection by leveraging vision and language models (VLMs). +VLMs provide well-aligned image and text embeddings, where similar +relationships are optimized to be close to each other for semantic unification. +Our bottom-up design enables the model to enjoy the benefit of training with +both object detection and visual relationship datasets. Empirical results on +both human-object interaction detection and scene-graph generation demonstrate +the competitive performance of our model. UniVRD achieves 38.07 mAP on +HICO-DET, outperforming the current best bottom-up HOI detector by 14.26 mAP. +More importantly, we show that our unified detector performs as well as +dataset-specific models in mAP, and achieves further improvements when we scale +up the model. Our code will be made publicly available on GitHub. + +
+
+ comment: Accepted to ICCV 2023. Code is available at + https://github.com/google-research/scenic/tree/main/scenic/projects/univrd +
+
+
+
+
+ + ♻ ☆ SATR: Zero-Shot Semantic Segmentation of 3D Shapes + + +
+ We explore the task of zero-shot semantic segmentation of 3D shapes by using +large-scale off-the-shelf 2D image recognition models. Surprisingly, we find +that modern zero-shot 2D object detectors are better suited for this task than +contemporary text/image similarity predictors or even zero-shot 2D segmentation +networks. Our key finding is that it is possible to extract accurate 3D +segmentation maps from multi-view bounding box predictions by using the +topological properties of the underlying surface. For this, we develop the +Segmentation Assignment with Topological Reweighting (SATR) algorithm and +evaluate it on ShapeNetPart and our proposed FAUST benchmarks. SATR achieves +state-of-the-art performance and outperforms a baseline algorithm by 1.3% and +4% average mIoU on the FAUST coarse and fine-grained benchmarks, respectively, +and by 5.2% average mIoU on the ShapeNetPart benchmark. Our source code and +data will be publicly released. Project webpage: +https://samir55.github.io/SATR/. + +
+
+ comment: Project webpage: https://samir55.github.io/SATR/ +
+
+
+
+
+ + ♻ ☆ To pretrain or not to pretrain? A case study of domain-specific + pretraining for semantic segmentation in histopathology + + +
+ Annotating medical imaging datasets is costly, so fine-tuning (or transfer
+learning) is the most effective method for digital pathology vision
+applications such as disease classification and semantic segmentation. However,
+due to texture bias in models trained on real-world images, transfer learning
+for histopathology applications might result in underperforming models, which
+necessitates using unlabeled histopathology data and self-supervised methods to
+discover domain-specific characteristics. Here, we tested the premise that
+histopathology-specific pretrained models provide better initializations for
+pathology vision tasks, i.e., gland and cell segmentation. In this study, we
+compare the performance of gland and cell segmentation tasks with
+histopathology domain-specific and non-domain-specific (real-world images)
+pretrained weights. Moreover, we investigate the dataset size at which
+domain-specific pretraining produces significant gains in performance. In
+addition, we investigate whether domain-specific initialization improves
+out-of-distribution performance on distinct datasets for the same task. The
+results indicate that the performance gain from domain-specific pretrained
+weights depends on both the task and the size of the training dataset. For
+limited dataset sizes, a significant improvement in gland segmentation
+performance was observed, whereas models trained on cell segmentation datasets
+exhibited no improvement.
+
+
+
+
+
+ + ♻ ☆ A Man-in-the-Middle Attack against Object Detection Systems + + +
+ Object detection systems using deep learning models have become increasingly
+popular in robotics thanks to the rising power of CPUs and GPUs in embedded
+systems. However, these models are susceptible to adversarial attacks. While
+some attacks are limited by strict assumptions on access to the detection
+system, we propose a novel hardware attack inspired by Man-in-the-Middle
+attacks in cryptography. This attack generates a Universal Adversarial
+Perturbation (UAP) and then injects the perturbation between the USB camera
+and the detection system via a hardware attack. Moreover, prior research has
+been misled by an evaluation metric that measures model accuracy rather than
+attack performance. In combination with our proposed evaluation metrics, we
+significantly increase the strength of adversarial perturbations. These
+findings raise serious concerns for applications of deep learning models in
+safety-critical systems, such as autonomous driving.
+
+
+ comment: 6 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ ADASSM: Adversarial Data Augmentation in Statistical Shape Models From + Images + + +
+ Statistical shape models (SSM) have been well-established as an excellent +tool for identifying variations in the morphology of anatomy across the +underlying population. Shape models use consistent shape representation across +all the samples in a given cohort, which helps to compare shapes and identify +the variations that can detect pathologies and help in formulating treatment +plans. In medical imaging, computing these shape representations from CT/MRI +scans requires time-intensive preprocessing operations, including but not +limited to anatomy segmentation annotations, registration, and texture +denoising. Deep learning models have demonstrated exceptional capabilities in +learning shape representations directly from volumetric images, giving rise to +highly effective and efficient Image-to-SSM networks. Nevertheless, these +models are data-hungry and due to the limited availability of medical data, +deep learning models tend to overfit. Offline data augmentation techniques, +that use kernel density estimation based (KDE) methods for generating +shape-augmented samples, have successfully aided Image-to-SSM networks in +achieving comparable accuracy to traditional SSM methods. However, these +augmentation methods focus on shape augmentation, whereas deep learning models +exhibit image-based texture bias resulting in sub-optimal models. This paper +introduces a novel strategy for on-the-fly data augmentation for the +Image-to-SSM framework by leveraging data-dependent noise generation or texture +augmentation. The proposed framework is trained as an adversary to the +Image-to-SSM network, augmenting diverse and challenging noisy samples. Our +approach achieves improved accuracy by encouraging the model to focus on the +underlying geometry rather than relying solely on pixel values. + +
+
+
+
+
+ + ♻ ☆ LOPR: Latent Occupancy PRediction using Generative Models + + +
+ Environment prediction frameworks are integral for autonomous vehicles, +enabling safe navigation in dynamic environments. LiDAR generated occupancy +grid maps (L-OGMs) offer a robust bird's eye-view scene representation that +facilitates joint scene predictions without relying on manual labeling unlike +commonly used trajectory prediction frameworks. Prior approaches have optimized +deterministic L-OGM prediction architectures directly in grid cell space. While +these methods have achieved some degree of success in prediction, they +occasionally grapple with unrealistic and incorrect predictions. We claim that +the quality and realism of the forecasted occupancy grids can be enhanced with +the use of generative models. We propose a framework that decouples occupancy +prediction into: representation learning and stochastic prediction within the +learned latent space. Our approach allows for conditioning the model on other +available sensor modalities such as RGB-cameras and high definition maps. We +demonstrate that our approach achieves state-of-the-art performance and is +readily transferable between different robotic platforms on the real-world +NuScenes, Waymo Open, and a custom dataset we collected on an experimental +vehicle platform. + +
+
+
+
+
+
+
+
+ + Information Retrieval 20 + +
+
+
+ + ☆ Leveraging Large Language Models for Pre-trained Recommender Systems + + +
+ Recent advancements in recommendation systems have shifted towards more +comprehensive and personalized recommendations by utilizing large language +models (LLM). However, effectively integrating LLM's commonsense knowledge and +reasoning abilities into recommendation systems remains a challenging problem. +In this paper, we propose RecSysLLM, a novel pre-trained recommendation model +based on LLMs. RecSysLLM retains LLM reasoning and knowledge while integrating +recommendation domain knowledge through unique designs of data, training, and +inference. This allows RecSysLLM to leverage LLMs' capabilities for +recommendation tasks in an efficient, unified framework. We demonstrate the +effectiveness of RecSysLLM on benchmarks and real-world scenarios. RecSysLLM +provides a promising approach to developing unified recommendation systems by +fully exploiting the power of pre-trained language models. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Enhancing Recommender Systems with Large Language Model Reasoning Graphs + + +
+ Recommendation systems aim to provide users with relevant suggestions, but +often lack interpretability and fail to capture higher-level semantic +relationships between user behaviors and profiles. In this paper, we propose a +novel approach that leverages large language models (LLMs) to construct +personalized reasoning graphs. These graphs link a user's profile and +behavioral sequences through causal and logical inferences, representing the +user's interests in an interpretable way. Our approach, LLM reasoning graphs +(LLMRG), has four components: chained graph reasoning, divergent extension, +self-verification and scoring, and knowledge base self-improvement. The +resulting reasoning graph is encoded using graph neural networks, which serves +as additional input to improve conventional recommender systems, without +requiring extra user or item information. Our approach demonstrates how LLMs +can enable more logical and interpretable recommender systems through +personalized reasoning graphs. LLMRG allows recommendations to benefit from +both engineered recommendation systems and LLM-derived reasoning graphs. We +demonstrate the effectiveness of LLMRG on benchmarks and real-world scenarios +in enhancing base recommendation models. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ DynED: Dynamic Ensemble Diversification in Data Stream Classification CIKM '23 + + +
+ Ensemble methods are commonly used in classification due to their remarkable
+performance. Achieving high accuracy in a data stream environment is a
+challenging task, considering disruptive changes in the data distribution, also
+known as concept drift. A greater diversity of ensemble components is known to
+enhance prediction accuracy in such settings. Despite the diversity of
+components within an ensemble, not all contribute as expected to its overall
+performance. This necessitates a method for selecting components that exhibit
+high performance and diversity. We present a novel ensemble construction and
+maintenance approach based on MMR (Maximal Marginal Relevance) that dynamically
+combines the diversity and prediction accuracy of components during the process
+of structuring an ensemble. The experimental results on four real and 11
+synthetic datasets demonstrate that the proposed approach (DynED) provides a
+higher average mean accuracy compared to five state-of-the-art baselines.
+
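+ Maximal Marginal Relevance is a standard greedy trade-off between relevance
+(here, a component's recent accuracy) and redundancy (its similarity to
+already selected components). A small illustrative sketch in Python; the
+accuracy and pairwise-similarity inputs are assumed to be precomputed, and the
+parameter names are hypothetical rather than DynED's exact interface.
+
+    def mmr_select(accuracy, similarity, k, lam=0.7):
+        """Greedily pick k ensemble components balancing accuracy and diversity.
+
+        accuracy   : dict {component_id: recent accuracy}
+        similarity : dict {(id_a, id_b): prediction similarity in [0, 1]}
+        """
+        def sim(a, b):
+            return similarity.get((a, b), similarity.get((b, a), 0.0))
+
+        selected, candidates = [], set(accuracy)
+        while candidates and len(selected) < k:
+            best = max(candidates,
+                       key=lambda c: lam * accuracy[c]
+                       - (1 - lam) * max((sim(c, s) for s in selected), default=0.0))
+            selected.append(best)
+            candidates.remove(best)
+        return selected
+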
+
+ comment: Proceedings of the 32nd ACM International Conference on Information + and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United + Kingdom +
+
+
+
+
+ + ☆ LSCPM: communities in massive real-world Link Streams by Clique + Percolation Method + + +
+ Community detection is a popular approach to understand the organization of
+interactions in static networks. For that purpose, the Clique Percolation
+Method (CPM), which involves the percolation of k-cliques, is a well-studied
+technique that offers several advantages. Moreover, studying interactions that
+occur over time is useful in various contexts, and these can be modeled by the
+link stream formalism. The Dynamic Clique Percolation Method (DCPM) has been
+proposed for extending CPM to temporal networks.
+  However, existing implementations are unable to handle massive datasets. We
+present a novel algorithm that adapts CPM to link streams and allows us to
+speed up the computation with respect to the existing DCPM method. We evaluate
+it experimentally on real datasets and show that it scales to massive link
+streams. For example, it obtains a complete set of communities in under
+twenty-five minutes for a dataset with thirty million links, which the state of
+the art fails to achieve even after a week of computation. We further show that
+our method provides communities similar to DCPM, but slightly more aggregated.
+We exhibit the relevance of the obtained communities in real-world cases, and
+show that they provide information on the importance of vertices in the link
+streams.
+
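+ For the static starting point, the k-clique percolation communities described
+above (maximal unions of k-cliques that share k-1 nodes) can be computed with
+an off-the-shelf routine, as in the sketch below; the link-stream extension
+and the scalability improvements of LSCPM are not covered by this example.
+
+    import networkx as nx
+    from networkx.algorithms.community import k_clique_communities
+
+    G = nx.karate_club_graph()
+    # Each community is a union of adjacent 3-cliques (triangles sharing an edge).
+    communities = [frozenset(c) for c in k_clique_communities(G, 3)]
+    for i, com in enumerate(communities):
+        print(f"community {i}: {sorted(com)}")
+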
+
+ comment: 18 pages, 7 figures, to be published in 30th International Symposium + on Temporal Representation and Reasoning (TIME 2023) +
+
+
+
+
+ + ☆ A Topology-aware Analysis of Graph Collaborative Filtering + + +
+ The successful integration of graph neural networks into recommender systems +(RSs) has led to a novel paradigm in collaborative filtering (CF), graph +collaborative filtering (graph CF). By representing user-item data as an +undirected, bipartite graph, graph CF utilizes short- and long-range +connections to extract collaborative signals that yield more accurate user +preferences than traditional CF methods. Although the recent literature +highlights the efficacy of various algorithmic strategies in graph CF, the +impact of datasets and their topological features on recommendation performance +is yet to be studied. To fill this gap, we propose a topology-aware analysis of +graph CF. In this study, we (i) take some widely-adopted recommendation +datasets and use them to generate a large set of synthetic sub-datasets through +two state-of-the-art graph sampling methods, (ii) measure eleven of their +classical and topological characteristics, and (iii) estimate the accuracy +calculated on the generated sub-datasets considering four popular and recent +graph-based RSs (i.e., LightGCN, DGCF, UltraGCN, and SVD-GCN). Finally, the +investigation presents an explanatory framework that reveals the linear +relationships between characteristics and accuracy measures. The results, +statistically validated under different graph sampling settings, confirm the +existence of solid dependencies between topological characteristics and +accuracy in the graph-based recommendation, offering a new perspective on how +to interpret graph CF. + +
+
+
+
+
+ + ☆ DepreSym: A Depression Symptom Annotated Corpus and the Role of LLMs as + Assessors of Psychological Markers + + +
+ Computational methods for depression detection aim to mine traces of +depression from online publications posted by Internet users. However, +solutions trained on existing collections exhibit limited generalisation and +interpretability. To tackle these issues, recent studies have shown that +identifying depressive symptoms can lead to more robust models. The eRisk +initiative fosters research on this area and has recently proposed a new +ranking task focused on developing search methods to find sentences related to +depressive symptoms. This search challenge relies on the symptoms specified by +the Beck Depression Inventory-II (BDI-II), a questionnaire widely used in +clinical practice. Based on the participant systems' results, we present the +DepreSym dataset, consisting of 21580 sentences annotated according to their +relevance to the 21 BDI-II symptoms. The labelled sentences come from a pool of +diverse ranking methods, and the final dataset serves as a valuable resource +for advancing the development of models that incorporate depressive markers +such as clinical symptoms. Due to the complex nature of this relevance +annotation, we designed a robust assessment methodology carried out by three +expert assessors (including an expert psychologist). Additionally, we explore +here the feasibility of employing recent Large Language Models (ChatGPT and +GPT4) as potential assessors in this complex task. We undertake a comprehensive +examination of their performance, determine their main limitations and analyze +their role as a complement or replacement for human annotators. + +
+
+
+
+
+ + ☆ Contrastive Graph Prompt-tuning for Cross-domain Recommendation + + +
+ Recommender systems are frequently challenged by the data sparsity problem. +One approach to mitigate this issue is through cross-domain recommendation +techniques. In a cross-domain context, sharing knowledge between domains can +enhance the effectiveness in the target domain. Recent cross-domain methods +have employed a pre-training approach, but we argue that these methods often +result in suboptimal fine-tuning, especially with large neural models. Modern +language models utilize prompts for efficient model tuning. Such prompts act as +a tunable latent vector, allowing for the freezing of the main model +parameters. In our research, we introduce the Personalised Graph Prompt-based +Recommendation (PGPRec) framework. This leverages the advantages of +prompt-tuning. Within this framework, we formulate personalized graph prompts +item-wise, rooted in items that a user has previously engaged with. +Specifically, we employ Contrastive Learning (CL) to produce pre-trained +embeddings that offer greater generalizability in the pre-training phase, +ensuring robust training during the tuning phase. Our evaluation of PGPRec in +cross-domain scenarios involves comprehensive testing with the top-k +recommendation tasks and a cold-start analysis. Our empirical findings, based +on four Amazon Review datasets, reveal that the PGPRec framework can decrease +the tuned parameters by as much as 74%, maintaining competitive performance. +Remarkably, there's an 11.41% enhancement in performance against the leading +baseline in cold-start situations. + +
+
+
+
+
+ + ☆ Evaluating Temporal Persistence Using Replicability Measures + + +
+ In real-world Information Retrieval (IR) experiments, the Evaluation +Environment (EE) is exposed to constant change. Documents are added, removed, +or updated, and the information need and the search behavior of users is +evolving. Simultaneously, IR systems are expected to retain a consistent +quality. The LongEval Lab seeks to investigate the longitudinal persistence of +IR systems, and in this work, we describe our participation. We submitted runs +of five advanced retrieval systems, namely a Reciprocal Rank Fusion (RRF) +approach, ColBERT, monoT5, Doc2Query, and E5, to both sub-tasks. Further, we +cast the longitudinal evaluation as a replicability study to better understand +the temporal change observed. As a result, we quantify the persistence of the +submitted runs and see great potential in this evaluation method. + +
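+ Reciprocal Rank Fusion, one of the submitted systems above, merges several
+rankings by summing 1/(k + rank) over the input runs. A minimal sketch; the
+constant k=60 is the commonly used default, not necessarily the value used in
+the submitted runs.
+
+    from collections import defaultdict
+
+    def reciprocal_rank_fusion(rankings, k=60):
+        """rankings: list of ranked lists of doc ids, best first."""
+        scores = defaultdict(float)
+        for ranking in rankings:
+            for rank, doc in enumerate(ranking, start=1):
+                scores[doc] += 1.0 / (k + rank)
+        return sorted(scores, key=scores.get, reverse=True)
+
+    fused = reciprocal_rank_fusion([["d1", "d2", "d3"], ["d1", "d3", "d4"]])
+    print(fused)  # ['d1', 'd3', 'd2', 'd4']
+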
+
+ comment: To be published in Proceedings of the Working Notes of CLEF 2023 - + Conference and Labs of the Evaluation Forum, Thessaloniki, Greece 18 - 21, + 2023 +
+
+
+
+
+ + ☆ DPAN: Dynamic Preference-based and Attribute-aware Network for Relevant + Recommendations + + +
+ In e-commerce platforms, the relevant recommendation is a unique scenario +providing related items for a trigger item that users are interested in. +However, users' preferences for the similarity and diversity of recommendation +results are dynamic and vary under different conditions. Moreover, individual +item-level diversity is too coarse-grained since all recommended items are +related to the trigger item. Thus, the two main challenges are to learn +fine-grained representations of similarity and diversity and capture users' +dynamic preferences for them under different conditions. To address these +challenges, we propose a novel method called the Dynamic Preference-based and +Attribute-aware Network (DPAN) for predicting Click-Through Rate (CTR) in +relevant recommendations. Specifically, based on Attribute-aware Activation +Values Generation (AAVG), Bi-dimensional Compression-based Re-expression (BCR) +is designed to obtain similarity and diversity representations of user +interests and item information. Then Shallow and Deep Union-based Fusion (SDUF) +is proposed to capture users' dynamic preferences for the diverse degree of +recommendation results according to various conditions. DPAN has demonstrated +its effectiveness through extensive offline experiments and online A/B testing, +resulting in a significant 7.62% improvement in CTR. Currently, DPAN has been +successfully deployed on our e-commerce platform serving the primary traffic +for relevant recommendations. The code of DPAN has been made publicly +available. + +
+
+
+
+
+ + ☆ Single-User Injection for Invisible Shilling Attack against Recommender + Systems CIKM 2023 + + +
+ Recommendation systems (RS) are crucial for alleviating the information +overload problem. Due to its pivotal role in guiding users to make decisions, +unscrupulous parties are lured to launch attacks against RS to affect the +decisions of normal users and gain illegal profits. Among various types of +attacks, shilling attack is one of the most subsistent and profitable attacks. +In shilling attack, an adversarial party injects a number of well-designed fake +user profiles into the system to mislead RS so that the attack goal can be +achieved. Although existing shilling attack methods have achieved promising +results, they all adopt the attack paradigm of multi-user injection, where some +fake user profiles are required. This paper provides the first study of +shilling attack in an extremely limited scenario: only one fake user profile is +injected into the victim RS to launch shilling attacks (i.e., single-user +injection). We propose a novel single-user injection method SUI-Attack for +invisible shilling attack. SUI-Attack is a graph based attack method that +models shilling attack as a node generation task over the user-item bipartite +graph of the victim RS, and it constructs the fake user profile by generating +user features and edges that link the fake user to items. Extensive experiments +demonstrate that SUI-Attack can achieve promising attack results in single-user +injection. In addition to its attack power, SUI-Attack increases the +stealthiness of shilling attack and reduces the risk of being detected. We +provide our implementation at: https://github.com/KDEGroup/SUI-Attack. + +
+
+ comment: CIKM 2023. 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ FiBiNet++: Reducing Model Size by Low Rank Feature Interaction Layer for + CTR Prediction + + +
+ Click-Through Rate (CTR) estimation has become one of the most fundamental
+tasks in many real-world applications and various deep models have been
+proposed. Some research has shown that FiBiNet is one of the best-performing
+models and outperforms all other models on the Avazu dataset. However, the
+large model size of FiBiNet hinders its wider application. In this paper, we
+propose a novel FiBiNet++ model that redesigns FiBiNet's model structure,
+greatly reducing the model size while further improving its performance. One of
+the primary techniques involves our proposed "Low Rank Layer" focused on
+feature interaction, which serves as a crucial driver for achieving a superior
+compression ratio for models. Extensive experiments on three public datasets
+show that FiBiNet++ effectively reduces the non-embedding model parameters of
+FiBiNet by 12x to 16x on the three datasets. On the other hand, FiBiNet++ leads
+to significant performance improvements compared to state-of-the-art CTR
+methods, including FiBiNet.
+
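+ The "Low Rank Layer" above is, at heart, a bilinear feature interaction whose
+full weight matrix is replaced by a rank-r factorisation. A hedged PyTorch
+sketch of that general idea; the dimensions, the shared factorised weight, and
+the way interactions are flattened here are illustrative assumptions, not
+FiBiNet++'s exact architecture.
+
+    import torch
+    import torch.nn as nn
+
+    class LowRankBilinearInteraction(nn.Module):
+        """Pairwise field interaction with a shared low-rank weight W = U @ V."""
+
+        def __init__(self, emb_dim, rank):
+            super().__init__()
+            self.U = nn.Parameter(torch.randn(emb_dim, rank) * 0.01)
+            self.V = nn.Parameter(torch.randn(rank, emb_dim) * 0.01)
+
+        def forward(self, fields):                 # fields: (B, F, D) embeddings
+            projected = fields @ self.U @ self.V   # rank-r transform, still (B, F, D)
+            # Hadamard interaction of every field with every other field.
+            inter = projected.unsqueeze(2) * fields.unsqueeze(1)   # (B, F, F, D)
+            return inter.flatten(start_dim=1)      # concatenated interaction vector
+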
+
+
+
+
+ + ♻ ☆ OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text + Documents + + +
+ Large multimodal models trained on natural documents, which interleave images +and text, outperform models trained on image-text pairs on various multimodal +benchmarks. However, the datasets used to train these models have not been +released, and the collection process has not been fully specified. We introduce +the OBELICS dataset, an open web-scale filtered dataset of interleaved +image-text documents comprising 141 million web pages extracted from Common +Crawl, 353 million associated images, and 115 billion text tokens. We describe +the dataset creation process, present comprehensive filtering rules, and +provide an analysis of the dataset's content. To show the viability of OBELICS, +we train vision and language models of 9 and 80 billion parameters named +IDEFICS, and obtain competitive performance on different multimodal benchmarks. +We release our dataset, models and code. + +
+
+
+
+
+ + ♻ ☆ Meta-Learning with Adaptive Weighted Loss for Imbalanced Cold-Start + Recommendation CIKM 2023 + + +
+ Sequential recommenders have made great strides in capturing a user's +preferences. Nevertheless, the cold-start recommendation remains a fundamental +challenge as they typically involve limited user-item interactions for +personalization. Recently, gradient-based meta-learning approaches have emerged +in the sequential recommendation field due to their fast adaptation and +easy-to-integrate abilities. The meta-learning algorithms formulate the +cold-start recommendation as a few-shot learning problem, where each user is +represented as a task to be adapted. While meta-learning algorithms generally +assume that task-wise samples are evenly distributed over classes or values, +user-item interactions in real-world applications do not conform to such a +distribution (e.g., watching favorite videos multiple times, leaving only +positive ratings without any negative ones). Consequently, imbalanced user +feedback, which accounts for the majority of task training data, may dominate +the user adaptation process and prevent meta-learning algorithms from learning +meaningful meta-knowledge for personalized recommendations. To alleviate this +limitation, we propose a novel sequential recommendation framework based on +gradient-based meta-learning that captures the imbalanced rating distribution +of each user and computes adaptive loss for user-specific learning. Our work is +the first to tackle the impact of imbalanced ratings in cold-start sequential +recommendation scenarios. Through extensive experiments conducted on real-world +datasets, we demonstrate the effectiveness of our framework. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ A Profit-Maximizing Strategy for Advertising on the e-Commerce Platforms + + +
+ The online advertising management platform has become increasingly popular
+among e-commerce vendors/advertisers, offering a streamlined approach to reach
+target customers. Despite its advantages, configuring advertising strategies
+correctly remains a challenge for online vendors, particularly those with
+limited resources. Ineffective strategies often result in a surge of
+unproductive ``just looking'' clicks, leading to disproportionately high
+advertising expenses compared to the growth of sales. In this paper, we
+present a novel profit-maximizing strategy for selecting targeting options in
+online advertising. The proposed model aims to find the optimal set of features
+to maximize the probability of converting targeted audiences into actual
+buyers. We address the optimization challenge by reformulating it as a
+multiple-choice knapsack problem (MCKP). We conduct an empirical study
+featuring real-world data from Tmall to show that our proposed method can
+effectively optimize the advertising strategy under budgetary constraints.
+
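+ In a multiple-choice knapsack, exactly one option is picked from each group
+(e.g., one setting per targeting feature) to maximise expected profit under a
+budget. A small dynamic-programming sketch with integer costs; the numbers are
+placeholders, not figures from the Tmall study.
+
+    def mckp(groups, budget):
+        """groups: list of lists of (cost, profit); pick one item per group."""
+        NEG = float("-inf")
+        dp = [NEG] * (budget + 1)
+        dp[0] = 0.0
+        for group in groups:
+            nxt = [NEG] * (budget + 1)
+            for b in range(budget + 1):
+                if dp[b] == NEG:
+                    continue
+                for cost, profit in group:
+                    if b + cost <= budget:
+                        nxt[b + cost] = max(nxt[b + cost], dp[b] + profit)
+            dp = nxt
+        return max(dp)
+
+    # Two targeting features, two options each, budget of 6 cost units.
+    print(mckp([[(2, 3.0), (3, 4.5)], [(1, 1.0), (4, 6.0)]], budget=6))  # 9.0
+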
+
+ comment: Online advertising campaigns +
+
+
+
+
+ + ♻ ☆ RECOMED: A Comprehensive Pharmaceutical Recommendation System + + +
+ A comprehensive pharmaceutical recommendation system was designed based on
+the patient and drug features extracted from Drugs.com and Druglib.com. First,
+data from these databases were combined, and a dataset of patient and drug
+information was built. Secondly, the patients and drugs were clustered, and the
+recommendation was then performed using the different ratings provided by
+patients and, importantly, the knowledge obtained from patient and drug
+specifications, while also considering drug interactions. To the best of our
+knowledge, we are the first group to consider patients' conditions and history
+in the proposed approach for selecting a specific medicine appropriate for a
+particular user. Our approach applies artificial intelligence (AI) models for
+the implementation. Sentiment analysis using natural language processing
+approaches is employed in pre-processing, along with neural network-based
+methods and recommender system algorithms for modeling the system. In our work,
+patients' conditions and drug features are used to build two models based on
+matrix factorization. Then we used drug interaction information to filter out
+drugs with severe or mild interactions with other drugs. We developed a deep
+learning model for recommending drugs by using data from 2304 patients as a
+training set, and data from 660 patients as our validation set. After that, we
+combined the model's outcome with critical knowledge about drugs in a
+knowledge-based system, using rules obtained from constraints on taking
+medicine.
+
+
+ comment: 39 pages, 14 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ Differentiable Retrieval Augmentation via Generative Language Modeling + for E-commerce Query Intent Classification CIKM2023 + + +
+ Retrieval augmentation, which enhances downstream models with a knowledge
+retriever and an external corpus instead of merely increasing the number of
+model parameters, has been successfully applied to many natural language
+processing (NLP) tasks such as text classification, question answering and so
+on. However, existing methods train the retriever and the downstream model
+separately or asynchronously, mainly due to the non-differentiability between
+the two parts, and usually lead to degraded performance compared to end-to-end
+joint training. In this paper, we propose Differentiable Retrieval Augmentation
+via Generative lANguage modeling (Dragan) to address this problem through a
+novel differentiable reformulation. We demonstrate the effectiveness of our
+proposed method on a challenging NLP task in e-commerce search, namely query
+intent classification. Both the experimental results and the ablation study
+show that the proposed method significantly improves the state-of-the-art
+baselines on both offline evaluation and an online A/B test.
+
+
+ comment: 5 pages, 2 figures; accepted by CIKM2023 +
+
+
+
+
+ + ♻ ☆ Optimal Bandwidth Selection for DENCLUE Algorithm + + +
+ In modern industry, clustering algorithms are part of the daily routine of
+algorithm engineers. Although clustering algorithms experienced rapid growth
+before 2010, innovation related to the research topic has stagnated since deep
+learning became the de facto industrial standard for machine learning
+applications. In 2007, a density-based clustering algorithm named DENCLUE was
+invented to solve the clustering problem for nonlinear data structures.
+However, its parameter selection problem was largely neglected until 2011. In
+this paper, we propose a new approach to compute the optimal parameters for the
+DENCLUE algorithm, and discuss its performance in the experiment section.
+
+
+
+
+
+ + ♻ ☆ Adaptive Preferential Attached kNN Graph with Distribution-Awareness + + +
+ Graph-based kNN algorithms have garnered widespread popularity for machine +learning tasks due to their simplicity and effectiveness. However, as factual +data often inherit complex distributions, the conventional kNN graph's reliance +on a unified k-value can hinder its performance. A crucial factor behind this +challenge is the presence of ambiguous samples along decision boundaries that +are inevitably more prone to incorrect classifications. To address the +situation, we propose the Preferential Attached k-Nearest Neighbors Graph +(paNNG), which adopts distribution-aware adaptive-k into graph construction. By +incorporating distribution information as a cohesive entity, paNNG can +significantly improve performance on ambiguous samples by "pulling" them +towards their original classes and hence enhance overall generalization +capability. Through rigorous evaluations on diverse datasets, paNNG outperforms +state-of-the-art algorithms, showcasing its adaptability and efficacy across +various real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ Task Relation-aware Continual User Representation Learning KDD 2023 + + +
+ User modeling, which learns to represent users in a low-dimensional representation space based on their past behaviors, has received a surge of interest from industry for providing personalized services to users. Previous efforts in user modeling mainly focus on learning a task-specific user representation that is designed for a single task. However, since learning task-specific user representations for every task is infeasible, recent studies introduce the concept of universal user representation, which is a more generalized representation of a user that is relevant to a variety of tasks. Despite their effectiveness, existing approaches for learning universal user representations are impractical in real-world applications due to their data requirements, catastrophic forgetting, and a limited capability for learning continually added tasks. In this paper, we propose a novel continual user representation learning method, called TERACON, whose learning capability is not limited as the number of learned tasks increases, while capturing the relationships between the tasks. The main idea is to introduce an embedding for each task, i.e., a task embedding, which is utilized to generate task-specific soft masks that not only allow the entire model parameters to be updated until the end of the training sequence, but also facilitate the relationships between the tasks to be captured. Moreover, we introduce a novel knowledge retention module with a pseudo-labeling strategy that successfully alleviates the long-standing problem of continual learning, i.e., catastrophic forgetting. Extensive experiments on public and proprietary real-world datasets demonstrate the superiority and practicality of TERACON. Our code is available at https://github.com/Sein-Kim/TERACON.
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ Regression Compatible Listwise Objectives for Calibrated Ranking with + Binary Relevance + + +
+ As Learning-to-Rank (LTR) approaches primarily seek to improve ranking +quality, their output scores are not scale-calibrated by design. This +fundamentally limits LTR usage in score-sensitive applications. Though a simple +multi-objective approach that combines a regression and a ranking objective can +effectively learn scale-calibrated scores, we argue that the two objectives are +not necessarily compatible, which makes the trade-off less ideal for either of +them. In this paper, we propose a practical regression compatible ranking (RCR) +approach that achieves a better trade-off, where the two ranking and regression +components are proved to be mutually aligned. Although the same idea applies to +ranking with both binary and graded relevance, we mainly focus on binary labels +in this paper. We evaluate the proposed approach on several public LTR +benchmarks and show that it consistently achieves either best or competitive +result in terms of both regression and ranking metrics, and significantly +improves the Pareto frontiers in the context of multi-objective optimization. +Furthermore, we evaluated the proposed approach on YouTube Search and found +that it not only improved the ranking quality of the production pCTR model, but +also brought gains to the click prediction accuracy. The proposed approach has +been successfully deployed in the YouTube production system. + +
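For context, a naive multi-objective baseline of the kind the abstract argues against can be written as a weighted sum of a pointwise sigmoid cross-entropy (calibration) term and a listwise softmax cross-entropy (ranking) term. The sketch below shows that baseline only; the paper's compatible RCR formulation is not given in the abstract and is not reproduced here.

# Naive multi-objective calibrated-ranking loss: pointwise BCE + listwise softmax CE.
# This is an illustrative baseline, not the paper's RCR objective.
import torch
import torch.nn.functional as F

def calibrated_ranking_loss(scores, labels, alpha=0.5):
    """scores, labels: (batch, list_size) tensors with binary relevance labels."""
    regression = F.binary_cross_entropy_with_logits(scores, labels.float())
    log_probs = F.log_softmax(scores, dim=1)
    target = labels.float() / labels.float().sum(dim=1, keepdim=True).clamp(min=1.0)
    ranking = -(target * log_probs).sum(dim=1).mean()
    return alpha * regression + (1.0 - alpha) * ranking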
+
+
+
+
+
+
+
+ + Machine Learning 162 + +
+
+
+ + ☆ Structured World Models from Human Videos + + +
+ We tackle the problem of learning complex, general behaviors directly in the +real world. We propose an approach for robots to efficiently learn manipulation +skills using only a handful of real-world interaction trajectories from many +different settings. Inspired by the success of learning from large-scale +datasets in the fields of computer vision and natural language, our belief is +that in order to efficiently learn, a robot must be able to leverage +internet-scale, human video data. Humans interact with the world in many +interesting ways, which can allow a robot to not only build an understanding of +useful actions and affordances but also how these actions affect the world for +manipulation. Our approach builds a structured, human-centric action space +grounded in visual affordances learned from human videos. Further, we train a +world model on human videos and fine-tune on a small amount of robot +interaction data without any task supervision. We show that this approach of +affordance-space world models enables different robots to learn various +manipulation skills in complex settings, in under 30 minutes of interaction. +Videos can be found at https://human-world-model.github.io + +
+
+ comment: RSS 2023. Website at https://human-world-model.github.io +
+
+
+
+
+ + ☆ Unlocking Accuracy and Fairness in Differentially Private Image + Classification + + +
+ Privacy-preserving machine learning aims to train models on private data without leaking sensitive information. Differential privacy (DP) is considered the gold standard framework for privacy-preserving training, as it provides formal privacy guarantees. However, compared to their non-private counterparts, models trained with DP often have significantly reduced accuracy. Private classifiers are also believed to exhibit larger performance disparities across subpopulations, raising fairness concerns. The poor performance of classifiers trained with DP has prevented the widespread adoption of privacy-preserving machine learning in industry. Here we show that pre-trained foundation models fine-tuned with DP can achieve similar accuracy to non-private classifiers, even in the presence of significant distribution shifts between pre-training data and downstream tasks. We achieve private accuracies within a few percent of the non-private state of the art across four datasets, including two medical imaging benchmarks. Furthermore, our private medical classifiers do not exhibit larger performance disparities across demographic groups than non-private models. This milestone towards making DP training a practical and reliable technology has the potential to enable machine learning practitioners to train safely on sensitive datasets while protecting individuals' privacy.
+
+
+
+
+ + ☆ Analyzing Transformer Dynamics as Movement through Embedding Space + + +
+ Transformer language models exhibit intelligent behaviors such as understanding natural language, recognizing patterns, acquiring knowledge, reasoning, planning, reflecting and using tools. This paper explores how their underlying mechanics give rise to intelligent behaviors. We adopt a systems approach to analyze Transformers in detail and develop a mathematical framework that frames their dynamics as movement through embedding space. This novel perspective provides a principled way of thinking about the problem and reveals important insights related to the emergence of intelligence:
 1. At its core the Transformer is an Embedding Space walker, mapping intelligent behavior to trajectories in this vector space.
 2. At each step of the walk, it composes context into a single composite vector whose location in Embedding Space defines the next step.
 3. No learning actually occurs during decoding; in-context learning and generalization are simply the result of different contexts composing into different vectors.
 4. Ultimately the knowledge, intelligence and skills exhibited by the model are embodied in the organization of vectors in Embedding Space rather than in specific neurons or layers. These abilities are properties of this organization.
 5. Attention's contribution boils down to the association bias it lends to vector composition, which influences the aforementioned organization. However, more investigation is needed to ascertain its significance.
 6. The entire model is composed of two principal operations: data-independent filtering and data-dependent aggregation. This generalization unifies Transformers with other sequence models and across modalities.
 Building upon this foundation we formalize and test a semantic space theory which posits that embedding vectors represent semantic concepts, and find some evidence of its validity.
+
+
+
+
+ + ☆ Majorana Demonstrator Data Release for AI/ML Applications + + +
+ The enclosed data release consists of a subset of the calibration data from +the Majorana Demonstrator experiment. Each Majorana event is accompanied by raw +Germanium detector waveforms, pulse shape discrimination cuts, and calibrated +final energies, all shared in an HDF5 file format along with relevant metadata. +This release is specifically designed to support the training and testing of +Artificial Intelligence (AI) and Machine Learning (ML) algorithms upon our +data. This document is structured as follows. Section I provides an overview of +the dataset's content and format; Section II outlines the location of this +dataset and the method for accessing it; Section III presents the NPML Machine +Learning Challenge associated with this dataset; Section IV contains a +disclaimer from the Majorana collaboration regarding the use of this dataset; +Appendix A contains technical details of this data release. Please direct +questions about the material provided within this release to liaobo77@ucsd.edu +(A. Li). + +
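Since the release ships HDF5 files, a minimal loading sketch with h5py is shown below. The file path and the dataset keys ("raw_waveform", "calibrated_energy") are placeholders; the release's actual keys are documented in the accompanying data-release notes, not in this abstract.

# Minimal sketch of inspecting and loading an HDF5 data release with h5py.
# Path and dataset names are hypothetical placeholders.
import h5py
import numpy as np

with h5py.File("majorana_calibration.hdf5", "r") as f:   # placeholder path
    f.visit(print)                                        # list available groups/datasets
    # Substitute the keys printed above; the two below are hypothetical.
    waveforms = np.asarray(f["raw_waveform"])
    energies = np.asarray(f["calibrated_energy"])

print(waveforms.shape, energies.shape)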
+
+ comment: Zenodo DOI: 10.5281/zenodo.8257027 +
+
+
+
+
+ + ☆ Evaluating quantum generative models via imbalanced data classification + benchmarks + + +
+ A limited set of tools exist for assessing whether the behavior of quantum +machine learning models diverges from conventional models, outside of abstract +or theoretical settings. We present a systematic application of explainable +artificial intelligence techniques to analyze synthetic data generated from a +hybrid quantum-classical neural network adapted from twenty different +real-world data sets, including solar flares, cardiac arrhythmia, and speech +data. Each of these data sets exhibits varying degrees of complexity and class +imbalance. We benchmark the quantum-generated data relative to state-of-the-art +methods for mitigating class imbalance for associated classification tasks. We +leverage this approach to elucidate the qualities of a problem that make it +more or less likely to be amenable to a hybrid quantum-classical generative +model. + +
+
+
+
+
+ + ☆ Real World Time Series Benchmark Datasets with Distribution Shifts: + Global Crude Oil Price and Volatility IJCAI 2023 + + +
+ The scarcity of task-labeled time-series benchmarks in the financial domain +hinders progress in continual learning. Addressing this deficit would foster +innovation in this area. Therefore, we present COB, Crude Oil Benchmark +datasets. COB includes 30 years of asset prices that exhibit significant +distribution shifts and optimally generates corresponding task (i.e., regime) +labels based on these distribution shifts for the three most important crude +oils in the world. Our contributions include creating real-world benchmark +datasets by transforming asset price data into volatility proxies, fitting +models using expectation-maximization (EM), generating contextual task labels +that align with real-world events, and providing these labels as well as the +general algorithm to the public. We show that the inclusion of these task +labels universally improves performance on four continual learning algorithms, +some state-of-the-art, over multiple forecasting horizons. We hope these +benchmarks accelerate research in handling distribution shifts in real-world +data, especially due to the global importance of the assets considered. We've +made the (1) raw price data, (2) task labels generated by our approach, (3) and +code for our algorithm available at https://oilpricebenchmarks.github.io. + +
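A minimal sketch of the kind of pipeline described above: turn prices into a rolling-volatility proxy and fit a Gaussian mixture with EM to assign regime labels. The window length, number of regimes, and the use of scikit-learn's GaussianMixture are assumptions, not the benchmark's exact procedure.

# Sketch: EM-fitted Gaussian mixture over a rolling-volatility proxy to label regimes.
# Window, regime count, and model choice are illustrative assumptions.
import numpy as np
from sklearn.mixture import GaussianMixture

def regime_labels(prices, window=21, n_regimes=3, seed=0):
    log_ret = np.diff(np.log(prices))
    # Rolling standard deviation of log returns as a simple volatility proxy.
    vol = np.array([log_ret[i - window:i].std() for i in range(window, len(log_ret))])
    gm = GaussianMixture(n_components=n_regimes, random_state=seed).fit(vol.reshape(-1, 1))
    return gm.predict(vol.reshape(-1, 1))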
+
+ comment: 7 pages, 5 figures. Awarded Best Paper Runner Up / Honorable Mention + and presented as Contributed Talk at IJCAI 2023, the 32nd International Joint + Conference on Artificial Intelligence (AI4TS) +
+
+
+
+
+ + ☆ Neural Networks Optimizations Against Concept and Data Drift in Malware + Detection + + +
+ Despite the promising results of machine learning models in malware detection, they face the problem of concept drift due to the constant evolution of malware. This leads to a decline in performance over time, as the data distribution of new files differs from that of the training data, requiring regular model updates. In this work, we propose a model-agnostic protocol to improve a baseline neural network so that it handles the drift problem. We show the importance of feature reduction and of training with the most recent validation set possible, and propose a loss function named Drift-Resilient Binary Cross-Entropy, an improvement over the classical Binary Cross-Entropy that is more effective against drift. We train our model on the EMBER dataset (2018) and evaluate it on a dataset of recent malicious files collected between 2020 and 2023. Our improved model shows promising results, detecting 15.2% more malware than a baseline model.
+
+
+
+
+ + ☆ Graph Neural Bandits KDD 2023 + + +
+ Contextual bandit algorithms aim to choose the optimal arm with the highest reward out of a set of candidates based on contextual information. Various bandit algorithms have been applied to real-world applications due to their ability to tackle the exploitation-exploration dilemma. Motivated by online recommendation scenarios, in this paper we propose a framework named Graph Neural Bandits (GNB) to leverage the collaborative nature among users empowered by graph neural networks (GNNs). Instead of estimating rigid user clusters as in existing works, we model the "fine-grained" collaborative effects through estimated user graphs in terms of exploitation and exploration respectively. Then, to refine the recommendation strategy, we utilize separate GNN-based models on the estimated user graphs for exploitation and adaptive exploration. Theoretical analysis and experimental results on multiple real data sets, in comparison with state-of-the-art baselines, are provided to demonstrate the effectiveness of our proposed framework.
+
+ comment: Accepted to SIGKDD 2023 +
+
+
+
+
+ + ☆ DynED: Dynamic Ensemble Diversification in Data Stream Classification CIKM '23 + + +
+ Ensemble methods are commonly used in classification due to their remarkable performance. Achieving high accuracy in a data stream environment is a challenging task given the disruptive changes in the data distribution, also known as concept drift. Greater diversity of ensemble components is known to enhance prediction accuracy in such settings. Despite the diversity of components within an ensemble, not all contribute as expected to its overall performance. This necessitates a method for selecting components that exhibit high performance and diversity. We present a novel ensemble construction and maintenance approach based on MMR (Maximal Marginal Relevance) that dynamically combines the diversity and prediction accuracy of components during the process of structuring an ensemble. The experimental results on four real and 11 synthetic datasets demonstrate that the proposed approach (DynED) provides a higher average mean accuracy compared to five state-of-the-art baselines.
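A generic MMR selection routine of the kind the abstract builds on is sketched below; the accuracy and similarity inputs, the trade-off weight, and the selection size are illustrative assumptions rather than DynED's exact procedure.

# Generic MMR-style selection of ensemble components: trade each candidate's
# accuracy against its similarity to the members already selected.
import numpy as np

def mmr_select(accuracy, similarity, k, lam=0.7):
    """accuracy: (n,) array of scores; similarity: (n, n) pairwise similarity in [0, 1]."""
    selected = [int(np.argmax(accuracy))]
    candidates = set(range(len(accuracy))) - set(selected)
    while len(selected) < k and candidates:
        def mmr(i):
            redundancy = max(similarity[i, j] for j in selected)
            return lam * accuracy[i] - (1.0 - lam) * redundancy
        best = max(candidates, key=mmr)
        selected.append(best)
        candidates.remove(best)
    return selected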
+
+ comment: Proceedings of the 32nd ACM International Conference on Information + and Knowledge Management (CIKM '23), October 21--25, 2023, Birmingham, United + Kingdom +
+
+
+
+
+ + ☆ Differentiable Frank-Wolfe Optimization Layer + + +
+ Differentiable optimization has received a significant amount of attention due to its foundational role in the domain of machine learning based on neural networks. Existing methods leverage the optimality conditions and the implicit function theorem to obtain the Jacobian matrix of the output, which increases the computational cost and limits the application of differentiable optimization. In addition, some non-differentiable constraints lead to further challenges when using prior differentiable optimization layers. This paper proposes a differentiable layer, named the Differentiable Frank-Wolfe Layer (DFWLayer), obtained by rolling out the Frank-Wolfe method, a well-known optimization algorithm that can solve constrained optimization problems without projections or Hessian matrix computations, thus leading to an efficient way of dealing with large-scale problems. Theoretically, we establish a bound on the suboptimality gap of the DFWLayer in the context of l1-norm constraints. Experimental assessments demonstrate that the DFWLayer not only attains competitive accuracy in solutions and gradients but also consistently adheres to constraints. Moreover, it surpasses the baselines in both forward and backward computational speeds.
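For readers unfamiliar with the underlying optimizer, the sketch below shows a plain Frank-Wolfe loop for an l1-norm ball constraint, i.e., the projection-free iteration that a layer like DFWLayer unrolls; differentiating through the unrolled iterations is omitted, and the example problem is hypothetical.

# Plain Frank-Wolfe for min f(x) subject to ||x||_1 <= radius, using the
# closed-form linear minimization oracle of the l1 ball (no projections).
import numpy as np

def frank_wolfe_l1(grad_f, dim, radius=1.0, steps=100):
    x = np.zeros(dim)
    for t in range(steps):
        g = grad_f(x)
        # Linear minimization oracle over the l1 ball: a signed vertex.
        i = int(np.argmax(np.abs(g)))
        s = np.zeros(dim)
        s[i] = -radius * np.sign(g[i])
        gamma = 2.0 / (t + 2.0)          # standard open-loop step size
        x = (1.0 - gamma) * x + gamma * s
    return x

# Example: minimize ||Ax - b||^2 over the l1 ball (data are random placeholders).
A = np.random.default_rng(0).standard_normal((20, 10))
b = np.random.default_rng(1).standard_normal(20)
x_hat = frank_wolfe_l1(lambda x: 2.0 * A.T @ (A @ x - b), dim=10, radius=2.0)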
+
+
+
+
+ + ☆ Stabilizing Unsupervised Environment Design with a Learned Adversary + + +
+ A key challenge in training generally-capable agents is the design of +training tasks that facilitate broad generalization and robustness to +environment variations. This challenge motivates the problem setting of +Unsupervised Environment Design (UED), whereby a student agent trains on an +adaptive distribution of tasks proposed by a teacher agent. A pioneering +approach for UED is PAIRED, which uses reinforcement learning (RL) to train a +teacher policy to design tasks from scratch, making it possible to directly +generate tasks that are adapted to the agent's current capabilities. Despite +its strong theoretical backing, PAIRED suffers from a variety of challenges +that hinder its practical performance. Thus, state-of-the-art methods currently +rely on curation and mutation rather than generation of new tasks. In this +work, we investigate several key shortcomings of PAIRED and propose solutions +for each shortcoming. As a result, we make it possible for PAIRED to match or +exceed state-of-the-art methods, producing robust agents in several established +challenging procedurally-generated environments, including a partially-observed +maze navigation task and a continuous-control car racing environment. We +believe this work motivates a renewed emphasis on UED methods based on learned +models that directly generate challenging environments, potentially unlocking +more open-ended RL training and, as a result, more general agents. + +
+
+ comment: CoLLAs 2023 - Oral; Minqi and Jack contributed equally +
+
+
+
+
+ + ☆ MGMAE: Motion Guided Masking for Video Masked Autoencoding ICCV 2023 + + +
+ Masked autoencoding has shown excellent performance on self-supervised video +representation learning. Temporal redundancy has led to a high masking ratio +and customized masking strategy in VideoMAE. In this paper, we aim to further +improve the performance of video masked autoencoding by introducing a motion +guided masking strategy. Our key insight is that motion is a general and unique +prior in video, which should be taken into account during masked pre-training. +Our motion guided masking explicitly incorporates motion information to build +temporal consistent masking volume. Based on this masking volume, we can track +the unmasked tokens in time and sample a set of temporal consistent cubes from +videos. These temporal aligned unmasked tokens will further relieve the +information leakage issue in time and encourage the MGMAE to learn more useful +structure information. We implement our MGMAE with an online efficient optical +flow estimator and backward masking map warping strategy. We perform +experiments on the datasets of Something-Something V2 and Kinetics-400, +demonstrating the superior performance of our MGMAE to the original VideoMAE. +In addition, we provide the visualization analysis to illustrate that our MGMAE +can sample temporal consistent cubes in a motion-adaptive manner for more +effective video pre-training. + +
+
+ comment: ICCV 2023 camera-ready version +
+
+
+
+
+ + ☆ Instruction Tuning for Large Language Models: A Survey + + +
+ This paper surveys research works in the quickly advancing field of instruction tuning (IT), a crucial technique to enhance the capabilities and controllability of large language models (LLMs). Instruction tuning refers to the process of further training LLMs on a dataset consisting of \textsc{(instruction, output)} pairs in a supervised fashion, which bridges the gap between the next-word prediction objective of LLMs and the users' objective of having LLMs adhere to human instructions. In this work, we make a systematic review of the literature, including the general methodology of IT, the construction of IT datasets, the training of IT models, and applications to different modalities and domains, along with an analysis of aspects that influence the outcome of IT (e.g., generation of instruction outputs, size of the instruction dataset, etc.). We also review the potential pitfalls of IT and criticism against it, point out current deficiencies of existing strategies, and suggest some avenues for fruitful research.
+
+ comment: A Survey paper, Pre-print +
+
+
+
+
+ + ☆ Zero- and Few-Shot Prompting with LLMs: A Comparative Study with + Fine-tuned Models for Bangla Sentiment Analysis + + +
+ The rapid expansion of the digital world has propelled sentiment analysis into a critical tool across diverse sectors such as marketing, politics, customer service, and healthcare. While there have been significant advancements in sentiment analysis for widely spoken languages, low-resource languages, such as Bangla, remain largely under-researched due to resource constraints. Furthermore, the recent unprecedented performance of Large Language Models (LLMs) in various applications highlights the need to evaluate them in the context of low-resource languages. In this study, we present a sizeable manually annotated dataset encompassing 33,605 Bangla news tweets and Facebook comments. We also investigate zero- and few-shot in-context learning with several language models, including Flan-T5, GPT-4, and Bloomz, offering a comparative analysis against fine-tuned models. Our findings suggest that monolingual transformer-based models consistently outperform other models, even in zero- and few-shot scenarios. To foster continued exploration, we intend to make this dataset and our research tools publicly available to the broader research community.
+
+ comment: Zero-Shot Prompting, Few-Shot Prompting, LLMs, Comparative Study, + Fine-tuned Models, Bangla, Sentiment Analysis +
+
+
+
+
+ + ☆ Sparse Linear Concept Discovery Models ICCV + + +
+ The recent mass adoption of DNNs, even in safety-critical scenarios, has shifted the focus of the research community towards the creation of inherently interpretable models. Concept Bottleneck Models (CBMs) constitute a popular approach where hidden layers are tied to human-understandable concepts, allowing for investigation and correction of the network's decisions. However, CBMs usually suffer from: (i) performance degradation and (ii) lower interpretability than intended due to the sheer number of concepts contributing to each decision. In this work, we propose a simple yet highly intuitive interpretable framework based on Contrastive Language-Image models and a single sparse linear layer. In stark contrast to related approaches, the sparsity in our framework is achieved via principled Bayesian arguments by inferring concept presence via a data-driven Bernoulli distribution. As we experimentally show, our framework not only outperforms recent CBM approaches accuracy-wise, but it also yields high per-example concept sparsity, facilitating the individual investigation of the emerging concepts.
+
+ comment: Accepted @ ICCVW CLVL 2023 +
+
+
+
+
+ + ☆ Mixed-Integer Projections for Automated Data Correction of EMRs Improve + Predictions of Sepsis among Hospitalized Patients + + +
+ Machine learning (ML) models are increasingly pivotal in automating clinical +decisions. Yet, a glaring oversight in prior research has been the lack of +proper processing of Electronic Medical Record (EMR) data in the clinical +context for errors and outliers. Addressing this oversight, we introduce an +innovative projections-based method that seamlessly integrates clinical +expertise as domain constraints, generating important meta-data that can be +used in ML workflows. In particular, by using high-dimensional mixed-integer +programs that capture physiological and biological constraints on patient +vitals and lab values, we can harness the power of mathematical "projections" +for the EMR data to correct patient data. Consequently, we measure the distance +of corrected data from the constraints defining a healthy range of patient +data, resulting in a unique predictive metric we term as "trust-scores". These +scores provide insight into the patient's health status and significantly boost +the performance of ML classifiers in real-life clinical settings. We validate +the impact of our framework in the context of early detection of sepsis using +ML. We show an AUROC of 0.865 and a precision of 0.922, that surpasses +conventional ML models without such projections. + +
+
+
+
+
+ + ☆ Spear and Shield: Adversarial Attacks and Defense Methods for + Model-Based Link Prediction on Continuous-Time Dynamic Graphs + + +
+ Real-world graphs are dynamic, constantly evolving with new interactions, +such as financial transactions in financial networks. Temporal Graph Neural +Networks (TGNNs) have been developed to effectively capture the evolving +patterns in dynamic graphs. While these models have demonstrated their +superiority, being widely adopted in various important fields, their +vulnerabilities against adversarial attacks remain largely unexplored. In this +paper, we propose T-SPEAR, a simple and effective adversarial attack method for +link prediction on continuous-time dynamic graphs, focusing on investigating +the vulnerabilities of TGNNs. Specifically, before the training procedure of a +victim model, which is a TGNN for link prediction, we inject edge perturbations +to the data that are unnoticeable in terms of the four constraints we propose, +and yet effective enough to cause malfunction of the victim model. Moreover, we +propose a robust training approach T-SHIELD to mitigate the impact of +adversarial attacks. By using edge filtering and enforcing temporal smoothness +to node embeddings, we enhance the robustness of the victim model. Our +experimental study shows that T-SPEAR significantly degrades the victim model's +performance on link prediction tasks, and even more, our attacks are +transferable to other TGNNs, which differ from the victim model assumed by the +attacker. Moreover, we demonstrate that T-SHIELD effectively filters out +adversarial edges and exhibits robustness against adversarial attacks, +surpassing the link prediction performance of the naive TGNN by up to 11.2% +under T-SPEAR. + +
+
+
+
+
+ + ☆ A Modular and Adaptive System for Business Email Compromise Detection + + +
+ The growing sophistication of Business Email Compromise (BEC) and spear +phishing attacks poses significant challenges to organizations worldwide. The +techniques featured in traditional spam and phishing detection are insufficient +due to the tailored nature of modern BEC attacks as they often blend in with +the regular benign traffic. Recent advances in machine learning, particularly +in Natural Language Understanding (NLU), offer a promising avenue for combating +such attacks but in a practical system, due to limitations such as data +availability, operational costs, verdict explainability requirements or a need +to robustly evolve the system, it is essential to combine multiple approaches +together. We present CAPE, a comprehensive and efficient system for BEC +detection that has been proven in a production environment for a period of over +two years. Rather than being a single model, CAPE is a system that combines +independent ML models and algorithms detecting BEC-related behaviors across +various email modalities such as text, images, metadata and the email's +communication context. This decomposition makes CAPE's verdicts naturally +explainable. In the paper, we describe the design principles and constraints +behind its architecture, as well as the challenges of model design, evaluation +and adapting the system continuously through a Bayesian approach that combines +limited data with domain knowledge. Furthermore, we elaborate on several +specific behavioral detectors, such as those based on Transformer neural +architectures. + +
+
+
+
+
+ + ☆ GBM-based Bregman Proximal Algorithms for Constrained Learning + + +
+ As the complexity of learning tasks surges, modern machine learning encounters a new constrained learning paradigm characterized by more intricate and data-driven function constraints. Prominent applications include Neyman-Pearson classification (NPC) and fairness classification, which entail specific risk constraints that render standard projection-based training algorithms unsuitable. Gradient boosting machines (GBMs) are among the most popular algorithms for supervised learning; however, they are generally limited to unconstrained settings. In this paper, we adapt the GBM for constrained learning tasks within the framework of Bregman proximal algorithms. We introduce a new Bregman primal-dual method with a global optimality guarantee when the learning objective and constraint functions are convex. In cases of nonconvex functions, we demonstrate how our algorithm remains effective under a Bregman proximal point framework. Distinct from existing constrained learning algorithms, ours possess a unique advantage in their ability to seamlessly integrate with publicly available GBM implementations such as XGBoost (Chen and Guestrin, 2016) and LightGBM (Ke et al., 2017), exclusively relying on their public interfaces. We provide substantial experimental evidence to showcase the effectiveness of the Bregman algorithm framework. While our primary focus is on NPC and fairness ML, our framework holds significant potential for a broader range of constrained learning applications. The source code is currently freely available at https://github.com/zhenweilin/ConstrainedGBM.
+
+
+
+
+ + ☆ To Whom are You Talking? A Deep Learning Model to Endow Social Robots + with Addressee Estimation Skills IJCNN + + +
+ Communication shapes our social world. For a robot to be considered social and consequently be integrated into our social environment, it is fundamental to understand some of the dynamics that govern human-human communication. In this work, we tackle the problem of Addressee Estimation, the ability to understand an utterance's addressee, by interpreting and exploiting non-verbal bodily cues from the speaker. We do so by implementing a hybrid deep learning model composed of convolutional layers and LSTM cells that takes as input images portraying the face of the speaker and 2D vectors of the speaker's body posture. Our implementation choices were guided by the aim to develop a model that could be deployed on social robots and be efficient in ecological scenarios. We demonstrate that our model is able to solve the Addressee Estimation problem in terms of addressee localisation in space, from a robot ego-centric point of view.
+
+ comment: Accepted version of a paper published at 2023 International Joint + Conference on Neural Networks (IJCNN). Please find the published version and + info to cite the paper at https://doi.org/10.1109/IJCNN54540.2023.10191452 . + 10 pages, 8 Figures, 3 Tables +
+
+
+
+
+ + ☆ On the Adversarial Robustness of Multi-Modal Foundation Models ICCV + + +
+ Multi-modal foundation models combining vision and language models such as Flamingo or GPT-4 have recently gained enormous interest. Alignment of foundation models is used to prevent models from providing toxic or harmful output. While malicious users have successfully tried to jailbreak foundation models, an equally important question is whether honest users could be harmed by malicious third-party content. In this paper we show that imperceptible attacks on images, crafted to change the caption output of a multi-modal foundation model, can be used by malicious content providers to harm honest users, e.g. by guiding them to malicious websites or broadcasting fake information. This indicates that countermeasures to adversarial attacks should be used by any deployed multi-modal foundation model.
+
+ comment: ICCV AROW 2023 +
+
+
+
+
+ + ☆ We Don't Need No Adam, All We Need Is EVE: On The Variance of Dual + Learning Rate And Beyond + + +
+ In the rapidly advancing field of deep learning, optimising deep neural +networks is paramount. This paper introduces a novel method, Enhanced Velocity +Estimation (EVE), which innovatively applies different learning rates to +distinct components of the gradients. By bifurcating the learning rate, EVE +enables more nuanced control and faster convergence, addressing the challenges +associated with traditional single learning rate approaches. Utilising a +momentum term that adapts to the learning landscape, the method achieves a more +efficient navigation of the complex loss surface, resulting in enhanced +performance and stability. Extensive experiments demonstrate that EVE +significantly outperforms existing optimisation techniques across various +benchmark datasets and architectures. + +
+
+
+
+
+ + ☆ UGSL: A Unified Framework for Benchmarking Graph Structure Learning + + +
+ Graph neural networks (GNNs) demonstrate outstanding performance in a broad +range of applications. While the majority of GNN applications assume that a +graph structure is given, some recent methods substantially expanded the +applicability of GNNs by showing that they may be effective even when no graph +structure is explicitly provided. The GNN parameters and a graph structure are +jointly learned. Previous studies adopt different experimentation setups, +making it difficult to compare their merits. In this paper, we propose a +benchmarking strategy for graph structure learning using a unified framework. +Our framework, called Unified Graph Structure Learning (UGSL), reformulates +existing models into a single model. We implement a wide range of existing +models in our framework and conduct extensive analyses of the effectiveness of +different components in the framework. Our results provide a clear and concise +understanding of the different methods in this area as well as their strengths +and weaknesses. The benchmark code is available at +https://github.com/google-research/google-research/tree/master/ugsl. + +
+
+
+
+
+ + ☆ Test-time augmentation-based active learning and self-training for + label-efficient segmentation MICCAI + + +
+ Deep learning techniques depend on large datasets whose annotation is +time-consuming. To reduce annotation burden, the self-training (ST) and +active-learning (AL) methods have been developed as well as methods that +combine them in an iterative fashion. However, it remains unclear when each +method is the most useful, and when it is advantageous to combine them. In this +paper, we propose a new method that combines ST with AL using Test-Time +Augmentations (TTA). First, TTA is performed on an initial teacher network. +Then, cases for annotation are selected based on the lowest estimated Dice +score. Cases with high estimated scores are used as soft pseudo-labels for ST. +The selected annotated cases are trained with existing annotated cases and ST +cases with border slices annotations. We demonstrate the method on MRI fetal +body and placenta segmentation tasks with different data variability +characteristics. Our results indicate that ST is highly effective for both +tasks, boosting performance for in-distribution (ID) and out-of-distribution +(OOD) data. However, while self-training improved the performance of +single-sequence fetal body segmentation when combined with AL, it slightly +deteriorated performance of multi-sequence placenta segmentation on ID data. AL +was helpful for the high variability placenta data, but did not improve upon +random selection for the single-sequence body data. For fetal body segmentation +sequence transfer, combining AL with ST following ST iteration yielded a Dice +of 0.961 with only 6 original scans and 2 new sequence scans. Results using +only 15 high-variability placenta cases were similar to those using 50 cases. +Code is available at: https://github.com/Bella31/TTA-quality-estimation-ST-AL + +
+
+ comment: Accepted to MICCAI MILLanD workshop 2023 +
+
+
+
+
+ + ☆ Clustered Linear Contextual Bandits with Knapsacks + + +
+ In this work, we study clustered contextual bandits where rewards and resource consumption are the outcomes of cluster-specific linear models. The arms are divided into clusters, with the cluster memberships unknown to the algorithm. Pulling an arm in a time period yields a reward and consumes each of multiple resources, and the algorithm terminates once the total consumption of any resource exceeds its constraint. Thus, maximizing the total reward requires learning not only models of the reward and the resource consumption, but also the cluster memberships. We provide an algorithm that achieves regret sublinear in the number of time periods, without requiring access to all of the arms. In particular, we show that it suffices to perform clustering only once on a randomly selected subset of the arms. To achieve this result, we provide a sophisticated combination of techniques from the econometrics literature and the literature on bandits with constraints.
+
+
+
+
+ + ☆ CoMIX: A Multi-agent Reinforcement Learning Training Architecture for + Efficient Decentralized Coordination and Independent Decision Making + + +
+ Robust coordination skills enable agents to operate cohesively in shared environments, together towards a common goal and, ideally, individually without hindering each other's progress. To this end, this paper presents Coordinated QMIX (CoMIX), a novel training framework for decentralized agents that enables emergent coordination through flexible policies while at the same time allowing independent decision-making at the individual level. CoMIX models selfish and collaborative behavior as incremental steps in each agent's decision process. This allows agents to dynamically adapt their behavior to different situations, balancing independence and collaboration. Experiments using a variety of simulation environments demonstrate that CoMIX outperforms baselines on collaborative tasks. The results validate our incremental policy approach as an effective technique for improving coordination in multi-agent systems.
+
+
+
+
+ + ☆ Relax and penalize: a new bilevel approach to mixed-binary + hyperparameter optimization + + +
+ In recent years, bilevel approaches have become very popular as an efficient way to estimate high-dimensional hyperparameters of machine learning models. However, to date, binary parameters are handled by continuous relaxation and rounding strategies, which could lead to inconsistent solutions. In this context, we tackle the challenging optimization of mixed-binary hyperparameters by resorting to an equivalent continuous bilevel reformulation based on an appropriate penalty term. We propose an algorithmic framework that, under suitable assumptions, is guaranteed to provide mixed-binary solutions. Moreover, the generality of the method allows existing continuous bilevel solvers to be safely used within the proposed framework. We evaluate the performance of our approach on a specific machine learning problem, i.e., the estimation of the group-sparsity structure in regression problems. The reported results clearly show that our method outperforms state-of-the-art approaches based on relaxation and rounding.
+
+
+
+
+ + ☆ Measuring the Effect of Causal Disentanglement on the Adversarial + Robustness of Neural Network Models + + +
+ Causal Neural Network models have shown high levels of robustness to +adversarial attacks as well as an increased capacity for generalisation tasks +such as few-shot learning and rare-context classification compared to +traditional Neural Networks. This robustness is argued to stem from the +disentanglement of causal and confounder input signals. However, no +quantitative study has yet measured the level of disentanglement achieved by +these types of causal models or assessed how this relates to their adversarial +robustness. + Existing causal disentanglement metrics are not applicable to deterministic +models trained on real-world datasets. We, therefore, utilise metrics of +content/style disentanglement from the field of Computer Vision to measure +different aspects of the causal disentanglement for four state-of-the-art +causal Neural Network models. By re-implementing these models with a common +ResNet18 architecture we are able to fairly measure their adversarial +robustness on three standard image classification benchmarking datasets under +seven common white-box attacks. We find a strong association (r=0.820, p=0.001) +between the degree to which models decorrelate causal and confounder signals +and their adversarial robustness. Additionally, we find a moderate negative +association between the pixel-level information content of the confounder +signal and adversarial robustness (r=-0.597, p=0.040). + +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Sampling From Autoencoders' Latent Space via Quantization And + Probability Mass Function Concepts + + +
+ In this study, we focus on sampling from the latent space of generative models built upon autoencoders so that the reconstructed samples are lifelike images. To do so, we introduce a novel post-training sampling algorithm rooted in the concept of probability mass functions, coupled with a quantization process. Our proposed algorithm establishes a vicinity around each latent vector from the input data and then proceeds to draw samples from these defined neighborhoods. This strategic approach ensures that the sampled latent vectors predominantly inhabit high-probability regions, which, in turn, can be effectively transformed into authentic real-world images. A noteworthy point of comparison for our sampling algorithm is the sampling technique based on Gaussian mixture models (GMM), owing to its inherent capability to represent clusters. Remarkably, we manage to improve the time complexity from the previous $\mathcal{O}(n\times d \times k \times i)$ associated with GMM sampling to a much more streamlined $\mathcal{O}(n\times d)$, thereby resulting in a substantial speedup at runtime. Moreover, our experimental results, gauged through the Fr\'echet inception distance (FID) for image generation, underscore the superior performance of our sampling algorithm across a diverse range of models and datasets. On the MNIST benchmark dataset, our approach outperforms GMM sampling by yielding a noteworthy improvement of up to $0.89$ in FID value. Furthermore, when it comes to generating images of faces and ocular images, our approach showcases substantial enhancements with FID improvements of $1.69$ and $0.87$ respectively, as compared to GMM sampling, as evidenced on the CelebA and MOBIUS datasets. Lastly, we substantiate our methodology's efficacy in estimating latent space distributions in contrast to GMM sampling, particularly through the lens of the Wasserstein distance.
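A rough sketch of the quantization-plus-PMF idea described above: quantize training latents into grid cells, sample a cell according to its empirical mass, and perturb a latent inside that cell. The bin width and noise scale are illustrative assumptions, not the paper's settings.

# Post-training latent sampling via quantization and an empirical PMF over grid cells.
# Bin width and noise scale are illustrative assumptions.
import numpy as np

def sample_latents(latents, n_samples, bin_width=0.5, noise=0.1, seed=0):
    rng = np.random.default_rng(seed)
    cells = np.floor(latents / bin_width).astype(int)
    keys, inverse, counts = np.unique(cells, axis=0, return_inverse=True,
                                      return_counts=True)
    inverse = inverse.ravel()
    pmf = counts / counts.sum()                 # empirical probability mass per cell
    samples = []
    for _ in range(n_samples):
        cell = rng.choice(len(keys), p=pmf)     # pick a cell by its mass
        members = latents[inverse == cell]      # training latents inside that cell
        anchor = members[rng.integers(len(members))]
        samples.append(anchor + noise * rng.standard_normal(latents.shape[1]))
    return np.stack(samples)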
+
+
+
+
+ + ☆ Cost-Efficient Online Decision Making: A Combinatorial Multi-Armed + Bandit Approach + + +
+ Online decision making plays a crucial role in numerous real-world +applications. In many scenarios, the decision is made based on performing a +sequence of tests on the incoming data points. However, performing all tests +can be expensive and is not always possible. In this paper, we provide a novel +formulation of the online decision making problem based on combinatorial +multi-armed bandits and take the cost of performing tests into account. Based +on this formulation, we provide a new framework for cost-efficient online +decision making which can utilize posterior sampling or BayesUCB for +exploration. We provide a rigorous theoretical analysis for our framework and +present various experimental results that demonstrate its applicability to +real-world problems. + +
+
+
+
+
+ + ☆ Beyond expectations: Residual Dynamic Mode Decomposition and Variance + for Stochastic Dynamical Systems + + +
+ Koopman operators linearize nonlinear dynamical systems, making their +spectral information of crucial interest. Numerous algorithms have been +developed to approximate these spectral properties, and Dynamic Mode +Decomposition (DMD) stands out as the poster child of projection-based methods. +Although the Koopman operator itself is linear, the fact that it acts in an +infinite-dimensional space of observables poses various challenges. These +include spurious modes, essential spectra, and the verification of Koopman mode +decompositions. While recent work has addressed these challenges for +deterministic systems, there remains a notable gap in verified DMD methods +tailored for stochastic systems, where the Koopman operator measures the +expectation of observables. We show that it is necessary to go beyond +expectations to address these issues. By incorporating variance into the +Koopman framework, we address these challenges. Through an additional DMD-type +matrix, we approximate the sum of a squared residual and a variance term, each +of which can be approximated individually using batched snapshot data. This +allows verified computation of the spectral properties of stochastic Koopman +operators, controlling the projection error. We also introduce the concept of +variance-pseudospectra to gauge statistical coherency. Finally, we present a +suite of convergence results for the spectral quantities of stochastic Koopman +operators. Our study concludes with practical applications using both simulated +and experimental data. In neural recordings from awake mice, we demonstrate how +variance-pseudospectra can reveal physiologically significant information +unavailable to standard expectation-based dynamical models. + +
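For reference, the snippet below implements plain (exact) DMD from snapshot pairs, the projection-based baseline that the variance-augmented method above extends; the residual and variance terms themselves are not reproduced here.

# Plain (exact) DMD from snapshot pairs X, Y with Y[:, k] the successor of X[:, k].
import numpy as np

def dmd(X, Y, rank):
    U, s, Vh = np.linalg.svd(X, full_matrices=False)
    U, s, V = U[:, :rank], s[:rank], Vh[:rank].conj().T
    A_tilde = (U.conj().T @ Y @ V) / s          # projected Koopman approximation U^H Y V S^{-1}
    eigvals, W = np.linalg.eig(A_tilde)          # DMD eigenvalues
    modes = ((Y @ V) / s) @ W                    # exact DMD modes
    return eigvals, modes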
+
+
+
+
+ + ☆ An Improved Best-of-both-worlds Algorithm for Bandits with Delayed + Feedback + + +
+ We propose a new best-of-both-worlds algorithm for bandits with variably +delayed feedback. The algorithm improves on prior work by Masoudian et al. +[2022] by eliminating the need in prior knowledge of the maximal delay +$d_{\mathrm{max}}$ and providing tighter regret bounds in both regimes. The +algorithm and its regret bounds are based on counts of outstanding observations +(a quantity that is observed at action time) rather than delays or the maximal +delay (quantities that are only observed when feedback arrives). One major +contribution is a novel control of distribution drift, which is based on biased +loss estimators and skipping of observations with excessively large delays. +Another major contribution is demonstrating that the complexity of +best-of-both-worlds bandits with delayed feedback is characterized by the +cumulative count of outstanding observations after skipping of observations +with excessively large delays, rather than the delays or the maximal delay. + +
+
+
+
+
+ + ☆ A Safe Deep Reinforcement Learning Approach for Energy Efficient + Federated Learning in Wireless Communication Networks + + +
+ Progressing towards a new era of Artificial Intelligence (AI) - enabled +wireless networks, concerns regarding the environmental impact of AI have been +raised both in industry and academia. Federated Learning (FL) has emerged as a +key privacy preserving decentralized AI technique. Despite efforts currently +being made in FL, its environmental impact is still an open problem. Targeting +the minimization of the overall energy consumption of an FL process, we propose +the orchestration of computational and communication resources of the involved +devices to minimize the total energy required, while guaranteeing a certain +performance of the model. To this end, we propose a Soft Actor Critic Deep +Reinforcement Learning (DRL) solution, where a penalty function is introduced +during training, penalizing the strategies that violate the constraints of the +environment, and ensuring a safe RL process. A device level synchronization +method, along with a computationally cost effective FL environment are +proposed, with the goal of further reducing the energy consumption and +communication overhead. Evaluation results show the effectiveness of the +proposed scheme compared to four state-of-the-art baseline solutions in both +static and dynamic environments, achieving a decrease of up to 94% in the total +energy consumption. + +
+
+ comment: 27 Pages Single Column, 6 Figures, Submitted for possible publication + in the IEEE Transactions on Green Communications and Networking (TGCN). arXiv + admin note: text overlap with arXiv:2306.14237 +
+
+
+
+
+ + ☆ Practical Parallel Algorithms for Non-Monotone Submodular Maximization AAAI-2023 + + +
+ Submodular maximization has found extensive applications in various domains +within the field of artificial intelligence, including but not limited to +machine learning, computer vision, and natural language processing. With the +increasing size of datasets in these domains, there is a pressing need to +develop efficient and parallelizable algorithms for submodular maximization. +One measure of the parallelizability of a submodular maximization algorithm is +its adaptive complexity, which indicates the number of sequential rounds where +a polynomial number of queries to the objective function can be executed in +parallel. In this paper, we study the problem of non-monotone submodular +maximization subject to a knapsack constraint, and propose the first +combinatorial algorithm achieving an $(8+\epsilon)$-approximation under +$\mathcal{O}(\log n)$ adaptive complexity, which is \textit{optimal} up to a +factor of $\mathcal{O}(\log\log n)$. Moreover, we also propose the first +algorithm with both provable approximation ratio and sublinear adaptive +complexity for the problem of non-monotone submodular maximization subject to a +$k$-system constraint. As a by-product, we show that our two algorithms can +also be applied to the special case of submodular maximization subject to a +cardinality constraint, and achieve performance bounds comparable with those of +state-of-the-art algorithms. Finally, the effectiveness of our approach is +demonstrated by extensive experiments on real-world applications. + +
+
+ comment: Part of the contribution appears in AAAI-2023 +
+
+
+
+
+ + ☆ Deep Evidential Learning for Bayesian Quantile Regression + + +
+ It is desirable to have accurate uncertainty estimation from a single +deterministic forward-pass model, as traditional methods for uncertainty +quantification are computationally expensive. However, this is difficult +because single forward-pass models do not sample weights during inference and +often make assumptions about the target distribution, such as assuming it is +Gaussian. This can be restrictive in regression tasks, where the mean and +standard deviation are inadequate to model the target distribution accurately. +This paper proposes a deep Bayesian quantile regression model that can estimate +the quantiles of a continuous target distribution without the Gaussian +assumption. The proposed method is based on evidential learning, which allows +the model to capture aleatoric and epistemic uncertainty with a single +deterministic forward-pass model. This makes the method efficient and scalable +to large models and datasets. We demonstrate that the proposed method achieves +calibrated uncertainties on non-Gaussian distributions, disentanglement of +aleatoric and epistemic uncertainty, and robustness to out-of-distribution +samples. + +
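The standard pinball loss underlying quantile regression is shown below for context; the evidential extension described in the abstract places uncertainty parameters on top of these quantile estimates and is not reproduced here.

# Standard pinball (quantile) loss: the basic objective quantile regression builds on.
import torch

def pinball_loss(pred, target, quantile):
    """pred, target: tensors of the same shape; quantile in (0, 1)."""
    diff = target - pred
    return torch.mean(torch.maximum(quantile * diff, (quantile - 1.0) * diff))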
+
+
+
+
+ + ☆ Reinforcement Learning Based Sensor Optimization for Bio-markers + + +
+ Radio frequency (RF) biosensors, in particular those based on inter-digitated capacitors (IDCs), are pivotal in areas like biomedical diagnosis, remote sensing, and wireless communication. Despite their advantages of low cost and easy fabrication, their sensitivity can be hindered by design imperfections, environmental factors, and circuit noise. This paper investigates enhancing the sensitivity of IDC-based RF sensors using a novel reinforcement learning-based Binary Particle Swarm Optimization (RLBPSO), which is compared to Ant Colony Optimization (ACO) and other state-of-the-art methods. By focusing on optimizing design parameters like electrode design and finger width, the study finds notable improvements in sensor sensitivity. The proposed RLBPSO method yields the best optimized designs across various frequency ranges when compared to current state-of-the-art methods.
+
+ comment: 7 pages, 4 tables +
+
+
+
+
+ + ☆ Faster Training of Neural ODEs Using Gauß-Legendre Quadrature + + +
+ Neural ODEs demonstrate strong performance in generative and time-series +modelling. However, training them via the adjoint method is slow compared to +discrete models due to the requirement of numerically solving ODEs. To speed +neural ODEs up, a common approach is to regularise the solutions. However, this +approach may affect the expressivity of the model; when the trajectory itself +matters, this is particularly important. In this paper, we propose an +alternative way to speed up the training of neural ODEs. The key idea is to +speed up the adjoint method by using Gau{\ss}-Legendre quadrature to solve +integrals faster than ODE-based methods while remaining memory efficient. We +also extend the idea to training SDEs using the Wong-Zakai theorem, by training +a corresponding ODE and transferring the parameters. Our approach leads to +faster training of neural ODEs, especially for large models. It also presents a +new way to train SDE-based models. + +
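As a reminder of the numerical tool involved, the snippet below evaluates an integral over [t0, t1] with Gauss-Legendre quadrature via NumPy; how the integrand is assembled from the adjoint state in the proposed training scheme is not shown, and the example integrand is arbitrary.

# Gauss-Legendre quadrature on [t0, t1] using NumPy's Legendre nodes and weights.
import numpy as np

def gauss_legendre_integral(f, t0, t1, n_nodes=16):
    nodes, weights = np.polynomial.legendre.leggauss(n_nodes)   # nodes/weights on [-1, 1]
    t = 0.5 * (t1 - t0) * nodes + 0.5 * (t1 + t0)                # affine map to [t0, t1]
    return 0.5 * (t1 - t0) * np.sum(weights * f(t))

# Example: the integral of sin over [0, pi] is 2.
print(gauss_legendre_integral(np.sin, 0.0, np.pi))   # ~2.0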
+
+ comment: 32 pages, 16 figures, 7 tables, published in TMLR 2023 +
+
+
+
+
+ + ☆ SCULPT: Shape-Conditioned Unpaired Learning of Pose-dependent Clothed + and Textured Human Meshes + + +
+ We present SCULPT, a novel 3D generative model for clothed and textured 3D meshes of humans. Specifically, we devise a deep neural network that learns to represent the geometry and appearance distribution of clothed human bodies. Training such a model is challenging, as datasets of textured 3D meshes for humans are limited in size and accessibility. Our key observation is that there exist medium-sized 3D scan datasets like CAPE as well as large-scale 2D image datasets of clothed humans, and that multiple appearances can be mapped to a single geometry. To effectively learn from the two data modalities, we propose an unpaired learning procedure for pose-dependent clothed and textured human meshes. Specifically, we learn a pose-dependent geometry space from 3D scan data, represented as per-vertex displacements w.r.t. the SMPL model. Next, we train a geometry-conditioned texture generator in an unsupervised way using the 2D image data. We use intermediate activations of the learned geometry model to condition our texture generator. To alleviate entanglement between pose and clothing type, and between pose and clothing appearance, we condition both the texture and geometry generators with attribute labels, such as clothing types for the geometry generator and clothing colors for the texture generator. We automatically generate these conditioning labels for the 2D images using the visual question answering model BLIP and CLIP. We validate our method on the SCULPT dataset and compare it to state-of-the-art 3D generative models for clothed human bodies. We will release the codebase for research purposes.
+
+
+
+
+ + ☆ Foundation Model-oriented Robustness: Robust Image Model Evaluation with + Pretrained Models + + +
+ Machine learning has demonstrated remarkable performance over finite +datasets, yet whether the scores over the fixed benchmarks can sufficiently +indicate the model's performance in the real world is still in discussion. In +reality, an ideal robust model will probably behave similarly to the oracle +(e.g., the human users), thus a good evaluation protocol is probably to +evaluate the models' behaviors in comparison to the oracle. In this paper, we +introduce a new robustness measurement that directly measures the image +classification model's performance compared with a surrogate oracle (i.e., a +foundation model). Besides, we design a simple method that can accomplish the +evaluation beyond the scope of the benchmarks. Our method extends the image +datasets with new samples that are sufficiently perturbed to be distinct from +the ones in the original sets, but are still bounded within the same +image-label structure the original test image represents, constrained by a +foundation model pretrained with a large amount of samples. As a result, our +new method will offer us a new way to evaluate the models' robustness +performance, free of limitations of fixed benchmarks or constrained +perturbations, although scoped by the power of the oracle. In addition to the +evaluation results, we also leverage our generated data to understand the +behaviors of the model and our new evaluation strategies. + +
+
+
+
+
+ + ☆ A Homogenization Approach for Gradient-Dominated Stochastic Optimization + + +
+ Gradient dominance property is a condition weaker than strong convexity, yet +it sufficiently ensures global convergence for first-order methods even in +non-convex optimization. This property finds application in various machine +learning domains, including matrix decomposition, linear neural networks, and +policy-based reinforcement learning (RL). In this paper, we study the +stochastic homogeneous second-order descent method (SHSODM) for +gradient-dominated optimization with $\alpha \in [1, 2]$ based on a recently +proposed homogenization approach. Theoretically, we show that SHSODM achieves a +sample complexity of $O(\epsilon^{-7/(2 \alpha) +1})$ for $\alpha \in [1, 3/2)$ +and $\tilde{O}(\epsilon^{-2/\alpha})$ for $\alpha \in [3/2, 2]$. We further +provide a SHSODM with a variance reduction technique enjoying an improved +sample complexity of $O( \epsilon ^{-( 7-3\alpha ) /( 2\alpha )})$ for $\alpha +\in [1,3/2)$. Our results match the state-of-the-art sample complexity bounds +for stochastic gradient-dominated optimization without \emph{cubic +regularization}. Since the homogenization approach only relies on solving +extremal eigenvector problems instead of Newton-type systems, our methods gain +the advantage of cheaper iterations and robustness in ill-conditioned problems. +Numerical experiments on several RL tasks demonstrate the efficiency of SHSODM +compared to other off-the-shelf methods. + +
+
+
+
+
+ + ☆ GaitPT: Skeletons Are All You Need For Gait Recognition + + +
+ The analysis of patterns of walking is an important area of research that has +numerous applications in security, healthcare, sports and human-computer +interaction. Lately, walking patterns have been regarded as a unique +fingerprinting method for automatic person identification at a distance. In +this work, we propose a novel gait recognition architecture called Gait Pyramid +Transformer (GaitPT) that leverages pose estimation skeletons to capture unique +walking patterns, without relying on appearance information. GaitPT adopts a +hierarchical transformer architecture that effectively extracts both spatial +and temporal features of movement in an anatomically consistent manner, guided +by the structure of the human skeleton. Our results show that GaitPT achieves +state-of-the-art performance compared to other skeleton-based gait recognition +works, in both controlled and in-the-wild scenarios. GaitPT obtains 82.6% +average accuracy on CASIA-B, surpassing other works by a margin of 6%. +Moreover, it obtains 52.16% Rank-1 accuracy on GREW, outperforming both +skeleton-based and appearance-based approaches. + +
+
+
+
+
+ + ☆ Weighting by Tying: A New Approach to Weighted Rank Correlation + + +
+ Measures of rank correlation are commonly used in statistics to capture the
+degree of concordance between two orderings of the same set of items. Standard
+measures like Kendall's tau and Spearman's rho coefficient put equal emphasis
+on each position of a ranking. Yet, motivated by applications in which some of
+the positions (typically those on the top) are more important than others, a
+few weighted variants of these measures have been proposed. Most of these
+generalizations fail to meet desirable formal properties, however. Besides,
+they are often quite inflexible in the sense of committing to a fixed weighting
+scheme. In this paper, we propose a weighted rank correlation measure on the
+basis of fuzzy order relations. Our measure, called scaled gamma, is related to
+Goodman and Kruskal's gamma rank correlation. It is parametrized by a fuzzy
+equivalence relation on the rank positions, which in turn is specified
+conveniently by a so-called scaling function. This approach combines soundness
+with flexibility: it has a sound formal foundation and allows for weighting
+rank positions in a flexible way.
+
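+ Scaled gamma builds on Goodman and Kruskal's gamma. A minimal sketch of the
+ unweighted gamma coefficient it generalizes (the fuzzy-equivalence weighting
+ itself is not reproduced here; the example rankings are illustrative):
+
+ import numpy as np
+
+ def goodman_kruskal_gamma(x, y):
+     # Count concordant and discordant item pairs across the two rankings;
+     # tied pairs contribute to neither count (the classical gamma convention).
+     x, y = np.asarray(x), np.asarray(y)
+     concordant = discordant = 0
+     n = len(x)
+     for i in range(n):
+         for j in range(i + 1, n):
+             s = np.sign(x[i] - x[j]) * np.sign(y[i] - y[j])
+             if s > 0:
+                 concordant += 1
+             elif s < 0:
+                 discordant += 1
+     return (concordant - discordant) / (concordant + discordant)
+
+ # Two rankings (rank positions) of the same five items.
+ print(goodman_kruskal_gamma([1, 2, 3, 4, 5], [2, 1, 3, 5, 4]))
+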
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ centroIDA: Cross-Domain Class Discrepancy Minimization Based on + Accumulative Class-Centroids for Imbalanced Domain Adaptation + + +
+ Unsupervised Domain Adaptation (UDA) approaches address the covariate shift
+problem by minimizing the distribution discrepancy between the source and
+target domains, assuming that the label distribution is invariant across
+domains. However, in the imbalanced domain adaptation (IDA) scenario, covariate
+and long-tailed label shifts both exist across domains. To tackle the IDA
+problem, some current research focuses on minimizing the distribution
+discrepancy of each corresponding class between the source and target domains.
+Such methods rely heavily on reliable pseudo-label selection and on estimating
+the feature distributions of the target domain, and the minority classes with
+limited samples make these estimations more uncertain, which degrades the
+model's performance. In this paper, we propose a cross-domain class discrepancy
+minimization method based on accumulative class-centroids for IDA (centroIDA).
+Firstly, a class-based re-sampling strategy is used to obtain an unbiased
+classifier on the source domain. Secondly, an accumulative class-centroids
+alignment loss is proposed for iterative class-centroids alignment across
+domains. Finally, a class-wise feature alignment loss is used to optimize the
+feature representation for a robust classification boundary. A series of
+experiments has shown that our method outperforms other SOTA methods on the IDA
+problem, especially with an increasing degree of label shift.
+
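+ A minimal, hypothetical sketch of the accumulative class-centroid idea:
+ per-class centroids are accumulated across iterations with a moving average
+ and then aligned between domains. The momentum, feature dimensions, and use
+ of random labels (in practice, target pseudo-labels) are illustrative
+ assumptions, not the paper's exact formulation:
+
+ import torch
+
+ def update_centroids(centroids, feats, labels, num_classes, momentum=0.9):
+     # Accumulate per-class centroids across iterations with a moving average.
+     for c in range(num_classes):
+         mask = labels == c
+         if mask.any():
+             batch_mean = feats[mask].mean(dim=0)
+             centroids[c] = momentum * centroids[c] + (1 - momentum) * batch_mean
+     return centroids
+
+ def centroid_alignment_loss(src_centroids, tgt_centroids):
+     # Class-wise squared distance between source and target centroids.
+     return ((src_centroids - tgt_centroids) ** 2).sum(dim=1).mean()
+
+ # Toy usage with 3 classes and 8-dimensional features.
+ src_c = update_centroids(torch.zeros(3, 8), torch.randn(16, 8),
+                          torch.randint(0, 3, (16,)), 3)
+ tgt_c = update_centroids(torch.zeros(3, 8), torch.randn(16, 8),
+                          torch.randint(0, 3, (16,)), 3)
+ print(centroid_alignment_loss(src_c, tgt_c))
+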
+
+
+
+
+ + ☆ ST-RAP: A Spatio-Temporal Framework for Real Estate Appraisal CIKM'23 + + +
+ In this paper, we introduce ST-RAP, a novel Spatio-Temporal framework for +Real estate APpraisal. ST-RAP employs a hierarchical architecture with a +heterogeneous graph neural network to encapsulate temporal dynamics and spatial +relationships simultaneously. Through comprehensive experiments on a +large-scale real estate dataset, ST-RAP outperforms previous methods, +demonstrating the significant benefits of integrating spatial and temporal +aspects in real estate appraisal. Our code and dataset are available at +https://github.com/dojeon-ai/STRAP. + +
+
+ comment: Accepted to CIKM'23 +
+
+
+
+
+ + ☆ FocalDreamer: Text-driven 3D Editing via Focal-fusion Assembly + + +
+ While text-3D editing has made significant strides in leveraging score +distillation sampling, emerging approaches still fall short in delivering +separable, precise and consistent outcomes that are vital to content creation. +In response, we introduce FocalDreamer, a framework that merges base shape with +editable parts according to text prompts for fine-grained editing within +desired regions. Specifically, equipped with geometry union and dual-path +rendering, FocalDreamer assembles independent 3D parts into a complete object, +tailored for convenient instance reuse and part-wise control. We propose +geometric focal loss and style consistency regularization, which encourage +focal fusion and congruent overall appearance. Furthermore, FocalDreamer +generates high-fidelity geometry and PBR textures which are compatible with +widely-used graphics engines. Extensive experiments have highlighted the +superior editing capabilities of FocalDreamer in both quantitative and +qualitative evaluations. + +
+
+ comment: Project website: https://fantasia3d.github.io +
+
+
+
+
+ + ☆ Analyzing Complex Systems with Cascades Using Continuous-Time Bayesian + Networks + + +
+ Interacting systems of events may exhibit cascading behavior where events +tend to be temporally clustered. While the cascades themselves may be obvious +from the data, it is important to understand which states of the system trigger +them. For this purpose, we propose a modeling framework based on +continuous-time Bayesian networks (CTBNs) to analyze cascading behavior in +complex systems. This framework allows us to describe how events propagate +through the system and to identify likely sentry states, that is, system states +that may lead to imminent cascading behavior. Moreover, CTBNs have a simple +graphical representation and provide interpretable outputs, both of which are +important when communicating with domain experts. We also develop new methods +for knowledge extraction from CTBNs and we apply the proposed methodology to a +data set of alarms in a large industrial system. + +
+
+ comment: 21 pages, 11 figures +
+
+
+
+
+ + ☆ BackTrack: Robust template update via Backward Tracking of candidate + template + + +
+ Variations of target appearance such as deformations, illumination variance, +occlusion, etc., are the major challenges of visual object tracking that +negatively impact the performance of a tracker. An effective method to tackle +these challenges is template update, which updates the template to reflect the +change of appearance in the target object during tracking. However, with +template updates, inadequate quality of new templates or inappropriate timing +of updates may induce a model drift problem, which severely degrades the +tracking performance. Here, we propose BackTrack, a robust and reliable method +to quantify the confidence of the candidate template by backward tracking it on +the past frames. Based on the confidence score of candidates from BackTrack, we +can update the template with a reliable candidate at the right time while +rejecting unreliable candidates. BackTrack is a generic template update scheme +and is applicable to any template-based trackers. Extensive experiments on +various tracking benchmarks verify the effectiveness of BackTrack over existing +template update algorithms, as it achieves SOTA performance on various tracking +benchmarks. + +
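+ A small, hypothetical sketch of the backward-tracking confidence check
+ described above: the candidate template is tracked backwards over past
+ frames and accepted only if its boxes agree with the boxes already recorded
+ there. The IoU scoring and the acceptance threshold are illustrative, not
+ the paper's exact criterion:
+
+ import numpy as np
+
+ def iou(box_a, box_b):
+     # Boxes are (x1, y1, x2, y2).
+     xa, ya = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
+     xb, yb = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
+     inter = max(0.0, xb - xa) * max(0.0, yb - ya)
+     area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+     area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+     return inter / (area_a + area_b - inter + 1e-9)
+
+ def accept_candidate(backward_boxes, past_boxes, threshold=0.6):
+     # Confidence of a candidate template = mean IoU between the boxes found
+     # by tracking it backwards and the boxes recorded on those past frames.
+     scores = [iou(b, p) for b, p in zip(backward_boxes, past_boxes)]
+     confidence = float(np.mean(scores))
+     return confidence >= threshold, confidence
+
+ print(accept_candidate([(10, 10, 50, 50)], [(12, 11, 52, 49)]))
+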
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Improving the Transferability of Adversarial Examples with Arbitrary + Style Transfer + + +
+ Deep neural networks are vulnerable to adversarial examples crafted by
+applying human-imperceptible perturbations on clean inputs. Although many
+attack methods can achieve high success rates in the white-box setting, they
+also exhibit weak transferability in the black-box setting. Recently, various
+methods have been proposed to improve adversarial transferability, among which
+input transformation is one of the most effective. In this work, we notice that
+existing input transformation-based works mainly adopt the transformed data in
+the same domain for augmentation. Inspired by domain generalization, we aim to
+further improve the transferability using data augmented from different
+domains. Specifically, a style transfer network can alter the distribution of
+low-level visual features in an image while preserving semantic content for
+humans. Hence, we propose a novel attack method named Style Transfer Method
+(STM) that utilizes a proposed arbitrary style transfer network to transform
+the images into different domains. To avoid inconsistent semantic information
+of stylized images for the classification network, we fine-tune the style
+transfer network and mix up the generated images, with random noise added,
+with the original images to maintain semantic consistency and boost input
+diversity. Extensive experimental results on the ImageNet-compatible dataset
+show that our proposed method can significantly improve adversarial
+transferability on both normally trained and adversarially trained models
+compared to state-of-the-art input transformation-based attacks. Code is
+available at: https://github.com/Zhijin-Ge/STM.
+
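+ A minimal sketch of the mix-up step described above: the stylized image is
+ blended with the original and perturbed with random noise before being fed
+ to the attack. The mixing ratio and noise scale are illustrative assumptions,
+ not the paper's exact settings:
+
+ import torch
+
+ def mix_stylized(original, stylized, gamma=0.5, noise_scale=16 / 255):
+     # Blend the stylized image with the clean input and add uniform noise,
+     # keeping semantics close to the original while boosting input diversity.
+     noise = torch.empty_like(original).uniform_(-noise_scale, noise_scale)
+     mixed = gamma * original + (1 - gamma) * stylized + noise
+     return mixed.clamp(0.0, 1.0)
+
+ x = torch.rand(1, 3, 224, 224)        # clean image in [0, 1]
+ x_style = torch.rand(1, 3, 224, 224)  # output of a style-transfer network
+ print(mix_stylized(x, x_style).shape)
+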
+
+ comment: 10 pages, 2 figures, accepted by the 31st ACM International + Conference on Multimedia (MM '23) +
+
+
+
+
+ + ☆ Image-free Classifier Injection for Zero-Shot Classification ICCV 2023 + + +
+ Zero-shot learning models achieve remarkable results on image classification +for samples from classes that were not seen during training. However, such +models must be trained from scratch with specialised methods: therefore, access +to a training dataset is required when the need for zero-shot classification +arises. In this paper, we aim to equip pre-trained models with zero-shot +classification capabilities without the use of image data. We achieve this with +our proposed Image-free Classifier Injection with Semantics (ICIS) that injects +classifiers for new, unseen classes into pre-trained classification models in a +post-hoc fashion without relying on image data. Instead, the existing +classifier weights and simple class-wise descriptors, such as class names or +attributes, are used. ICIS has two encoder-decoder networks that learn to +reconstruct classifier weights from descriptors (and vice versa), exploiting +(cross-)reconstruction and cosine losses to regularise the decoding process. +Notably, ICIS can be cheaply trained and applied directly on top of pre-trained +classification models. Experiments on benchmark ZSL datasets show that ICIS +produces unseen classifier weights that achieve strong (generalised) zero-shot +classification performance. Code is available at +https://github.com/ExplainableML/ImageFreeZSL . + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ RADIANCE: Radio-Frequency Adversarial Deep-learning Inference for + Automated Network Coverage Estimation + + +
+ Radio-frequency coverage maps (RF maps) are extensively utilized in wireless +networks for capacity planning, placement of access points and base stations, +localization, and coverage estimation. Conducting site surveys to obtain RF +maps is labor-intensive and sometimes not feasible. In this paper, we propose +radio-frequency adversarial deep-learning inference for automated network +coverage estimation (RADIANCE), a generative adversarial network (GAN) based +approach for synthesizing RF maps in indoor scenarios. RADIANCE utilizes a +semantic map, a high-level representation of the indoor environment to encode +spatial relationships and attributes of objects within the environment and +guide the RF map generation process. We introduce a new gradient-based loss +function that computes the magnitude and direction of change in received signal +strength (RSS) values from a point within the environment. RADIANCE +incorporates this loss function along with the antenna pattern to capture +signal propagation within a given indoor configuration and generate new +patterns under new configuration, antenna (beam) pattern, and center frequency. +Extensive simulations are conducted to compare RADIANCE with ray-tracing +simulations of RF maps. Our results show that RADIANCE achieves a mean average +error (MAE) of 0.09, root-mean-squared error (RMSE) of 0.29, peak +signal-to-noise ratio (PSNR) of 10.78, and multi-scale structural similarity +index (MS-SSIM) of 0.80. + +
+
+ comment: 6 pages, 6 figures +
+
+
+
+
+ + ☆ Overcoming Overconfidence for Active Learning + + +
+ It is not an exaggeration to say that the recent progress in artificial
+intelligence technology depends on large-scale and high-quality data.
+Simultaneously, a prevalent issue exists everywhere: the budget for data
+labeling is constrained. Active learning is a prominent approach for addressing
+this issue, where valuable data for labeling is selected through a model and
+utilized to iteratively adjust the model. However, due to the limited amount of
+data in each iteration, the model is vulnerable to bias; thus, it is more
+likely to yield overconfident predictions. In this paper, we present two novel
+methods to address the problem of overconfidence that arises in the active
+learning scenario. The first is an augmentation strategy named
+Cross-Mix-and-Mix (CMaM), which aims to calibrate the model by expanding the
+limited training distribution. The second is a selection strategy named Ranked
+Margin Sampling (RankedMS), which prevents choosing data that leads to overly
+confident predictions. Through various experiments and analyses, we demonstrate
+that our proposals facilitate efficient data selection by alleviating
+overconfidence, while remaining readily applicable.
+
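+ A small, hypothetical sketch in the spirit of margin-based selection:
+ samples are ranked by the gap between their top-two predicted probabilities
+ and the least confident ones are chosen for labeling. This is a plain margin
+ ranking, not necessarily RankedMS's exact criterion:
+
+ import numpy as np
+
+ def ranked_margin_sampling(probs, k):
+     # probs: (n_samples, n_classes) predictions on the unlabeled pool.
+     # Return the indices of the k samples with the smallest top-2 margin.
+     sorted_probs = np.sort(probs, axis=1)
+     margins = sorted_probs[:, -1] - sorted_probs[:, -2]
+     return np.argsort(margins)[:k]
+
+ pool = np.array([[0.9, 0.05, 0.05],
+                  [0.4, 0.35, 0.25],
+                  [0.5, 0.45, 0.05]])
+ print(ranked_margin_sampling(pool, k=2))  # indices of the two closest calls
+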
+
+
+
+
+ + ☆ Decentralized Riemannian Conjugate Gradient Method on the Stiefel + Manifold + + +
+ The conjugate gradient method is a crucial first-order optimization method
+that generally converges faster than the steepest descent method, and its
+computational cost is much lower than that of second-order methods. However,
+while various types of conjugate gradient methods have been studied in
+Euclidean spaces and on Riemannian manifolds, there has been little study of
+them in distributed scenarios. This paper proposes a decentralized Riemannian
+conjugate gradient descent (DRCGD) method that aims at minimizing a global
+function over the Stiefel manifold. The optimization problem is distributed
+among a network of agents, where each agent is associated with a local
+function, and communication between agents occurs over an undirected connected
+graph. Since the Stiefel manifold is a non-convex set, the global function is
+represented as a finite sum of possibly non-convex (but smooth) local
+functions. The proposed method is free from expensive Riemannian geometric
+operations such as retractions, exponential maps, and vector transports,
+thereby reducing the computational complexity required by each agent. To the
+best of our knowledge, DRCGD is the first decentralized Riemannian conjugate
+gradient algorithm to achieve global convergence over the Stiefel manifold.
+
+
+
+
+
+ + ☆ Towards Accelerated Model Training via Bayesian Data Selection + + +
+ Mislabeled, duplicated, or biased data in real-world scenarios can lead to +prolonged training and even hinder model convergence. Traditional solutions +prioritizing easy or hard samples lack the flexibility to handle such a variety +simultaneously. Recent work has proposed a more reasonable data selection +principle by examining the data's impact on the model's generalization loss. +However, its practical adoption relies on less principled approximations and +additional clean holdout data. This work solves these problems by leveraging a +lightweight Bayesian treatment and incorporating off-the-shelf zero-shot +predictors built on large-scale pre-trained models. The resulting algorithm is +efficient and easy-to-implement. We perform extensive empirical studies on +challenging benchmarks with considerable data noise and imbalance in the online +batch selection scenario, and observe superior training efficiency over +competitive baselines. Notably, on the challenging WebVision benchmark, our +method can achieve similar predictive performance with significantly fewer +training iterations than leading data selection methods. + +
+
+
+
+
+ + ☆ Learning Weakly Convex Regularizers for Convergent Image-Reconstruction + Algorithms + + +
+ We propose to learn non-convex regularizers with a prescribed upper bound on +their weak-convexity modulus. Such regularizers give rise to variational +denoisers that minimize a convex energy. They rely on few parameters (less than +15,000) and offer a signal-processing interpretation as they mimic handcrafted +sparsity-promoting regularizers. Through numerical experiments, we show that +such denoisers outperform convex-regularization methods as well as the popular +BM3D denoiser. Additionally, the learned regularizer can be deployed to solve +inverse problems with iterative schemes that provably converge. For both CT and +MRI reconstruction, the regularizer generalizes well and offers an excellent +tradeoff between performance, number of parameters, guarantees, and +interpretability when compared to other data-driven approaches. + +
+
+
+
+
+ + ☆ KGrEaT: A Framework to Evaluate Knowledge Graphs via Downstream Tasks CIKM'23 + + +
+ In recent years, countless research papers have addressed the topics of +knowledge graph creation, extension, or completion in order to create knowledge +graphs that are larger, more correct, or more diverse. This research is +typically motivated by the argumentation that using such enhanced knowledge +graphs to solve downstream tasks will improve performance. Nonetheless, this is +hardly ever evaluated. Instead, the predominant evaluation metrics - aiming at +correctness and completeness - are undoubtedly valuable but fail to capture the +complete picture, i.e., how useful the created or enhanced knowledge graph +actually is. Further, the accessibility of such a knowledge graph is rarely +considered (e.g., whether it contains expressive labels, descriptions, and +sufficient context information to link textual mentions to the entities of the +knowledge graph). To better judge how well knowledge graphs perform on actual +tasks, we present KGrEaT - a framework to estimate the quality of knowledge +graphs via actual downstream tasks like classification, clustering, or +recommendation. Instead of comparing different methods of processing knowledge +graphs with respect to a single task, the purpose of KGrEaT is to compare +various knowledge graphs as such by evaluating them on a fixed task setup. The +framework takes a knowledge graph as input, automatically maps it to the +datasets to be evaluated on, and computes performance metrics for the defined +tasks. It is built in a modular way to be easily extendable with additional +tasks and datasets. + +
+
+ comment: Accepted for the Short Paper track of CIKM'23, October 21-25, 2023, + Birmingham, United Kingdom +
+
+
+
+
+ + ☆ DPAN: Dynamic Preference-based and Attribute-aware Network for Relevant + Recommendations + + +
+ In e-commerce platforms, the relevant recommendation is a unique scenario +providing related items for a trigger item that users are interested in. +However, users' preferences for the similarity and diversity of recommendation +results are dynamic and vary under different conditions. Moreover, individual +item-level diversity is too coarse-grained since all recommended items are +related to the trigger item. Thus, the two main challenges are to learn +fine-grained representations of similarity and diversity and capture users' +dynamic preferences for them under different conditions. To address these +challenges, we propose a novel method called the Dynamic Preference-based and +Attribute-aware Network (DPAN) for predicting Click-Through Rate (CTR) in +relevant recommendations. Specifically, based on Attribute-aware Activation +Values Generation (AAVG), Bi-dimensional Compression-based Re-expression (BCR) +is designed to obtain similarity and diversity representations of user +interests and item information. Then Shallow and Deep Union-based Fusion (SDUF) +is proposed to capture users' dynamic preferences for the diverse degree of +recommendation results according to various conditions. DPAN has demonstrated +its effectiveness through extensive offline experiments and online A/B testing, +resulting in a significant 7.62% improvement in CTR. Currently, DPAN has been +successfully deployed on our e-commerce platform serving the primary traffic +for relevant recommendations. The code of DPAN has been made publicly +available. + +
+
+
+
+
+ + ☆ Information Theory-Guided Heuristic Progressive Multi-View Coding + + +
+ Multi-view representation learning aims to capture comprehensive information +from multiple views of a shared context. Recent works intuitively apply +contrastive learning to different views in a pairwise manner, which is still +scalable: view-specific noise is not filtered in learning view-shared +representations; the fake negative pairs, where the negative terms are actually +within the same class as the positive, and the real negative pairs are +coequally treated; evenly measuring the similarities between terms might +interfere with optimization. Importantly, few works study the theoretical +framework of generalized self-supervised multi-view learning, especially for +more than two views. To this end, we rethink the existing multi-view learning +paradigm from the perspective of information theory and then propose a novel +information theoretical framework for generalized multi-view learning. Guided +by it, we build a multi-view coding method with a three-tier progressive +architecture, namely Information theory-guided hierarchical Progressive +Multi-view Coding (IPMC). In the distribution-tier, IPMC aligns the +distribution between views to reduce view-specific noise. In the set-tier, IPMC +constructs self-adjusted contrasting pools, which are adaptively modified by a +view filter. Lastly, in the instance-tier, we adopt a designed unified loss to +learn representations and reduce the gradient interference. Theoretically and +empirically, we demonstrate the superiority of IPMC over state-of-the-art +methods. + +
+
+ comment: This paper has been accepted by the Elsevier journal Neural Networks
+ (2023). arXiv admin note: substantial text overlap with arXiv:2109.02344
+
+
+
+
+
+ + ☆ Performance Enhancement Leveraging Mask-RCNN on Bengali Document Layout + Analysis + + +
+ Understanding digital documents is like solving a puzzle, especially
+historical ones. Document Layout Analysis (DLA) helps with this puzzle by
+dividing documents into sections like paragraphs, images, and tables. This is
+crucial for machines to read and understand these documents. In the DL Sprint
+2.0 competition, we worked on understanding Bangla documents. We used a dataset
+called BaDLAD with lots of examples. We trained a special model called Mask
+R-CNN to help with this understanding. We made this model better by
+step-by-step hyperparameter tuning, and we achieved a good dice score of
+0.889. However, not everything went perfectly. We tried using a model trained
+for English documents, but it didn't fit well with Bangla. This showed us that
+each language has its own challenges. Our solution for the DL Sprint 2.0 is
+publicly available at
+https://www.kaggle.com/competitions/dlsprint2/discussion/432201 along with
+notebooks, weights, and inference notebook.
+
+
+ comment: Contest paper, Contest: DL Sprint 2.0 (Link:
+ https://www.kaggle.com/competitions/dlsprint2), Solution link:
+ https://www.kaggle.com/competitions/dlsprint2/discussion/432201
+
+
+
+
+
+ + ☆ A Clustering Algorithm to Organize Satellite Hotspot Data for the + Purpose of Tracking Bushfires Remotely + + +
+ This paper proposes a spatiotemporal clustering algorithm and its +implementation in the R package spotoroo. This work is motivated by the +catastrophic bushfires in Australia throughout the summer of 2019-2020 and made +possible by the availability of satellite hotspot data. The algorithm is +inspired by two existing spatiotemporal clustering algorithms but makes +enhancements to cluster points spatially in conjunction with their movement +across consecutive time periods. It also allows for the adjustment of key +parameters, if required, for different locations and satellite data sources. +Bushfire data from Victoria, Australia, is used to illustrate the algorithm and +its use within the package. + +
+
+
+
+
+ + ☆ Adaptive Thresholding Heuristic for KPI Anomaly Detection + + +
+ A plethora of outlier detectors have been explored in the time series domain, +however, in a business sense, not all outliers are anomalies of interest. +Existing anomaly detection solutions are confined to certain outlier detectors +limiting their applicability to broader anomaly detection use cases. Network +KPIs (Key Performance Indicators) tend to exhibit stochastic behaviour +producing statistical outliers, most of which do not adversely affect business +operations. Thus, a heuristic is required to capture the business definition of +an anomaly for time series KPI. This article proposes an Adaptive Thresholding +Heuristic (ATH) to dynamically adjust the detection threshold based on the +local properties of the data distribution and adapt to changes in time series +patterns. The heuristic derives the threshold based on the expected periodicity +and the observed proportion of anomalies minimizing false positives and +addressing concept drift. ATH can be used in conjunction with any underlying +seasonality decomposition method and an outlier detector that yields an outlier +score. This method has been tested on EON1-Cell-U, a labeled KPI anomaly +dataset produced by Ericsson, to validate our hypothesis. Experimental results +show that ATH is computationally efficient making it scalable for near real +time anomaly detection and flexible with multiple forecasters and outlier +detectors. + +
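+ A simplified, hypothetical illustration of deriving a detection threshold
+ from the recent distribution of outlier scores and an expected anomaly
+ proportion. ATH additionally accounts for expected periodicity and concept
+ drift; the rate and window used here are placeholders:
+
+ import numpy as np
+
+ def adaptive_threshold(scores, expected_anomaly_rate=0.01, window=1440):
+     # Set the threshold so that roughly `expected_anomaly_rate` of the most
+     # recent `window` outlier scores would be flagged as anomalies.
+     recent = np.asarray(scores)[-window:]
+     return np.quantile(recent, 1.0 - expected_anomaly_rate)
+
+ rng = np.random.default_rng(0)
+ scores = rng.normal(0, 1, 5000)            # scores from any outlier detector
+ thr = adaptive_threshold(scores, 0.01)
+ print(thr, np.mean(scores[-1440:] > thr))  # ~1% flagged on the recent window
+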
+
+
+
+
+ + ☆ GradientCoin: A Peer-to-Peer Decentralized Large Language Models + + +
+ Since 2008, after the proposal of a Bitcoin electronic cash system, Bitcoin +has fundamentally changed the economic system over the last decade. Since 2022, +large language models (LLMs) such as GPT have outperformed humans in many +real-life tasks. However, these large language models have several practical +issues. For example, the model is centralized and controlled by a specific +unit. One weakness is that if that unit decides to shut down the model, it +cannot be used anymore. The second weakness is the lack of guaranteed +discrepancy behind this model, as certain dishonest units may design their own +models and feed them unhealthy training data. + In this work, we propose a purely theoretical design of a decentralized LLM +that operates similarly to a Bitcoin cash system. However, implementing such a +system might encounter various practical difficulties. Furthermore, this new +system is unlikely to perform better than the standard Bitcoin system in +economics. Therefore, the motivation for designing such a system is limited. It +is likely that only two types of people would be interested in setting up a +practical system for it: + $\bullet$ Those who prefer to use a decentralized ChatGPT-like software. + $\bullet$ Those who believe that the purpose of carbon-based life is to +create silicon-based life, such as Optimus Prime in Transformers. + The reason the second type of people may be interested is that it is possible +that one day an AI system like this will awaken and become the next level of +intelligence on this planet. + +
+
+
+
+
+ + ☆ Deep Learning of Delay-Compensated Backstepping for Reaction-Diffusion + PDEs + + +
+ Deep neural networks that approximate nonlinear function-to-function +mappings, i.e., operators, which are called DeepONet, have been demonstrated in +recent articles to be capable of encoding entire PDE control methodologies, +such as backstepping, so that, for each new functional coefficient of a PDE +plant, the backstepping gains are obtained through a simple function +evaluation. These initial results have been limited to single PDEs from a given +class, approximating the solutions of only single-PDE operators for the gain +kernels. In this paper we expand this framework to the approximation of +multiple (cascaded) nonlinear operators. Multiple operators arise in the +control of PDE systems from distinct PDE classes, such as the system in this +paper: a reaction-diffusion plant, which is a parabolic PDE, with input delay, +which is a hyperbolic PDE. The DeepONet-approximated nonlinear operator is a +cascade/composition of the operators defined by one hyperbolic PDE of the +Goursat form and one parabolic PDE on a rectangle, both of which are bilinear +in their input functions and not explicitly solvable. For the delay-compensated +PDE backstepping controller, which employs the learned control operator, +namely, the approximated gain kernel, we guarantee exponential stability in the +$L^2$ norm of the plant state and the $H^1$ norm of the input delay state. +Simulations illustrate the contributed theory. + +
+
+
+
+
+ + ☆ Using Autoencoders and AutoDiff to Reconstruct Missing Variables in a + Set of Time Series + + +
+ Existing black box modeling approaches in machine learning suffer from a
+fixed input and output feature combination. In this paper, a new approach to
+reconstruct missing variables in a set of time series is presented. An
+autoencoder is trained as usual with every feature on both sides and the neural
+network parameters are fixed after this training. Then, the searched variables
+are defined as missing variables at the autoencoder input and optimized via
+automatic differentiation. This optimization is performed with respect to the
+loss calculated on the available features. With this method, different input
+and output feature combinations of the trained model can be realized by
+defining the searched variables as missing variables and reconstructing them.
+The combination can be changed without training the autoencoder again. The
+approach is evaluated on the basis of a strongly nonlinear electrical
+component. It works well when one of four variables is missing and, in
+general, even when multiple variables are missing.
+
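+ A minimal PyTorch sketch of the idea above, assuming an already-trained and
+ frozen autoencoder; the toy network, feature count, and optimizer settings
+ are illustrative, not the paper's setup:
+
+ import torch
+
+ def reconstruct_missing(autoencoder, x_obs, missing_idx, steps=200, lr=1e-2):
+     # Freeze the autoencoder, treat the missing variable as a free parameter,
+     # and optimise it so that the reconstruction matches the observed features.
+     for p in autoencoder.parameters():
+         p.requires_grad_(False)
+     obs_mask = torch.ones_like(x_obs, dtype=torch.bool)
+     obs_mask[missing_idx] = False
+     missing = torch.zeros(1, requires_grad=True)
+     opt = torch.optim.Adam([missing], lr=lr)
+     for _ in range(steps):
+         opt.zero_grad()
+         x_in = torch.where(obs_mask, x_obs, missing)   # fill in the free value
+         recon = autoencoder(x_in)
+         loss = ((recon[obs_mask] - x_obs[obs_mask]) ** 2).mean()
+         loss.backward()
+         opt.step()
+     return missing.detach()
+
+ # Toy usage with a linear "autoencoder" and feature index 2 missing.
+ ae = torch.nn.Sequential(torch.nn.Linear(4, 2), torch.nn.Linear(2, 4))
+ x = torch.tensor([0.5, -1.0, 0.0, 2.0])   # the value at index 2 is unknown
+ print(reconstruct_missing(ae, x, missing_idx=2, steps=50))
+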
+
+
+
+
+ + ☆ Deciphering Raw Data in Neuro-Symbolic Learning with Provable Guarantees + + +
+ Neuro-symbolic hybrid systems are promising for integrating machine learning +and symbolic reasoning, where perception models are facilitated with +information inferred from a symbolic knowledge base through logical reasoning. +Despite empirical evidence showing the ability of hybrid systems to learn +accurate perception models, the theoretical understanding of learnability is +still lacking. Hence, it remains unclear why a hybrid system succeeds for a +specific task and when it may fail given a different knowledge base. In this +paper, we introduce a novel way of characterising supervision signals from a +knowledge base, and establish a criterion for determining the knowledge's +efficacy in facilitating successful learning. This, for the first time, allows +us to address the two questions above by inspecting the knowledge base under +investigation. Our analysis suggests that many knowledge bases satisfy the +criterion, thus enabling effective learning, while some fail to satisfy it, +indicating potential failures. Comprehensive experiments confirm the utility of +our criterion on benchmark tasks. + +
+
+
+
+
+ + ☆ Deep Metric Loss for Multimodal Learning + + +
+ Multimodal learning often outperforms its unimodal counterparts by exploiting
+unimodal contributions and cross-modal interactions. However, focusing only on
+integrating multimodal features into a unified comprehensive representation
+overlooks the unimodal characteristics. In real data, the contributions of
+modalities can vary from instance to instance, and they often reinforce or
+conflict with each other. In this study, we introduce a novel MultiModal loss
+paradigm for multimodal learning, which subgroups instances according to
+their unimodal contributions. MultiModal loss can prevent inefficient
+learning caused by overfitting and efficiently optimize multimodal models. On
+synthetic data, MultiModal loss demonstrates improved classification
+performance by subgrouping difficult instances within certain modalities. On
+four real multimodal datasets, our loss is empirically shown to improve the
+performance of recent models. Ablation studies verify the effectiveness of our
+loss. Additionally, we show that our loss generates a reliable prediction score
+for each modality, which is essential for subgrouping. Our MultiModal loss is
+a novel loss function to subgroup instances according to the contribution of
+modalities in multimodal learning and is applicable to a variety of multimodal
+models with unimodal decisions. Our code is available at
+https://github.com/SehwanMoon/MultiModalLoss.
+
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ An Effective Method using Phrase Mechanism in Neural Machine Translation + + +
+ Machine Translation is one of the essential tasks in Natural Language +Processing (NLP), which has massive applications in real life as well as +contributing to other tasks in the NLP research community. Recently, +Transformer -based methods have attracted numerous researchers in this domain +and achieved state-of-the-art results in most of the pair languages. In this +paper, we report an effective method using a phrase mechanism, +PhraseTransformer, to improve the strong baseline model Transformer in +constructing a Neural Machine Translation (NMT) system for parallel corpora +Vietnamese-Chinese. Our experiments on the MT dataset of the VLSP 2022 +competition achieved the BLEU score of 35.3 on Vietnamese to Chinese and 33.2 +BLEU scores on Chinese to Vietnamese data. Our code is available at +https://github.com/phuongnm94/PhraseTransformer. + +
+
+
+
+
+ + ☆ Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation + with Large Language Models + + +
+ Large Language Models (LLMs) possess impressive capabilities to generate
+meaningful code snippets given natural language intents in zero-shot, i.e.,
+without the need for specific fine-tuning. To unleash their full potential,
+prior work has demonstrated the benefits of fine-tuning the models on
+task-specific data. However, the fine-tuning process incurs heavy
+computational costs and is intractable when resources are scarce, especially
+for models with billions of parameters. In light of these challenges, previous
+studies explored In-Context Learning (ICL) as an effective strategy to generate
+contextually appropriate code without fine-tuning. However, it operates at
+inference time and does not involve learning task-specific parameters,
+potentially limiting the model's performance on downstream tasks. In this
+context, we foresee that Parameter-Efficient Fine-Tuning (PEFT) techniques
+carry a high potential for efficiently specializing LLMs to task-specific data.
+In this paper, we deliver a comprehensive study of the impact of PEFT
+techniques on LLMs under the automated code generation scenario. Our
+experimental results reveal the superiority and potential of such techniques
+over ICL on a wide range of LLMs in reducing the computational burden and
+improving performance. Therefore, the study opens opportunities for broader
+applications of PEFT in software engineering scenarios.
+
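+ As a concrete, hypothetical example of one PEFT technique, the snippet below
+ wraps a small causal LM with LoRA adapters via the Hugging Face peft library;
+ the base model and hyperparameters are placeholders, not the configurations
+ studied in the paper:
+
+ from transformers import AutoModelForCausalLM
+ from peft import LoraConfig, get_peft_model
+
+ # Load a small base model (illustrative choice) and attach LoRA adapters.
+ base = AutoModelForCausalLM.from_pretrained("gpt2")
+ lora_cfg = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
+                       target_modules=["c_attn"], task_type="CAUSAL_LM")
+ model = get_peft_model(base, lora_cfg)
+ model.print_trainable_parameters()  # only the low-rank adapters are trainable
+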
+
+ comment: 10+2 pages +
+
+
+
+
+ + ☆ Adaptive Local Steps Federated Learning with Differential Privacy Driven + by Convergence Analysis + + +
+ Federated Learning (FL) is a distributed machine learning technique that
+allows model training among multiple devices or organizations without sharing
+data. However, while FL ensures that the raw data is not directly accessible to
+external adversaries, adversaries can still obtain some statistical information
+about the data through differential attacks. Differential Privacy (DP) has been
+proposed, which adds noise to the model or gradients to prevent adversaries
+from inferring private information from the transmitted parameters. We
+reconsider the framework of differential privacy federated learning in
+resource-constrained scenarios (privacy budget and communication resources). We
+analyze the convergence of federated learning with differential privacy (DPFL)
+in resource-constrained scenarios and propose an Adaptive Local Steps
+Differential Privacy Federated Learning (ALS-DPFL) algorithm. We evaluate our
+algorithm on the FashionMNIST and CIFAR-10 datasets and achieve quite good
+performance relative to previous work.
+
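+ A minimal sketch of the standard clip-and-noise (Gaussian mechanism) step a
+ client would apply to its local update before transmission; the clipping norm
+ and noise multiplier are illustrative, and the adaptive local-step schedule
+ itself is not shown:
+
+ import numpy as np
+
+ def privatize_update(update, clip_norm=1.0, noise_multiplier=1.0, rng=None):
+     # Clip the update's L2 norm, then add calibrated Gaussian noise so the
+     # server never sees the raw local update.
+     rng = rng or np.random.default_rng()
+     norm = np.linalg.norm(update)
+     clipped = update * min(1.0, clip_norm / (norm + 1e-12))
+     noise = rng.normal(0.0, noise_multiplier * clip_norm, size=update.shape)
+     return clipped + noise
+
+ local_update = np.random.randn(10)
+ print(privatize_update(local_update, clip_norm=1.0, noise_multiplier=0.5))
+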
+
+
+
+
+ + ☆ DOMINO++: Domain-aware Loss Regularization for Deep Learning + Generalizability MICCAI + + +
+ Out-of-distribution (OOD) generalization poses a serious challenge for modern +deep learning (DL). OOD data consists of test data that is significantly +different from the model's training data. DL models that perform well on +in-domain test data could struggle on OOD data. Overcoming this discrepancy is +essential to the reliable deployment of DL. Proper model calibration decreases +the number of spurious connections that are made between model features and +class outputs. Hence, calibrated DL can improve OOD generalization by only +learning features that are truly indicative of the respective classes. Previous +work proposed domain-aware model calibration (DOMINO) to improve DL +calibration, but it lacks designs for model generalizability to OOD data. In +this work, we propose DOMINO++, a dual-guidance and dynamic domain-aware loss +regularization focused on OOD generalizability. DOMINO++ integrates +expert-guided and data-guided knowledge in its regularization. Unlike DOMINO +which imposed a fixed scaling and regularization rate, DOMINO++ designs a +dynamic scaling factor and an adaptive regularization rate. Comprehensive +evaluations compare DOMINO++ with DOMINO and the baseline model for head tissue +segmentation from magnetic resonance images (MRIs) on OOD data. The OOD data +consists of synthetic noisy and rotated datasets, as well as real data using a +different MRI scanner from a separate site. DOMINO++'s superior performance +demonstrates its potential to improve the trustworthy deployment of DL on real +clinical data. + +
+
+ comment: 12 pages, 5 figures, 5 tables, Accepted by the International + Conference on Medical Image Computing and Computer Assisted Intervention + (MICCAI) 2023 +
+
+
+
+
+ + ☆ CVFC: Attention-Based Cross-View Feature Consistency for Weakly + Supervised Semantic Segmentation of Pathology Images + + +
+ Histopathology image segmentation is the gold standard for diagnosing cancer
+and can indicate cancer prognosis. However, histopathology image segmentation
+requires high-quality masks, so many studies now use image-level labels to
+achieve pixel-level segmentation to reduce the need for fine-grained
+annotation. To solve this problem, we propose an attention-based cross-view
+feature consistency end-to-end pseudo-mask generation framework named CVFC.
+Specifically, CVFC is a three-branch joint framework composed of two ResNet38
+branches and one ResNet50 branch; each branch integrates multi-scale feature
+maps to generate a class activation map (CAM), and the size of the CAM is
+adjusted through down-sampling and expansion. The middle branch projects the
+feature matrix to the query and key feature spaces, and generates a
+feature-space perception matrix through the connection layer and inner product
+to adjust and refine the CAM of each branch. Finally, the parameters of CVFC
+are optimized in co-training mode through the feature consistency loss and the
+feature cross loss. After a large number of experiments, an IoU of 0.7122 and
+a fwIoU of 0.7018 are obtained on the WSSS4LUAD dataset, which outperforms
+HistoSegNet, SEAM, C-CAM, WSSS-Tissue, and OEEM, respectively.
+
+
+ comment: Submitted to BIBM2023 +
+
+
+
+
+ + ☆ DySuse: Susceptibility Estimation in Dynamic Social Networks + + +
+ Influence estimation aims to predict the total influence spread in social
+networks and has received surging attention in recent years. Most current
+studies focus on estimating the total number of influenced users in a social
+network, and neglect susceptibility estimation, which aims to predict the
+probability of each user being influenced from the individual perspective. As a
+more fine-grained estimation task, susceptibility estimation is full of
+attractiveness and practical value. Based on the significance of susceptibility
+estimation and the dynamic properties of social networks, we propose a task,
+called susceptibility estimation in dynamic social networks, which is even more
+realistic and valuable in real-world applications. Susceptibility estimation in
+dynamic networks has yet to be explored, and it is computationally intractable
+to naively adopt Monte Carlo simulation to obtain the results. To this end, we
+propose a novel end-to-end framework DySuse based on dynamic graph embedding
+technology. Specifically, we leverage a structural feature module to
+independently capture the structural information of influence diffusion on each
+single graph snapshot. Besides, we propose a progressive mechanism, according
+to the property of influence diffusion, to couple the structural and temporal
+information during diffusion tightly. Moreover, a self-attention block is
+designed to further capture temporal dependency by flexibly weighting
+historical timestamps. Experimental results show that our framework is superior
+to the existing dynamic graph embedding models and has satisfactory prediction
+performance in multiple influence diffusion models.
+
+
+ comment: This paper has been published in Expert Systems With Applications +
+
+
+
+
+ + ☆ Approximately Equivariant Graph Networks + + +
+ Graph neural networks (GNNs) are commonly described as being permutation +equivariant with respect to node relabeling in the graph. This symmetry of GNNs +is often compared to the translation equivariance symmetry of Euclidean +convolution neural networks (CNNs). However, these two symmetries are +fundamentally different: The translation equivariance of CNNs corresponds to +symmetries of the fixed domain acting on the image signal (sometimes known as +active symmetries), whereas in GNNs any permutation acts on both the graph +signals and the graph domain (sometimes described as passive symmetries). In +this work, we focus on the active symmetries of GNNs, by considering a learning +setting where signals are supported on a fixed graph. In this case, the natural +symmetries of GNNs are the automorphisms of the graph. Since real-world graphs +tend to be asymmetric, we relax the notion of symmetries by formalizing +approximate symmetries via graph coarsening. We present a bias-variance formula +that quantifies the tradeoff between the loss in expressivity and the gain in +the regularity of the learned estimator, depending on the chosen symmetry +group. To illustrate our approach, we conduct extensive experiments on image +inpainting, traffic flow prediction, and human pose estimation with different +choices of symmetries. We show theoretically and empirically that the best +generalization performance can be achieved by choosing a suitably larger group +than the graph automorphism group, but smaller than the full permutation group. + +
+
+
+
+
+ + ☆ Federated Learning Robust to Byzantine Attacks: Achieving Zero + Optimality Gap + + +
+ In this paper, we propose a robust aggregation method for federated learning
+(FL) that can effectively tackle malicious Byzantine attacks. At each user, the
+model parameters are first updated for multiple local steps, the number of
+which is adjustable over iterations, and then pushed to the aggregation center
+directly. This decreases the number of interactions between the aggregation
+center and users, allows each user to set its training parameters in a flexible
+way, and reduces the computation burden compared with existing works that need
+to combine multiple historical model parameters. At the aggregation center, the
+geometric median is leveraged to combine the received model parameters from
+each user. A rigorous proof shows that zero optimality gap is achieved by our
+proposed method with linear convergence, as long as the fraction of Byzantine
+attackers is below one half. Numerical results verify the effectiveness of our
+proposed method.
+
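+ A minimal sketch of the geometric-median aggregation step, computed with the
+ standard Weiszfeld iteration; the toy updates, including one Byzantine
+ outlier, are illustrative:
+
+ import numpy as np
+
+ def geometric_median(points, n_iters=100, eps=1e-8):
+     # Weiszfeld iteration: iteratively re-weight points by their inverse
+     # distance to the current estimate of the geometric median.
+     points = np.asarray(points, dtype=float)   # shape (n_users, dim)
+     median = points.mean(axis=0)
+     for _ in range(n_iters):
+         dists = np.linalg.norm(points - median, axis=1) + eps
+         weights = 1.0 / dists
+         median = (weights[:, None] * points).sum(axis=0) / weights.sum()
+     return median
+
+ # Three honest users plus one Byzantine user sending a wild parameter vector.
+ updates = [[1.0, 1.0], [1.1, 0.9], [0.9, 1.1], [100.0, -100.0]]
+ print(geometric_median(updates))   # stays close to the honest users
+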
+
+
+
+
+ + ☆ Spatio-Temporal Adaptive Embedding Makes Vanilla Transformer SOTA for + Traffic Forecasting CIKM2023 + + +
+ With the rapid development of the Intelligent Transportation System (ITS), +accurate traffic forecasting has emerged as a critical challenge. The key +bottleneck lies in capturing the intricate spatio-temporal traffic patterns. In +recent years, numerous neural networks with complicated architectures have been +proposed to address this issue. However, the advancements in network +architectures have encountered diminishing performance gains. In this study, we +present a novel component called spatio-temporal adaptive embedding that can +yield outstanding results with vanilla transformers. Our proposed +Spatio-Temporal Adaptive Embedding transformer (STAEformer) achieves +state-of-the-art performance on five real-world traffic forecasting datasets. +Further experiments demonstrate that spatio-temporal adaptive embedding plays a +crucial role in traffic forecasting by effectively capturing intrinsic +spatio-temporal relations and chronological information in traffic time series. + +
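+ A small, hypothetical sketch of a spatio-temporal adaptive embedding: a
+ learned tensor indexed by (time step, node) that is combined with the
+ projected traffic inputs before a vanilla Transformer. The shapes are
+ illustrative, and simply adding the embedding (rather than, e.g.,
+ concatenating it with other embeddings) is an assumption of this sketch, not
+ necessarily the paper's exact design:
+
+ import torch
+ import torch.nn as nn
+
+ class SpatioTemporalAdaptiveEmbedding(nn.Module):
+     def __init__(self, num_steps, num_nodes, dim):
+         super().__init__()
+         # One learned vector per (time step, node) pair.
+         self.emb = nn.Parameter(torch.empty(num_steps, num_nodes, dim))
+         nn.init.xavier_uniform_(self.emb)
+
+     def forward(self, x):
+         # x: (batch, num_steps, num_nodes, dim)
+         return x + self.emb.unsqueeze(0)
+
+ x = torch.randn(2, 12, 207, 32)  # batch of 12-step histories over 207 sensors
+ print(SpatioTemporalAdaptiveEmbedding(12, 207, 32)(x).shape)
+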
+
+ comment: Accepted as CIKM2023 Short Paper +
+
+
+
+
+ + ☆ TokenSplit: Using Discrete Speech Representations for Direct, Refined, + and Transcript-Conditioned Speech Separation and Recognition INTERSPEECH 2023 + + +
+ We present TokenSplit, a speech separation model that acts on discrete token +sequences. The model is trained on multiple tasks simultaneously: separate and +transcribe each speech source, and generate speech from text. The model +operates on transcripts and audio token sequences and achieves multiple tasks +through masking of inputs. The model is a sequence-to-sequence encoder-decoder +model that uses the Transformer architecture. We also present a "refinement" +version of the model that predicts enhanced audio tokens from the audio tokens +of speech separated by a conventional separation model. Using both objective +metrics and subjective MUSHRA listening tests, we show that our model achieves +excellent performance in terms of separation, both with or without transcript +conditioning. We also measure the automatic speech recognition (ASR) +performance and provide audio samples of speech synthesis to demonstrate the +additional utility of our model. + +
+
+ comment: INTERSPEECH 2023, project webpage with audio demos at + https://google-research.github.io/sound-separation/papers/tokensplit +
+
+
+
+
+ + ☆ Federated Learning for Connected and Automated Vehicles: A Survey of + Existing Approaches and Challenges + + +
+ Machine learning (ML) is widely used for key tasks in Connected and Automated +Vehicles (CAV), including perception, planning, and control. However, its +reliance on vehicular data for model training presents significant challenges +related to in-vehicle user privacy and communication overhead generated by +massive data volumes. Federated learning (FL) is a decentralized ML approach +that enables multiple vehicles to collaboratively develop models, broadening +learning from various driving environments, enhancing overall performance, and +simultaneously securing local vehicle data privacy and security. This survey +paper presents a review of the advancements made in the application of FL for +CAV (FL4CAV). First, centralized and decentralized frameworks of FL are +analyzed, highlighting their key characteristics and methodologies. Second, +diverse data sources, models, and data security techniques relevant to FL in +CAVs are reviewed, emphasizing their significance in ensuring privacy and +confidentiality. Third, specific and important applications of FL are explored, +providing insight into the base models and datasets employed for each +application. Finally, existing challenges for FL4CAV are listed and potential +directions for future work are discussed to further enhance the effectiveness +and efficiency of FL in the context of CAV. + +
+
+
+
+
+ + ☆ Label Selection Approach to Learning from Crowds + + +
+ Supervised learning, especially supervised deep learning, requires large +amounts of labeled data. One approach to collect large amounts of labeled data +is by using a crowdsourcing platform where numerous workers perform the +annotation tasks. However, the annotation results often contain label noise, as +the annotation skills vary depending on the crowd workers and their ability to +complete the task correctly. Learning from Crowds is a framework which directly +trains the models using noisy labeled data from crowd workers. In this study, +we propose a novel Learning from Crowds model, inspired by SelectiveNet +proposed for the selective prediction problem. The proposed method called Label +Selection Layer trains a prediction model by automatically determining whether +to use a worker's label for training using a selector network. A major +advantage of the proposed method is that it can be applied to almost all +variants of supervised learning problems by simply adding a selector network +and changing the objective function for existing models, without explicitly +assuming a model of the noise in crowd annotations. The experimental results +show that the performance of the proposed method is almost equivalent to or +better than the Crowd Layer, which is one of the state-of-the-art methods for +Deep Learning from Crowds, except for the regression problem case. + +
+
+ comment: 15 pages, 1 figure +
+
+
+
+
+ + ☆ Stress representations for tensor basis neural networks: alternative + formulations to Finger-Rivlin-Ericksen + + +
+ Data-driven constitutive modeling frameworks based on neural networks and +classical representation theorems have recently gained considerable attention +due to their ability to easily incorporate constitutive constraints and their +excellent generalization performance. In these models, the stress prediction +follows from a linear combination of invariant-dependent coefficient functions +and known tensor basis generators. However, thus far the formulations have been +limited to stress representations based on the classical Rivlin and Ericksen +form, while the performance of alternative representations has yet to be +investigated. In this work, we survey a variety of tensor basis neural network +models for modeling hyperelastic materials in a finite deformation context, +including a number of so far unexplored formulations which use theoretically +equivalent invariants and generators to Finger-Rivlin-Ericksen. Furthermore, we +compare potential-based and coefficient-based approaches, as well as different +calibration techniques. Nine variants are tested against both noisy and +noiseless datasets for three different materials. Theoretical and practical +insights into the performance of each formulation are given. + +
+
+ comment: 32 pages, 20 figures, 4 appendices +
+
+
+
+
+ + ☆ Long-Term Prediction of Natural Video Sequences with Robust Video + Predictors + + +
+ Predicting high dimensional video sequences is a curiously difficult problem. +The number of possible futures for a given video sequence grows exponentially +over time due to uncertainty. This is especially evident when trying to predict +complicated natural video scenes from a limited snapshot of the world. The +inherent uncertainty accumulates the further into the future you predict making +long-term prediction very difficult. In this work we introduce a number of +improvements to existing work that aid in creating Robust Video Predictors +(RoViPs). We show that with a combination of deep Perceptual and +uncertainty-based reconstruction losses we are able to create high quality +short-term predictions. Attention-based skip connections are utilised to allow +for long range spatial movement of input features to further improve +performance. Finally, we show that by simply making the predictor robust to its +own prediction errors, it is possible to produce very long, realistic natural +video sequences using an iterated single-step prediction task. + +
+
+
+
+
+ + ☆ A Deep Dive into the Connections Between the Renormalization Group and + Deep Learning in the Ising Model + + +
+ The renormalization group (RG) is an essential technique in statistical +physics and quantum field theory, which considers scale-invariant properties of +physical theories and how these theories' parameters change with scaling. Deep +learning is a powerful computational technique that uses multi-layered neural +networks to solve a myriad of complicated problems. Previous research suggests +the possibility that unsupervised deep learning may be a form of RG flow, by +being a layer-by-layer coarse graining of the original data. We examined this +connection on a more rigorous basis for the simple example of Kadanoff block +renormalization of the 2D nearest-neighbor Ising model, with our deep learning +accomplished via Restricted Boltzmann Machines (RBMs). We developed extensive +renormalization techniques for the 1D and 2D Ising model to provide a baseline +for comparison. For the 1D Ising model, we successfully used Adam optimization +on a correlation length loss function to learn the group flow, yielding results +consistent with the analytical model for infinite N. For the 2D Ising model, we +successfully generated Ising model samples using the Wolff algorithm, and +performed the group flow using a quasi-deterministic method, validating these +results by calculating the critical exponent \nu. We then examined RBM learning +of the Ising model layer by layer, finding a blocking structure in the learning +that is qualitatively similar to RG. Lastly, we directly compared the weights +of each layer from the learning to Ising spin renormalization, but found +quantitative inconsistencies for the simple case of nearest-neighbor Ising +models. + +
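+ A minimal sketch of one Kadanoff block-renormalization (majority-rule
+ block-spin) step on a 2D Ising configuration, the coarse-graining referred to
+ above; the lattice size and the tie-breaking rule are illustrative:
+
+ import numpy as np
+
+ def block_spin(config, b=2):
+     # Partition the lattice into b x b blocks and replace each block by the
+     # sign of its summed spins (majority rule; ties broken towards +1).
+     L = config.shape[0]
+     blocks = config.reshape(L // b, b, L // b, b).sum(axis=(1, 3))
+     return np.where(blocks >= 0, 1, -1)
+
+ rng = np.random.default_rng(0)
+ spins = rng.choice([-1, 1], size=(8, 8))
+ print(block_spin(spins))   # an 8x8 configuration coarse-grained to 4x4
+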
+
+ comment: 103 pages, 87 figures, Senior Thesis, Advisors: Maria Spiropulu and + Joseph Lykken +
+
+
+
+
+ + ☆ Neural Amortized Inference for Nested Multi-agent Reasoning + + +
+ Multi-agent interactions, such as communication, teaching, and bluffing, +often rely on higher-order social inference, i.e., understanding how others +infer oneself. Such intricate reasoning can be effectively modeled through +nested multi-agent reasoning. Nonetheless, the computational complexity +escalates exponentially with each level of reasoning, posing a significant +challenge. However, humans effortlessly perform complex social inferences as +part of their daily lives. To bridge the gap between human-like inference +capabilities and computational limitations, we propose a novel approach: +leveraging neural networks to amortize high-order social inference, thereby +expediting nested multi-agent reasoning. We evaluate our method in two +challenging multi-agent interaction domains. The experimental results +demonstrate that our method is computationally efficient while exhibiting +minimal degradation in accuracy. + +
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ☆ Topological Graph Signal Compression + + +
+ Recently emerged Topological Deep Learning (TDL) methods aim to extend
+ current Graph Neural Networks (GNNs) by naturally processing higher-order
+ interactions, going beyond the pairwise relations and local neighborhoods
+ defined by graph representations. In this paper, we propose a novel TDL-based
+ method for compressing signals over graphs, consisting of two main steps:
+ first, disjoint sets of higher-order structures are inferred based on the
+ original signal --by clustering $N$ datapoints into $K\ll N$ collections; then,
+ a topologically-inspired message passing step obtains a compressed
+ representation of the signal within those multi-element sets. Our results show
+ that our framework improves both standard GNN and feed-forward architectures in
+ compressing temporal link-based signals from two real-world Internet Service
+ Provider Networks' datasets --with $30\%$ to $90\%$ lower reconstruction errors
+ across all evaluation scenarios--, suggesting that it better captures and
+ exploits spatial and temporal correlations over the whole graph-based network
+ structure.
+
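+ The two-step idea (infer $K\ll N$ higher-order sets by clustering, then keep a
+ compressed representation per set) can be illustrated with a deliberately
+ simple stand-in in which k-means and mean pooling replace the inference and
+ topological message-passing steps; everything below is an assumption made for
+ illustration only.
+
+     # Cluster node signals into K sets, keep one value per set, reconstruct.
+     import numpy as np
+     from sklearn.cluster import KMeans
+
+     rng = np.random.default_rng(0)
+     x = rng.normal(size=(1000, 1))                  # N one-dimensional node signals
+     K = 32
+
+     labels = KMeans(n_clusters=K, n_init=10, random_state=0).fit_predict(x)
+     compressed = np.array([x[labels == k].mean() for k in range(K)])   # K values
+
+     x_hat = compressed[labels][:, None]             # broadcast back to the N nodes
+     rel_err = np.linalg.norm(x - x_hat) / np.linalg.norm(x)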
+
+ comment: 9 pages, 5 figures, 2 tables +
+
+
+
+
+ + ☆ UnLoc: A Unified Framework for Video Localization Tasks ICCV 2023 + + +
+ While large-scale image-text pretrained models such as CLIP have been used
+ for multiple video-level tasks on trimmed videos, their use for temporal
+ localization in untrimmed videos is still a relatively unexplored task. We
+ design a new approach for this called UnLoc, which uses pretrained image and
+ text towers, and feeds tokens to a video-text fusion model. The outputs of the
+ fusion module are then used to construct a feature pyramid in which each level
+ connects to a head to predict a per-frame relevancy score and start/end time
+ displacements. Unlike previous works, our architecture enables Moment
+ Retrieval, Temporal Localization, and Action Segmentation with a single-stage
+ model, without the need for action proposals, motion-based pretrained features
+ or representation masking. Unlike specialized models, we achieve
+ state-of-the-art results on all three different localization tasks with a
+ unified approach. Code will be available at:
+ \url{https://github.com/google-research/scenic}.
+
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Ultra Dual-Path Compression For Joint Echo Cancellation And Noise + Suppression + + +
+ Echo cancellation and noise reduction are essential for full-duplex
+ communication, yet most existing neural networks have high computational costs
+ and are inflexible in tuning model complexity. In this paper, we introduce
+ time-frequency dual-path compression to achieve a wide range of compression
+ ratios on computational cost. Specifically, for frequency compression,
+ trainable filters are used to replace manually designed filters for dimension
+ reduction. For time compression, using frame-skipped prediction alone causes
+ large performance degradation, which can be alleviated by a post-processing
+ network with full sequence modeling. We find that, under fixed compression
+ ratios, dual-path compression combining both the time and frequency methods
+ gives further performance improvement, covering compression ratios from 4x to
+ 32x with little model size change. Moreover, the proposed models show
+ competitive performance compared with fast FullSubNet and DeepFilterNet. A demo
+ page can be found at
+ hangtingchen.github.io/ultra_dual_path_compression.github.io/.
+
+
+ comment: Accepted by Interspeech 2023 +
+
+
+
+
+ + ☆ Harmonization Across Imaging Locations (HAIL): One-Shot Learning for + Brain MRI + + +
+ For machine learning-based prognosis and diagnosis of rare diseases, such as +pediatric brain tumors, it is necessary to gather medical imaging data from +multiple clinical sites that may use different devices and protocols. Deep +learning-driven harmonization of radiologic images relies on generative +adversarial networks (GANs). However, GANs notoriously generate pseudo +structures that do not exist in the original training data, a phenomenon known +as "hallucination". To prevent hallucination in medical imaging, such as +magnetic resonance images (MRI) of the brain, we propose a one-shot learning +method where we utilize neural style transfer for harmonization. At test time, +the method uses one image from a clinical site to generate an image that +matches the intensity scale of the collaborating sites. Our approach combines +learning a feature extractor, neural style transfer, and adaptive instance +normalization. We further propose a novel strategy to evaluate the +effectiveness of image harmonization approaches with evaluation metrics that +both measure image style harmonization and assess the preservation of +anatomical structures. Experimental results demonstrate the effectiveness of +our method in preserving patient anatomy while adjusting the image intensities +to a new clinical site. Our general harmonization model can be used on unseen +data from new sites, making it a valuable tool for real-world medical +applications and clinical trials. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Spurious Correlations and Where to Find Them SC + + +
+ Spurious correlations occur when a model learns unreliable features from the
+ data and are a well-known drawback of data-driven learning. Although several
+ algorithms have been proposed to mitigate them, we have yet to jointly derive
+ the indicators of spurious correlations. As a result, the solutions built upon
+ standalone hypotheses fail to beat simple ERM baselines. We collect some of the
+ commonly studied hypotheses behind the occurrence of spurious correlations and
+ investigate their influence on standard ERM baselines using synthetic datasets
+ generated from causal graphs. Subsequently, we observe patterns connecting
+ these hypotheses and model design choices.
+
+
+ comment: 2nd Workshop on SCIS, ICML 2023 +
+
+
+
+
+ + ☆ Split Learning for Distributed Collaborative Training of Deep Learning + Models in Health Informatics + + +
+ Deep learning continues to rapidly evolve and is now demonstrating remarkable +potential for numerous medical prediction tasks. However, realizing deep +learning models that generalize across healthcare organizations is challenging. +This is due, in part, to the inherent siloed nature of these organizations and +patient privacy requirements. To address this problem, we illustrate how split +learning can enable collaborative training of deep learning models across +disparate and privately maintained health datasets, while keeping the original +records and model parameters private. We introduce a new privacy-preserving +distributed learning framework that offers a higher level of privacy compared +to conventional federated learning. We use several biomedical imaging and +electronic health record (EHR) datasets to show that deep learning models +trained via split learning can achieve highly similar performance to their +centralized and federated counterparts while greatly improving computational +efficiency and reducing privacy risks. + +
+
+
+
+
+ + ☆ Extreme Multilabel Classification for Specialist Doctor Recommendation + with Implicit Feedback and Limited Patient Metadata + + +
+ Recommendation Systems (RS) are often used to address the issue of medical +doctor referrals. However, these systems require access to patient feedback and +medical records, which may not always be available in real-world scenarios. Our +research focuses on medical referrals and aims to predict recommendations in +different specialties of physicians for both new patients and those with a +consultation history. We use Extreme Multilabel Classification (XML), commonly +employed in text-based classification tasks, to encode available features and +explore different scenarios. While its potential for recommendation tasks has +often been suggested, this has not been thoroughly explored in the literature. +Motivated by the doctor referral case, we show how to recast a traditional +recommender setting into a multilabel classification problem that current XML +methods can solve. Further, we propose a unified model leveraging patient +history across different specialties. Compared to state-of-the-art RS using the +same features, our approach consistently improves standard recommendation +metrics up to approximately $10\%$ for patients with a previous consultation +history. For new patients, XML proves better at exploiting available features, +outperforming the benchmark in favorable scenarios, with particular emphasis on +recall metrics. Thus, our approach brings us one step closer to creating more +effective and personalized doctor referral systems. Additionally, it highlights +XML as a promising alternative to current hybrid or content-based RS, while +identifying key aspects to take into account when using XML for recommendation +tasks. + +
+
+
+
+
+ + ☆ Multi-Task Hypergraphs for Semi-supervised Learning using Earth + Observations ICCV 2023 + + +
+ There are many ways of interpreting the world and they are highly +interdependent. We exploit such complex dependencies and introduce a powerful +multi-task hypergraph, in which every node is a task and different paths +through the hypergraph reaching a given task become unsupervised teachers, by +forming ensembles that learn to generate reliable pseudolabels for that task. +Each hyperedge is part of an ensemble teacher for a given task and it is also a +student of the self-supervised hypergraph system. We apply our model to one of +the most important problems of our times, that of Earth Observation, which is +highly multi-task and it often suffers from missing ground-truth data. By +performing extensive experiments on the NASA NEO Dataset, spanning a period of +22 years, we demonstrate the value of our multi-task semi-supervised approach, +by consistent improvements over strong baselines and recent work. We also show +that the hypergraph can adapt unsupervised to gradual data distribution shifts +and reliably recover, through its multi-task self-supervision process, the +missing data for several observational layers for up to seven years. + +
+
+ comment: Accepted in ICCV 2023 Workshops +
+
+
+
+
+ + ☆ Instance-based Learning with Prototype Reduction for Real-Time + Proportional Myocontrol: A Randomized User Study Demonstrating + Accuracy-preserving Data Reduction for Prosthetic Embedded Systems + + +
+ This work presents the design, implementation and validation of learning +techniques based on the kNN scheme for gesture detection in prosthetic control. +To cope with high computational demands in instance-based prediction, methods +of dataset reduction are evaluated considering real-time determinism to allow +for the reliable integration into battery-powered portable devices. The +influence of parameterization and varying proportionality schemes is analyzed, +utilizing an eight-channel-sEMG armband. Besides offline cross-validation +accuracy, success rates in real-time pilot experiments (online target +achievement tests) are determined. Based on the assessment of specific dataset +reduction techniques' adequacy for embedded control applications regarding +accuracy and timing behaviour, Decision Surface Mapping (DSM) proves itself +promising when applying kNN on the reduced set. A randomized, double-blind user +study was conducted to evaluate the respective methods (kNN and kNN with +DSM-reduction) against Ridge Regression (RR) and RR with Random Fourier +Features (RR-RFF). The kNN-based methods performed significantly better +(p<0.0005) than the regression techniques. Between DSM-kNN and kNN, there was +no statistically significant difference (significance level 0.05). This is +remarkable in consideration of only one sample per class in the reduced set, +thus yielding a reduction rate of over 99% while preserving success rate. The +same behaviour could be confirmed in an extended user study. With k=1, which +turned out to be an excellent choice, the runtime complexity of both kNN (in +every prediction step) as well as DSM-kNN (in the training phase) becomes +linear concerning the number of original samples, favouring dependable wearable +prosthesis applications. + +
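+ A minimal sketch of the prediction step in the reduced-prototype regime
+ described above (one retained sample per gesture class, k = 1); the number of
+ classes, the 8-channel feature vector and the prototype values are
+ placeholders rather than the study's data or its DSM reduction step.
+
+     # 1-NN gesture prediction against one prototype per class.
+     import numpy as np
+
+     prototypes = np.random.default_rng(0).normal(size=(6, 8))   # 6 classes x 8 sEMG channels
+     class_ids = np.arange(6)
+
+     def predict(window_features: np.ndarray) -> int:
+         """Return the class of the nearest prototype (k = 1)."""
+         d = np.linalg.norm(prototypes - window_features, axis=1)
+         return int(class_ids[np.argmin(d)])
+
+     gesture = predict(np.zeros(8))      # e.g. features from one sEMG time window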
+
+
+
+
+ + ☆ Personalized Event Prediction for Electronic Health Records + + +
+ Clinical event sequences consist of hundreds of clinical events that
+ represent records of patient care in time. Developing accurate predictive
+ models of such sequences is of great importance for supporting a variety of
+ models for interpreting/classifying the current patient condition, or
+ predicting adverse clinical events and outcomes, all aimed at improving patient
+ care. One important challenge of learning predictive models of clinical
+ sequences is their patient-specific variability. Based on underlying clinical
+ conditions, each patient's sequence may consist of different sets of clinical
+ events (observations, lab results, medications, procedures). Hence, simple
+ population-wide models learned from event sequences for many different patients
+ may not accurately predict patient-specific dynamics of event sequences and
+ their differences. To address the problem, we propose and investigate multiple
+ new event sequence prediction models and methods that let us better adjust the
+ prediction for individual patients and their specific conditions. The methods
+ developed in this work pursue refinement of population-wide models to
+ subpopulations, self-adaptation, and a meta-level model switching that is able
+ to adaptively select the model with the best chance to support the immediate
+ prediction. We analyze and test the performance of these models on clinical
+ event sequences of patients in the MIMIC-III database.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2104.01787 +
+
+
+
+
+ + ☆ Using language models in the implicit automated assessment of + mathematical short answer items + + +
+ We propose a new way to assess certain short constructed responses to +mathematics items. Our approach uses a pipeline that identifies the key values +specified by the student in their response. This allows us to determine the +correctness of the response, as well as identify any misconceptions. The +information from the value identification pipeline can then be used to provide +feedback to the teacher and student. The value identification pipeline consists +of two fine-tuned language models. The first model determines if a value is +implicit in the student response. The second model identifies where in the +response the key value is specified. We consider both a generic model that can +be used for any prompt and value, as well as models that are specific to each +prompt and value. The value identification pipeline is a more accurate and +informative way to assess short constructed responses than traditional +rubric-based scoring. It can be used to provide more targeted feedback to +students, which can help them improve their understanding of mathematics. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Autonomous Detection of Methane Emissions in Multispectral Satellite + Data Using Deep Learning + + +
+ Methane is one of the most potent greenhouse gases, and its short atmospheric
+ half-life makes it a prime target to rapidly curb global warming. However,
+ current methane emission monitoring techniques primarily rely on approximate
+ emission factors or self-reporting, which have been shown to often dramatically
+ underestimate emissions. Although initially designed to monitor surface
+ properties, satellite multispectral data has recently emerged as a powerful
+ method to analyze atmospheric content. However, the spectral resolution of
+ multispectral instruments is poor, and methane measurements are typically very
+ noisy. Methane data products are also sensitive to absorption by the surface
+ and other atmospheric gases (water vapor in particular) and therefore provide
+ noisy maps of potential methane plumes that typically require extensive human
+ analysis. Here, we show that the image recognition capabilities of deep
+ learning methods can be leveraged to automate the detection of methane leaks in
+ Sentinel-2 satellite multispectral data, with dramatically reduced false
+ positive rates compared with state-of-the-art multispectral methane data
+ products, and without the need for a priori knowledge of potential leak sites.
+ Our proposed approach paves the way for the automated, high-definition and
+ high-frequency monitoring of point-source methane emissions across the world.
+
+
+
+
+
+ + ☆ SupEuclid: Extremely Simple, High Quality OoD Detection with Supervised + Contrastive Learning and Euclidean Distance + + +
+ Out-of-Distribution (OoD) detection has developed substantially in the past +few years, with available methods approaching, and in a few cases achieving, +perfect data separation on standard benchmarks. These results generally involve +large or complex models, pretraining, exposure to OoD examples or extra +hyperparameter tuning. Remarkably, it is possible to achieve results that can +exceed many of these state-of-the-art methods with a very simple method. We +demonstrate that ResNet18 trained with Supervised Contrastive Learning (SCL) +produces state-of-the-art results out-of-the-box on near and far OoD detection +benchmarks using only Euclidean distance as a scoring rule. This may obviate +the need in some cases for more sophisticated methods or larger models, and at +the very least provides a very strong, easy to use baseline for further +experimentation and analysis. + +
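+ The scoring rule itself is simple enough to sketch directly: embed the
+ in-distribution training set with the SCL-trained encoder, store per-class
+ means, and score a test point by its Euclidean distance to the nearest mean
+ (larger distance means more likely OoD). The encoder is replaced below by
+ random stand-in embeddings; only the distance-based scoring is illustrated.
+
+     # Nearest-class-mean Euclidean scoring for OoD detection.
+     import numpy as np
+
+     def class_means(features: np.ndarray, labels: np.ndarray) -> np.ndarray:
+         return np.stack([features[labels == c].mean(axis=0)
+                          for c in np.unique(labels)])
+
+     def ood_score(z: np.ndarray, means: np.ndarray) -> float:
+         return float(np.min(np.linalg.norm(means - z, axis=1)))
+
+     rng = np.random.default_rng(0)
+     train_z = rng.normal(size=(500, 128))           # stand-in SCL embeddings
+     train_y = rng.integers(0, 10, size=500)
+     means = class_means(train_z, train_y)
+     score = ood_score(rng.normal(size=128), means)  # threshold to flag OoD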
+
+
+
+
+ + ☆ MRI Field-transfer Reconstruction with Limited Data: Regularization by + Neural Style Transfer + + +
+ Recent works have demonstrated success in MRI reconstruction using deep +learning-based models. However, most reported approaches require training on a +task-specific, large-scale dataset. Regularization by denoising (RED) is a +general pipeline which embeds a denoiser as a prior for image reconstruction. +The potential of RED has been demonstrated for multiple image-related tasks +such as denoising, deblurring and super-resolution. In this work, we propose a +regularization by neural style transfer (RNST) method to further leverage the +priors from the neural transfer and denoising engine. This enables RNST to +reconstruct a high-quality image from a noisy low-quality image with different +image styles and limited data. We validate RNST with clinical MRI scans from +1.5T and 3T and show that RNST can significantly boost image quality. Our +results highlight the capability of the RNST framework for MRI reconstruction +and the potential for reconstruction tasks with limited data. + +
+
+ comment: 30 pages, 8 figures, 2 tables, 1 algorithm chart +
+
+
+
+
+ + ♻ ☆ HLSDataset: Open-Source Dataset for ML-Assisted FPGA Design using High + Level Synthesis + + +
+ Machine Learning (ML) has been widely adopted in design exploration using
+ high-level synthesis (HLS) to provide better and faster performance, resource,
+ and power estimation at very early stages of FPGA-based design. To perform
+ prediction accurately, high-quality and large-volume datasets are required for
+ training ML models. This paper presents a dataset for ML-assisted FPGA design
+ using HLS, called HLSDataset. The dataset is generated from widely used HLS C
+ benchmarks including Polybench, MachSuite, CHStone and Rosetta. The Verilog
+ samples are generated with a variety of directives including loop unroll, loop
+ pipeline and array partition to make sure optimized and realistic designs are
+ covered. The total number of generated Verilog samples is nearly 9,000 per FPGA
+ type. To demonstrate the effectiveness of our dataset, we undertake case
+ studies to perform power estimation and resource usage estimation with ML
+ models trained with our dataset. All the codes and dataset are public at the
+ GitHub repo. We believe that HLSDataset can save valuable time for researchers
+ by avoiding the tedious process of running tools, scripting and parsing files
+ to generate the dataset, and enable them to spend more time where it counts,
+ that is, in training ML models.
+
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Bias and Extrapolation in Markovian Linear Stochastic Approximation with + Constant Stepsizes + + +
+ We consider Linear Stochastic Approximation (LSA) with a constant stepsize +and Markovian data. Viewing the joint process of the data and LSA iterate as a +time-homogeneous Markov chain, we prove its convergence to a unique limiting +and stationary distribution in Wasserstein distance and establish +non-asymptotic, geometric convergence rates. Furthermore, we show that the bias +vector of this limit admits an infinite series expansion with respect to the +stepsize. Consequently, the bias is proportional to the stepsize up to higher +order terms. This result stands in contrast with LSA under i.i.d. data, for +which the bias vanishes. In the reversible chain setting, we provide a general +characterization of the relationship between the bias and the mixing time of +the Markovian data, establishing that they are roughly proportional to each +other. + While Polyak-Ruppert tail-averaging reduces the variance of the LSA iterates, +it does not affect the bias. The above characterization allows us to show that +the bias can be reduced using Richardson-Romberg extrapolation with $m\ge 2$ +stepsizes, which eliminates the $m-1$ leading terms in the bias expansion. This +extrapolation scheme leads to an exponentially smaller bias and an improved +mean squared error, both in theory and empirically. Our results immediately +apply to the Temporal Difference learning algorithm with linear function +approximation, Markovian data, and constant stepsizes. + +
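+ As a worked illustration of the $m=2$ case (with notation assumed here rather
+ than taken from the paper): if the limiting average satisfies
+ $\bar{\theta}(\alpha) = \theta^* + \alpha b_1 + O(\alpha^2)$, then running LSA
+ with stepsizes $\alpha$ and $2\alpha$ and forming the Richardson-Romberg
+ combination $2\bar{\theta}(\alpha) - \bar{\theta}(2\alpha)$ cancels the leading
+ $O(\alpha)$ term, leaving an estimate equal to $\theta^* + O(\alpha^2)$.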
+
+ comment: SIGMETRICS 2023 +
+
+
+
+
+ + ♻ ☆ Low-Variance Forward Gradients using Direct Feedback Alignment and + Momentum + + +
+ Supervised learning in deep neural networks is commonly performed using error +backpropagation. However, the sequential propagation of errors during the +backward pass limits its scalability and applicability to low-powered +neuromorphic hardware. Therefore, there is growing interest in finding local +alternatives to backpropagation. Recently proposed methods based on +forward-mode automatic differentiation suffer from high variance in large deep +neural networks, which affects convergence. In this paper, we propose the +Forward Direct Feedback Alignment algorithm that combines Activity-Perturbed +Forward Gradients with Direct Feedback Alignment and momentum. We provide both +theoretical proofs and empirical evidence that our proposed method achieves +lower variance than forward gradient techniques. In this way, our approach +enables faster convergence and better performance when compared to other local +alternatives to backpropagation and opens a new perspective for the development +of online learning algorithms compatible with neuromorphic systems. + +
+
+
+
+
+ + ♻ ☆ A deep complementary energy method for solid mechanics using minimum + complementary energy principle + + +
+ In recent years, the rapid advancement of deep learning has significantly
+ impacted various fields, particularly in solving partial differential equations
+ (PDEs) in the realm of solid mechanics, benefiting greatly from the remarkable
+ approximation capabilities of neural networks. In solving PDEs,
+ Physics-Informed Neural Networks (PINNs) and the Deep Energy Method (DEM) have
+ garnered substantial attention. The principles of minimum potential energy and
+ minimum complementary energy are two important variational principles in solid
+ mechanics. However, the well-known DEM is based on the principle of minimum
+ potential energy, and a counterpart based on the principle of minimum
+ complementary energy has so far been lacking. To bridge this gap, we propose
+ the deep complementary energy method (DCEM) based on the principle of minimum
+ complementary energy. The output function of DCEM is the stress function, which
+ inherently satisfies the equilibrium equation. We present numerical results
+ using the Prandtl and Airy stress functions, and compare DCEM with existing
+ PINNs and DEM algorithms when modeling representative mechanical problems. The
+ results demonstrate that DCEM outperforms DEM in terms of stress accuracy and
+ efficiency and has an advantage in dealing with complex displacement boundary
+ conditions, which is supported by theoretical analyses and numerical
+ simulations. We extend DCEM to DCEM-Plus (DCEM-P), adding terms that satisfy
+ partial differential equations. Furthermore, we propose a deep complementary
+ energy operator method (DCEM-O) by combining operator learning with physical
+ equations. Initially, we train DCEM-O using high-fidelity numerical results and
+ then incorporate complementary energy. DCEM-P and DCEM-O further enhance the
+ accuracy and efficiency of DCEM.
+
+
+ comment: 58 pages, 30 figures +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Black-Box Optimization + + +
+ The goal of offline black-box optimization (BBO) is to optimize an expensive +black-box function using a fixed dataset of function evaluations. Prior works +consider forward approaches that learn surrogates to the black-box function and +inverse approaches that directly map function values to corresponding points in +the input domain of the black-box function. These approaches are limited by the +quality of the offline dataset and the difficulty in learning one-to-many +mappings in high dimensions, respectively. We propose Denoising Diffusion +Optimization Models (DDOM), a new inverse approach for offline black-box +optimization based on diffusion models. Given an offline dataset, DDOM learns a +conditional generative model over the domain of the black-box function +conditioned on the function values. We investigate several design choices in +DDOM, such as re-weighting the dataset to focus on high function values and the +use of classifier-free guidance at test-time to enable generalization to +function values that can even exceed the dataset maxima. Empirically, we +conduct experiments on the Design-Bench benchmark and show that DDOM achieves +results competitive with state-of-the-art baselines. + +
+
+ comment: International Conference on Machine Learning 2023 +
+
+
+
+
+ + ♻ ☆ Generative Pretraining for Black-Box Optimization NeurIPS + + +
+ Many problems in science and engineering involve optimizing an expensive +black-box function over a high-dimensional space. For such black-box +optimization (BBO) problems, we typically assume a small budget for online +function evaluations, but also often have access to a fixed, offline dataset +for pretraining. Prior approaches seek to utilize the offline data to +approximate the function or its inverse but are not sufficiently accurate far +from the data distribution. We propose BONET, a generative framework for +pretraining a novel black-box optimizer using offline datasets. In BONET, we +train an autoregressive model on fixed-length trajectories derived from an +offline dataset. We design a sampling strategy to synthesize trajectories from +offline data using a simple heuristic of rolling out monotonic transitions from +low-fidelity to high-fidelity samples. Empirically, we instantiate BONET using +a causally masked Transformer and evaluate it on Design-Bench, where we rank +the best on average, outperforming state-of-the-art baselines. + +
+
+ comment: International Conference for Machine Learning 2023 NeurIPS Workshop + for Foundational Models for Decision Making (Oral) 2022 +
+
+
+
+
+ + ♻ ☆ TACOformer: Token-channel compounded Cross Attention for Multimodal + Emotion Recognition IJCAI 2023 + + +
+ Recently, emotion recognition based on physiological signals has emerged as a +field with intensive research. The utilization of multi-modal, multi-channel +physiological signals has significantly improved the performance of emotion +recognition systems, due to their complementarity. However, effectively +integrating emotion-related semantic information from different modalities and +capturing inter-modal dependencies remains a challenging issue. Many existing +multimodal fusion methods ignore either token-to-token or channel-to-channel +correlations of multichannel signals from different modalities, which limits +the classification capability of the models to some extent. In this paper, we +propose a comprehensive perspective of multimodal fusion that integrates +channel-level and token-level cross-modal interactions. Specifically, we +introduce a unified cross attention module called Token-chAnnel COmpound (TACO) +Cross Attention to perform multimodal fusion, which simultaneously models +channel-level and token-level dependencies between modalities. Additionally, we +propose a 2D position encoding method to preserve information about the spatial +distribution of EEG signal channels, then we use two transformer encoders ahead +of the fusion module to capture long-term temporal dependencies from the EEG +signal and the peripheral physiological signal, respectively. +Subject-independent experiments on emotional dataset DEAP and Dreamer +demonstrate that the proposed model achieves state-of-the-art performance. + +
+
+ comment: Accepted by IJCAI 2023- AI4TS workshop +
+
+
+
+
+ + ♻ ☆ Polynomial-Time Algorithms for Counting and Sampling Markov Equivalent + DAGs with Applications + + +
+ Counting and sampling directed acyclic graphs from a Markov equivalence class +are fundamental tasks in graphical causal analysis. In this paper we show that +these tasks can be performed in polynomial time, solving a long-standing open +problem in this area. Our algorithms are effective and easily implementable. As +we show in experiments, these breakthroughs make thought-to-be-infeasible +strategies in active learning of causal structures and causal effect +identification with regard to a Markov equivalence class practically +applicable. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2012.09679 +
+
+
+
+
+ + ♻ ☆ Adaptive SGD with Polyak stepsize and Line-search: Robust Convergence + and Variance Reduction + + +
+ The recently proposed stochastic Polyak stepsize (SPS) and stochastic +line-search (SLS) for SGD have shown remarkable effectiveness when training +over-parameterized models. However, in non-interpolation settings, both +algorithms only guarantee convergence to a neighborhood of a solution which may +result in a worse output than the initial guess. While artificially decreasing +the adaptive stepsize has been proposed to address this issue (Orvieto et al. +[2022]), this approach results in slower convergence rates for convex and +over-parameterized models. In this work, we make two contributions: Firstly, we +propose two new variants of SPS and SLS, called AdaSPS and AdaSLS, which +guarantee convergence in non-interpolation settings and maintain sub-linear and +linear convergence rates for convex and strongly convex functions when training +over-parameterized models. AdaSLS requires no knowledge of problem-dependent +parameters, and AdaSPS requires only a lower bound of the optimal function +value as input. Secondly, we equip AdaSPS and AdaSLS with a novel variance +reduction technique and obtain algorithms that require +$\smash{\widetilde{\mathcal{O}}}(n+1/\epsilon)$ gradient evaluations to achieve +an $\mathcal{O}(\epsilon)$-suboptimality for convex functions, which improves +upon the slower $\mathcal{O}(1/\epsilon^2)$ rates of AdaSPS and AdaSLS without +variance reduction in the non-interpolation regimes. Moreover, our result +matches the fast rates of AdaSVRG but removes the inner-outer-loop structure, +which is easier to implement and analyze. Finally, numerical experiments on +synthetic and real datasets validate our theory and demonstrate the +effectiveness and robustness of our algorithms. + +
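+ For concreteness, the vanilla stochastic Polyak stepsize that AdaSPS builds on
+ can be sketched on a toy interpolating least-squares problem, where the
+ per-sample lower bound $f_i^*=0$ is exact; the constant $c$, the problem sizes
+ and the data are illustrative assumptions, and AdaSPS/AdaSLS themselves are
+ not reproduced.
+
+     # SGD with the stochastic Polyak stepsize on a consistent linear system.
+     import numpy as np
+
+     rng = np.random.default_rng(0)
+     A = rng.normal(size=(200, 10))
+     x_true = rng.normal(size=10)
+     b = A @ x_true                                  # interpolation holds: f_i^* = 0
+
+     x, c = np.zeros(10), 0.5
+     for t in range(2000):
+         i = rng.integers(len(b))
+         resid = A[i] @ x - b[i]
+         f_i = 0.5 * resid ** 2                      # per-sample loss
+         g = resid * A[i]                            # its gradient at x
+         x -= f_i / (c * (g @ g) + 1e-12) * g        # Polyak stepsize update
+
+     err = np.linalg.norm(x - x_true)                # should be close to 0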
+
+
+
+
+ + ♻ ☆ Rethinking Data Distillation: Do Not Overlook Calibration ICCV 2023 + + +
+ Neural networks trained on distilled data often produce over-confident output +and require correction by calibration methods. Existing calibration methods +such as temperature scaling and mixup work well for networks trained on +original large-scale data. However, we find that these methods fail to +calibrate networks trained on data distilled from large source datasets. In +this paper, we show that distilled data lead to networks that are not +calibratable due to (i) a more concentrated distribution of the maximum logits +and (ii) the loss of information that is semantically meaningful but unrelated +to classification tasks. To address this problem, we propose Masked Temperature +Scaling (MTS) and Masked Distillation Training (MDT) which mitigate the +limitations of distilled data and achieve better calibration results while +maintaining the efficiency of dataset distillation. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ One-Vote Veto: Semi-Supervised Learning for Low-Shot Glaucoma Diagnosis + + +
+ Convolutional neural networks (CNNs) are a promising technique for automated +glaucoma diagnosis from images of the fundus, and these images are routinely +acquired as part of an ophthalmic exam. Nevertheless, CNNs typically require a +large amount of well-labeled data for training, which may not be available in +many biomedical image classification applications, especially when diseases are +rare and where labeling by experts is costly. This article makes two +contributions to address this issue: (1) It extends the conventional Siamese +network and introduces a training method for low-shot learning when labeled +data are limited and imbalanced, and (2) it introduces a novel semi-supervised +learning strategy that uses additional unlabeled training data to achieve +greater accuracy. Our proposed multi-task Siamese network (MTSN) can employ any +backbone CNN, and we demonstrate with four backbone CNNs that its accuracy with +limited training data approaches the accuracy of backbone CNNs trained with a +dataset that is 50 times larger. We also introduce One-Vote Veto (OVV) +self-training, a semi-supervised learning strategy that is designed +specifically for MTSNs. By taking both self-predictions and contrastive +predictions of the unlabeled training data into account, OVV self-training +provides additional pseudo labels for fine-tuning a pre-trained MTSN. Using a +large (imbalanced) dataset with 66,715 fundus photographs acquired over 15 +years, extensive experimental results demonstrate the effectiveness of low-shot +learning with MTSN and semi-supervised learning with OVV self-training. Three +additional, smaller clinical datasets of fundus images acquired under different +conditions (cameras, instruments, locations, populations) are used to +demonstrate the generalizability of the proposed methods. + +
+
+ comment: accepted by IEEE Transactions on Medical Imaging (T-MI). DOI: + 10.1109/TMI.2023.3307689 +
+
+
+
+
+ + ♻ ☆ MMD Aggregated Two-Sample Test + + +
+ We propose two novel nonparametric two-sample kernel tests based on the +Maximum Mean Discrepancy (MMD). First, for a fixed kernel, we construct an MMD +test using either permutations or a wild bootstrap, two popular numerical +procedures to determine the test threshold. We prove that this test controls +the probability of type I error non-asymptotically. Hence, it can be used +reliably even in settings with small sample sizes as it remains +well-calibrated, which differs from previous MMD tests which only guarantee +correct test level asymptotically. When the difference in densities lies in a +Sobolev ball, we prove minimax optimality of our MMD test with a specific +kernel depending on the smoothness parameter of the Sobolev ball. In practice, +this parameter is unknown and, hence, the optimal MMD test with this particular +kernel cannot be used. To overcome this issue, we construct an aggregated test, +called MMDAgg, which is adaptive to the smoothness parameter. The test power is +maximised over the collection of kernels used, without requiring held-out data +for kernel selection (which results in a loss of test power), or arbitrary +kernel choices such as the median heuristic. We prove that MMDAgg still +controls the level non-asymptotically, and achieves the minimax rate over +Sobolev balls, up to an iterated logarithmic term. Our guarantees are not +restricted to a specific type of kernel, but hold for any product of +one-dimensional translation invariant characteristic kernels. We provide a +user-friendly parameter-free implementation of MMDAgg using an adaptive +collection of bandwidths. We demonstrate that MMDAgg significantly outperforms +alternative state-of-the-art MMD-based two-sample tests on synthetic data +satisfying the Sobolev smoothness assumption, and that, on real-world image +data, MMDAgg closely matches the power of tests leveraging the use of models +such as neural networks. + +
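+ A fixed-kernel MMD permutation test, the building block that MMDAgg aggregates
+ over a collection of bandwidths, can be sketched as follows; the Gaussian
+ kernel, bandwidth, sample sizes and level are illustrative choices, and the
+ adaptive aggregation step itself is omitted.
+
+     # Unbiased MMD^2 estimate plus a permutation threshold at a fixed level.
+     import numpy as np
+
+     def gaussian_gram(a, b, bw):
+         d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
+         return np.exp(-d2 / (2 * bw ** 2))
+
+     def mmd2(x, y, bw):
+         kxx, kyy, kxy = gaussian_gram(x, x, bw), gaussian_gram(y, y, bw), gaussian_gram(x, y, bw)
+         n, m = len(x), len(y)
+         return ((kxx.sum() - np.trace(kxx)) / (n * (n - 1))
+                 + (kyy.sum() - np.trace(kyy)) / (m * (m - 1))
+                 - 2 * kxy.mean())
+
+     rng = np.random.default_rng(0)
+     x, y = rng.normal(size=(100, 2)), rng.normal(0.5, 1.0, size=(100, 2))
+     bw, level = 1.0, 0.05
+     stat = mmd2(x, y, bw)
+
+     pooled = np.vstack([x, y])
+     null = []
+     for _ in range(200):                            # permutation null distribution
+         perm = rng.permutation(len(pooled))
+         null.append(mmd2(pooled[perm[:100]], pooled[perm[100:]], bw))
+     reject = stat > np.quantile(null, 1 - level)    # True -> distributions differ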
+
+ comment: 81 pages +
+
+
+
+
+ + ♻ ☆ AutoML in the Age of Large Language Models: Current Challenges, Future + Opportunities and Risks + + +
+ The fields of both Natural Language Processing (NLP) and Automated Machine +Learning (AutoML) have achieved remarkable results over the past years. In NLP, +especially Large Language Models (LLMs) have experienced a rapid series of +breakthroughs very recently. We envision that the two fields can radically push +the boundaries of each other through tight integration. To showcase this +vision, we explore the potential of a symbiotic relationship between AutoML and +LLMs, shedding light on how they can benefit each other. In particular, we +investigate both the opportunities to enhance AutoML approaches with LLMs from +different perspectives and the challenges of leveraging AutoML to further +improve LLMs. To this end, we survey existing work, and we critically assess +risks. We strongly believe that the integration of the two fields has the +potential to disrupt both fields, NLP and AutoML. By highlighting conceivable +synergies, but also risks, we aim to foster further exploration at the +intersection of AutoML and LLMs. + +
+
+
+
+
+ + ♻ ☆ The prediction of the quality of results in Logic Synthesis using + Transformer and Graph Neural Networks + + +
+ In the logic synthesis stage, structure transformations in the synthesis tool +need to be combined into optimization sequences and act on the circuit to meet +the specified circuit area and delay. However, logic synthesis optimization +sequences are time-consuming to run, and predicting the quality of the results +(QoR) against the synthesis optimization sequence for a circuit can help +engineers find a better optimization sequence faster. In this work, we propose +a deep learning method to predict the QoR of unseen circuit-optimization +sequences pairs. Specifically, the structure transformations are translated +into vectors by embedding methods and advanced natural language processing +(NLP) technology (Transformer) is used to extract the features of the +optimization sequences. In addition, to enable the prediction process of the +model to be generalized from circuit to circuit, the graph representation of +the circuit is represented as an adjacency matrix and a feature matrix. Graph +neural networks(GNN) are used to extract the structural features of the +circuits. For this problem, the Transformer and three typical GNNs are used. +Furthermore, the Transformer and GNNs are adopted as a joint learning policy +for the QoR prediction of the unseen circuit-optimization sequences. The +methods resulting from the combination of Transformer and GNNs are benchmarked. +The experimental results show that the joint learning of Transformer and +GraphSage gives the best results. The Mean Absolute Error (MAE) of the +predicted result is 0.412. + +
+
+
+
+
+ + ♻ ☆ Discriminative Bayesian filtering lends momentum to the stochastic + Newton method for minimizing log-convex functions + + +
+ To minimize the average of a set of log-convex functions, the stochastic +Newton method iteratively updates its estimate using subsampled versions of the +full objective's gradient and Hessian. We contextualize this optimization +problem as sequential Bayesian inference on a latent state-space model with a +discriminatively-specified observation process. Applying Bayesian filtering +then yields a novel optimization algorithm that considers the entire history of +gradients and Hessians when forming an update. We establish matrix-based +conditions under which the effect of older observations diminishes over time, +in a manner analogous to Polyak's heavy ball momentum. We illustrate various +aspects of our approach with an example and review other relevant innovations +for the stochastic Newton method. + +
+
+
+
+
+ + ♻ ☆ Stability of Q-Learning Through Design and Optimism + + +
+ Q-learning has become an important part of the reinforcement learning toolkit +since its introduction in the dissertation of Chris Watkins in the 1980s. The +purpose of this paper is in part a tutorial on stochastic approximation and +Q-learning, providing details regarding the INFORMS APS inaugural Applied +Probability Trust Plenary Lecture, presented in Nancy France, June 2023. + The paper also presents new approaches to ensure stability and potentially +accelerated convergence for these algorithms, and stochastic approximation in +other settings. Two contributions are entirely new: + 1. Stability of Q-learning with linear function approximation has been an +open topic for research for over three decades. It is shown that with +appropriate optimistic training in the form of a modified Gibbs policy, there +exists a solution to the projected Bellman equation, and the algorithm is +stable (in terms of bounded parameter estimates). Convergence remains one of +many open topics for research. + 2. The new Zap Zero algorithm is designed to approximate the Newton-Raphson +flow without matrix inversion. It is stable and convergent under mild +assumptions on the mean flow vector field for the algorithm, and compatible +statistical assumption on an underlying Markov chain. The algorithm is a +general approach to stochastic approximation which in particular applies to +Q-learning with "oblivious" training even with non-linear function +approximation. + +
+
+ comment: Companion paper to the INFORMS APS inaugural Applied Probability + Trust Plenary Lecture, presented in Nancy, France, June 2023. Slides available + online, DOI 10.13140/RG.2.2.24897.33127
+
+
+
+
+ + ♻ ☆ Towards Fair Graph Neural Networks via Graph Counterfactual + + +
+ Graph neural networks (GNNs) have shown great ability in representation
+ learning on graphs, facilitating various tasks. Despite their great performance
+ in modeling graphs, recent works show that GNNs tend to inherit and amplify the
+ bias from training data, raising concerns about the adoption of GNNs in
+ high-stakes scenarios. Hence, many efforts have been devoted to fairness-aware
+ GNNs. However, most existing fair GNNs learn fair node representations by
+ adopting statistical fairness notions, which may fail to alleviate bias in the
+ presence of statistical anomalies. Motivated by causal theory, there are
+ several attempts utilizing graph counterfactual fairness to mitigate root
+ causes of unfairness. However, these methods suffer from non-realistic
+ counterfactuals obtained by perturbation or generation. In this paper, we take
+ a causal view on the fair graph learning problem. Guided by the causal
+ analysis, we propose a novel framework CAF, which can select counterfactuals
+ from training data to avoid non-realistic counterfactuals and adopt selected
+ counterfactuals to learn fair node representations for the node classification
+ task. Extensive experiments on synthetic and real-world datasets show the
+ effectiveness of CAF. Our code is available at
+ https://github.com/TimeLovercc/CAF-GNN.
+
+
+
+
+
+ + ♻ ☆ SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input + Images ICCV 2023 + + +
+ Recent advances in Neural Radiance Fields (NeRFs) treat the problem of novel +view synthesis as Sparse Radiance Field (SRF) optimization using sparse voxels +for efficient and fast rendering (plenoxels,InstantNGP). In order to leverage +machine learning and adoption of SRFs as a 3D representation, we present SPARF, +a large-scale ShapeNet-based synthetic dataset for novel view synthesis +consisting of $\sim$ 17 million images rendered from nearly 40,000 shapes at +high resolution (400 X 400 pixels). The dataset is orders of magnitude larger +than existing synthetic datasets for novel view synthesis and includes more +than one million 3D-optimized radiance fields with multiple voxel resolutions. +Furthermore, we propose a novel pipeline (SuRFNet) that learns to generate +sparse voxel radiance fields from only few views. This is done by using the +densely collected SPARF dataset and 3D sparse convolutions. SuRFNet employs +partial SRFs from few/one images and a specialized SRF loss to learn to +generate high-quality sparse voxel radiance fields that can be rendered from +novel views. Our approach achieves state-of-the-art results in the task of +unconstrained novel view synthesis based on few views on ShapeNet as compared +to recent baselines. The SPARF dataset is made public with the code and models +on the project website https://abdullahamdi.com/sparf/ . + +
+
+ comment: published at ICCV 2023 workshop proceedings +
+
+
+
+
+ + ♻ ☆ Some Supervision Required: Incorporating Oracle Policies in + Reinforcement Learning via Epistemic Uncertainty Metrics + + +
+ An inherent problem of reinforcement learning is performing exploration of an +environment through random actions, of which a large portion can be +unproductive. Instead, exploration can be improved by initializing the learning +policy with an existing (previously learned or hard-coded) oracle policy, +offline data, or demonstrations. In the case of using an oracle policy, it can +be unclear how best to incorporate the oracle policy's experience into the +learning policy in a way that maximizes learning sample efficiency. In this +paper, we propose a method termed Critic Confidence Guided Exploration (CCGE) +for incorporating such an oracle policy into standard actor-critic +reinforcement learning algorithms. More specifically, CCGE takes in the oracle +policy's actions as suggestions and incorporates this information into the +learning scheme when uncertainty is high, while ignoring it when the +uncertainty is low. CCGE is agnostic to methods of estimating uncertainty, and +we show that it is equally effective with two different techniques. +Empirically, we evaluate the effect of CCGE on various benchmark reinforcement +learning tasks, and show that this idea can lead to improved sample efficiency +and final performance. Furthermore, when evaluated on sparse reward +environments, CCGE is able to perform competitively against adjacent algorithms +that also leverage an oracle policy. Our experiments show that it is possible +to utilize uncertainty as a heuristic to guide exploration using an oracle in +reinforcement learning. We expect that this will inspire more research in this +direction, where various heuristics are used to determine the direction of +guidance provided to learning. + +
+
+ comment: Under review at TMLR +
+
+
+
+
+ + ♻ ☆ Scalable Stochastic Gradient Riemannian Langevin Dynamics in + Non-Diagonal Metrics + + +
+ Stochastic-gradient sampling methods are often used to perform Bayesian
+ inference on neural networks. It has been observed that the methods in which
+ notions of differential geometry are included tend to perform better, with the
+ Riemannian metric improving posterior exploration by accounting for the local
+ curvature. However, the existing methods often resort to simple diagonal
+ metrics to remain computationally efficient. This loses some of the gains. We
+ propose two non-diagonal metrics that can be used in stochastic-gradient
+ samplers to improve convergence and exploration but have only a minor
+ computational overhead over diagonal metrics. We show that for fully connected
+ neural networks (NNs) with sparsity-inducing priors and convolutional NNs with
+ correlated priors, using these metrics can provide improvements. For some other
+ choices, the posterior is simple enough that even the simpler diagonal metrics
+ explore it well.
+
+
+
+
+
+ + ♻ ☆ Large Language Models as Superpositions of Cultural Perspectives + + +
+ Large Language Models (LLMs) are often misleadingly recognized as having a +personality or a set of values. We argue that an LLM can be seen as a +superposition of perspectives with different values and personality traits. +LLMs exhibit context-dependent values and personality traits that change based +on the induced perspective (as opposed to humans, who tend to have more +coherent values and personality traits across contexts). We introduce the +concept of perspective controllability, which refers to a model's affordance to +adopt various perspectives with differing values and personality traits. In our +experiments, we use questionnaires from psychology (PVQ, VSM, IPIP) to study +how exhibited values and personality traits change based on different +perspectives. Through qualitative experiments, we show that LLMs express +different values when those are (implicitly or explicitly) implied in the +prompt, and that LLMs express different values even when those are not +obviously implied (demonstrating their context-dependent nature). We then +conduct quantitative experiments to study the controllability of different +models (GPT-4, GPT-3.5, OpenAssistant, StableVicuna, StableLM), the +effectiveness of various methods for inducing perspectives, and the smoothness +of the models' drivability. We conclude by examining the broader implications +of our work and outline a variety of associated scientific questions. The +project website is available at +https://sites.google.com/view/llm-superpositions . + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ GradTree: Learning Axis-Aligned Decision Trees with Gradient Descent + + +
+ Decision Trees (DTs) are commonly used for many machine learning tasks due to +their high degree of interpretability. However, learning a DT from data is a +difficult optimization problem, as it is non-convex and non-differentiable. +Therefore, common approaches learn DTs using a greedy growth algorithm that +minimizes the impurity locally at each internal node. Unfortunately, this +greedy procedure can lead to inaccurate trees. In this paper, we present a +novel approach for learning hard, axis-aligned DTs with gradient descent. The +proposed method uses backpropagation with a straight-through operator on a +dense DT representation, to jointly optimize all tree parameters. Our approach +outperforms existing methods on binary classification benchmarks and achieves +competitive results for multi-class tasks. + +
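+ The straight-through ingredient mentioned above can be shown in isolation:
+ route examples with a hard, axis-aligned threshold in the forward pass while
+ letting gradients flow through a sigmoid surrogate in the backward pass. This
+ is a generic sketch of that operator only, not the paper's dense tree
+ parameterization or training procedure.
+
+     # Straight-through hard split: hard decision forward, soft gradient backward.
+     import torch
+
+     def st_hard_split(x_feature: torch.Tensor, threshold: torch.Tensor) -> torch.Tensor:
+         soft = torch.sigmoid(x_feature - threshold)     # differentiable surrogate
+         hard = (soft > 0.5).float()                     # hard routing decision
+         return hard + (soft - soft.detach())            # value = hard, grad = soft
+
+     x = torch.tensor([0.2, 1.5, -0.3])
+     thr = torch.tensor(0.4, requires_grad=True)
+     route = st_hard_split(x, thr)                       # tensor([0., 1., 0.])
+     route.sum().backward()                              # gradient reaches thr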
+
+
+
+
+ + ♻ ☆ Neural Architecture for Online Ensemble Continual Learning + + +
+ Continual learning with an increasing number of classes is a challenging +task. The difficulty rises when each example is presented exactly once, which +requires the model to learn online. Recent methods with classic parameter +optimization procedures have been shown to struggle in such setups or have +limitations like non-differentiable components or memory buffers. For this +reason, we present the fully differentiable ensemble method that allows us to +efficiently train an ensemble of neural networks in the end-to-end regime. The +proposed technique achieves SOTA results without a memory buffer and clearly +outperforms the reference methods. The conducted experiments have also shown a +significant increase in the performance for small ensembles, which demonstrates +the capability of obtaining relatively high classification accuracy with a +reduced number of classifiers. + +
+
+
+
+
+ + ♻ ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting +capabilities in large language models (LLMs) beyond those offered by paradigms +such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary +advantage of GoT is the ability to model the information generated by an LLM as +an arbitrary graph, where units of information ("LLM thoughts") are vertices, +and edges correspond to dependencies between these vertices. This approach +enables combining arbitrary LLM thoughts into synergistic outcomes, distilling +the essence of whole networks of thoughts, or enhancing thoughts using feedback +loops. We illustrate that GoT offers advantages over state of the art on +different tasks, for example increasing the quality of sorting by 62% over ToT, +while simultaneously reducing costs by >31%. We ensure that GoT is extensible +with new thought transformations and thus can be used to spearhead new +prompting schemes. This work brings the LLM reasoning closer to human thinking +or brain mechanisms such as recurrence, both of which form complex networks. + +
+
+
+
+
+ + ♻ ☆ FLARE: Fingerprinting Deep Reinforcement Learning Agents using Universal + Adversarial Masks ACSA + + +
+ We propose FLARE, the first fingerprinting mechanism to verify whether a +suspected Deep Reinforcement Learning (DRL) policy is an illegitimate copy of +another (victim) policy. We first show that it is possible to find +non-transferable, universal adversarial masks, i.e., perturbations, to generate +adversarial examples that can successfully transfer from a victim policy to its +modified versions but not to independently trained policies. FLARE employs +these masks as fingerprints to verify the true ownership of stolen DRL policies +by measuring an action agreement value over states perturbed via such masks. +Our empirical evaluations show that FLARE is effective (100% action agreement +on stolen copies) and does not falsely accuse independent policies (no false +positives). FLARE is also robust to model modification attacks and cannot be +easily evaded by more informed adversaries without negatively impacting agent +performance. We also show that not all universal adversarial masks are suitable +candidates for fingerprints due to the inherent characteristics of DRL +policies. The spatio-temporal dynamics of DRL problems and sequential +decision-making process make characterizing the decision boundary of DRL +policies more difficult, as well as searching for universal masks that capture +the geometry of it. + +
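+ The verification step reduces to an action-agreement measurement, sketched
+ below with toy deterministic policies; the mask, states, policies and decision
+ threshold are placeholders, and the hard part (finding non-transferable
+ universal masks) is not shown.
+
+     # Measure how often a suspected policy matches the victim on masked states.
+     import numpy as np
+
+     def action_agreement(victim, suspect, states, mask, threshold=0.9):
+         perturbed = states + mask                       # apply the fingerprint mask
+         agree = np.mean([victim(s) == suspect(s) for s in perturbed])
+         return agree, agree >= threshold                # high agreement -> likely copy
+
+     rng = np.random.default_rng(0)
+     states = rng.normal(size=(100, 4))
+     mask = 0.05 * rng.normal(size=4)                    # stand-in universal mask
+     victim = lambda s: int(s.sum() > 0)                 # toy deterministic policies
+     suspect = lambda s: int(s.sum() > 0)
+     agreement, is_copy = action_agreement(victim, suspect, states, mask)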
+
+ comment: Will appear in the proceedings of ACSAC 2023; 13 pages, 5 figures, 7 + tables +
+
+
+
+
+ + ♻ ☆ Reinforced Self-Training (ReST) for Language Modeling + + +
+ Reinforcement learning from human feedback (RLHF) can improve the quality of +large language model's (LLM) outputs by aligning them with human preferences. +We propose a simple algorithm for aligning LLMs with human preferences inspired +by growing batch reinforcement learning (RL), which we call Reinforced +Self-Training (ReST). Given an initial LLM policy, ReST produces a dataset by +generating samples from the policy, which are then used to improve the LLM +policy using offline RL algorithms. ReST is more efficient than typical online +RLHF methods because the training dataset is produced offline, which allows +data reuse. While ReST is a general approach applicable to all generative +learning settings, we focus on its application to machine translation. Our +results show that ReST can substantially improve translation quality, as +measured by automated metrics and human evaluation on machine translation +benchmarks in a compute and sample-efficient manner. + +
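+ The Grow/Improve structure of the algorithm can be sketched with toy
+ stand-ins: a dictionary plays the role of the policy, a keyword heuristic
+ plays the role of the reward model, and behavior cloning of high-reward
+ samples stands in for the offline RL update. All of these components are
+ illustrative assumptions rather than the paper's actual models.
+
+     # Toy Grow (sample from policy) / Improve (filter and clone) loop.
+     import random
+
+     def sample_from(policy, prompt):
+         return policy.get(prompt, "") + random.choice([" good", " bad", " ok"])
+
+     def reward(prompt, completion):
+         return 1.0 if "good" in completion else 0.0     # stand-in reward model
+
+     def offline_update(policy, subset):
+         best = {}                                       # clone best completion per prompt
+         for prompt, completion, r in subset:
+             if r >= best.get(prompt, ("", -1.0))[1]:
+                 best[prompt] = (completion, r)
+         policy.update({p: c for p, (c, _) in best.items()})
+         return policy
+
+     def rest(policy, prompts, grow_steps=3, improve_steps=2):
+         for _ in range(grow_steps):
+             data = [(p, sample_from(policy, p)) for p in prompts]        # Grow
+             scored = [(p, y, reward(p, y)) for p, y in data]
+             threshold = 0.0
+             for _ in range(improve_steps):                               # Improve
+                 subset = [t for t in scored if t[2] >= threshold]
+                 policy = offline_update(policy, subset)
+                 threshold += 0.5                  # increasingly selective filter
+         return policy
+
+     policy = rest({}, ["translate: hello"])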
+
+ comment: 23 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Automatic Classification of Blood Cell Images Using Convolutional Neural + Network + + +
+ Human blood primarily comprises plasma, red blood cells, white blood cells,
+and platelets. It plays a vital role in transporting nutrients to different
+organs and carries essential health-related information about the human body.
+Blood cells are utilized to defend the body against diverse infections,
+including fungi, viruses, and bacteria. Hence, blood analysis can help
+physicians assess an individual's physiological condition. Blood cells have
+been sub-classified into eight groups: neutrophils, eosinophils, basophils,
+lymphocytes, monocytes, immature granulocytes (promyelocytes, myelocytes, and
+metamyelocytes), erythroblasts, and platelets or thrombocytes, on the basis of
+their nucleus, shape, and cytoplasm. Traditionally, pathologists and
+hematologists in laboratories have examined these blood cells using a
+microscope before manually classifying them. The manual approach is slow and
+prone to human error. Therefore, it is essential to automate this process.
+In our paper, transfer learning with pre-trained CNN models (VGG16, VGG19,
+ResNet-50, ResNet-101, ResNet-152, InceptionV3, MobileNetV2, and DenseNet-20)
+is applied to the PBC dataset's normal DIB. The overall accuracy achieved with
+these models lies between 91.375% and 94.72%. Hence, inspired by these
+pre-trained architectures, a model has been proposed to automatically classify
+the ten types of blood cells with increased accuracy. A novel CNN-based
+framework has been presented to improve accuracy. The proposed CNN model has
+been tested on the PBC dataset normal DIB. The outcomes of the experiments
+demonstrate that our CNN-based framework designed for blood cell classification
+attains an accuracy of 99.91% on the PBC dataset. Our proposed convolutional
+neural network model performs competitively when compared to earlier results
+reported in the literature.
+ 
+
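For readers unfamiliar with the transfer-learning setup the abstract refers to, a generic sketch looks like the following: load a pre-trained backbone, freeze it, and attach a new classification head for the blood-cell classes. The backbone, class count, and hyperparameters below are placeholders, not the paper's configuration.

```python
# Generic transfer-learning sketch (placeholder values, not the paper's setup).
import torch
import torch.nn as nn
from torchvision import models

num_classes = 8  # e.g. neutrophils, eosinophils, basophils, lymphocytes, ...

model = models.resnet50(weights="DEFAULT")  # ImageNet weights (torchvision >= 0.13)
for p in model.parameters():
    p.requires_grad = False                 # freeze the backbone
model.fc = nn.Linear(model.fc.in_features, num_classes)  # new trainable head

optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# One dummy training step on random data, just to show the shapes involved.
images = torch.randn(4, 3, 224, 224)
labels = torch.randint(0, num_classes, (4,))
loss = criterion(model(images), labels)
loss.backward()
optimizer.step()
print(float(loss))
```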
+
+ comment: 15 +
+
+
+
+
+ + ♻ ☆ Learning Domain Invariant Representations by Joint Wasserstein Distance + Minimization + + +
+ Domain shifts in the training data are common in practical applications of +machine learning; they occur for instance when the data is coming from +different sources. Ideally, a ML model should work well independently of these +shifts, for example, by learning a domain-invariant representation. However, +common ML losses do not give strong guarantees on how consistently the ML model +performs for different domains, in particular, whether the model performs well +on a domain at the expense of its performance on another domain. In this paper, +we build new theoretical foundations for this problem, by contributing a set of +mathematical relations between classical losses for supervised ML and the +Wasserstein distance in joint space (i.e. representation and output space). We +show that classification or regression losses, when combined with a GAN-type +discriminator between domains, form an upper-bound to the true Wasserstein +distance between domains. This implies a more invariant representation and also +more stable prediction performance across domains. Theoretical results are +corroborated empirically on several image datasets. Our proposed approach +systematically produces the highest minimum classification accuracy across +domains, and the most invariant representation. + +
+
+ comment: 23 pages + supplement +
+
+
+
+
+ + ♻ ☆ Classification of White Blood Cells Using Machine and Deep Learning + Models: A Systematic Review + + +
+ Machine learning (ML) and deep learning (DL) models have been employed to
+significantly improve analyses of medical imagery, with these approaches used
+to enhance the accuracy of prediction and classification. Model predictions and
+classifications assist diagnoses of various cancers and tumors. This review
+presents an in-depth analysis of modern techniques applied within the domain of
+medical image analysis for white blood cell classification. The methodologies
+that use blood smear images, magnetic resonance imaging (MRI), X-rays, and
+similar medical imaging domains are identified and discussed, with a detailed
+analysis of ML/DL techniques applied to the classification of white blood cells
+(WBCs) representing the primary focus of the review. The data utilized in this
+research has been extracted from a collection of 136 primary papers that were
+published between the years 2006 and 2023. The most widely used techniques and
+best-performing white blood cell classification methods are identified. While
+the use of ML and DL for white blood cell classification has concurrently
+increased and improved in recent years, significant challenges remain: 1)
+availability of appropriate datasets remains the primary challenge, and may be
+resolved using data augmentation techniques; 2) medical training of researchers
+is recommended to improve current understanding of white blood cell structure
+and subsequent selection of appropriate classification models; 3) advanced DL
+networks including Generative Adversarial Networks, R-CNN, Fast R-CNN, and
+Faster R-CNN will likely be increasingly employed to supplement or replace
+current techniques.
+ 
+
+
+
+
+
+ + ♻ ☆ Federated learning for secure development of AI models for Parkinson's + disease detection using speech from different languages INTERSPEECH 2023 + + +
+ Parkinson's disease (PD) is a neurological disorder impacting a person's +speech. Among automatic PD assessment methods, deep learning models have gained +particular interest. Recently, the community has explored cross-pathology and +cross-language models which can improve diagnostic accuracy even further. +However, strict patient data privacy regulations largely prevent institutions +from sharing patient speech data with each other. In this paper, we employ +federated learning (FL) for PD detection using speech signals from 3 real-world +language corpora of German, Spanish, and Czech, each from a separate +institution. Our results indicate that the FL model outperforms all the local +models in terms of diagnostic accuracy, while not performing very differently +from the model based on centrally combined training sets, with the advantage of +not requiring any data sharing among collaborators. This will simplify +inter-institutional collaborations, resulting in enhancement of patient +outcomes. + +
+
+ comment: INTERSPEECH 2023, pp. 5003--5007, Dublin, Ireland +
+
+
+
+
+ + ♻ ☆ Concept Evolution in Deep Learning Training: A Unified Interpretation + Framework and Discoveries CIKM'23 + + +
+ We present ConceptEvo, a unified interpretation framework for deep neural +networks (DNNs) that reveals the inception and evolution of learned concepts +during training. Our work addresses a critical gap in DNN interpretation +research, as existing methods primarily focus on post-training interpretation. +ConceptEvo introduces two novel technical contributions: (1) an algorithm that +generates a unified semantic space, enabling side-by-side comparison of +different models during training, and (2) an algorithm that discovers and +quantifies important concept evolutions for class predictions. Through a +large-scale human evaluation and quantitative experiments, we demonstrate that +ConceptEvo successfully identifies concept evolutions across different models, +which are not only comprehensible to humans but also crucial for class +predictions. ConceptEvo is applicable to both modern DNN architectures, such as +ConvNeXt, and classic DNNs, such as VGGs and InceptionV3. + +
+
+ comment: Accepted at CIKM'23 +
+
+
+
+
+ + ♻ ☆ The Impossibility of Parallelizing Boosting + + +
+ The aim of boosting is to convert a sequence of weak learners into a strong +learner. At their heart, these methods are fully sequential. In this paper, we +investigate the possibility of parallelizing boosting. Our main contribution is +a strong negative result, implying that significant parallelization of boosting +requires an exponential blow-up in the total computing resources needed for +training. + +
+
+
+
+
+ + ♻ ☆ Meta-Learning with Adaptive Weighted Loss for Imbalanced Cold-Start + Recommendation CIKM 2023 + + +
+ Sequential recommenders have made great strides in capturing a user's
+preferences. Nevertheless, cold-start recommendation remains a fundamental
+challenge, as it typically involves limited user-item interactions for
+personalization. Recently, gradient-based meta-learning approaches have emerged
+in the sequential recommendation field due to their fast adaptation and
+easy-to-integrate abilities. The meta-learning algorithms formulate the
+cold-start recommendation as a few-shot learning problem, where each user is
+represented as a task to be adapted. While meta-learning algorithms generally
+assume that task-wise samples are evenly distributed over classes or values,
+user-item interactions in real-world applications do not conform to such a
+distribution (e.g., watching favorite videos multiple times, leaving only
+positive ratings without any negative ones). Consequently, imbalanced user
+feedback, which accounts for the majority of task training data, may dominate
+the user adaptation process and prevent meta-learning algorithms from learning
+meaningful meta-knowledge for personalized recommendations. To alleviate this
+limitation, we propose a novel sequential recommendation framework based on
+gradient-based meta-learning that captures the imbalanced rating distribution
+of each user and computes adaptive loss for user-specific learning. Our work is
+the first to tackle the impact of imbalanced ratings in cold-start sequential
+recommendation scenarios. Through extensive experiments conducted on real-world
+datasets, we demonstrate the effectiveness of our framework.
+ 
+
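A minimal sketch of the general idea of an adaptively weighted loss for imbalanced feedback (not the paper's exact formulation) is shown below: per-sample losses within one user's task are re-weighted by the inverse frequency of each feedback class.

```python
# Hedged sketch of class-balanced re-weighting within one user's task
# (illustrative only; not the paper's adaptive loss).
import torch

def class_balanced_weights(labels, num_classes):
    counts = torch.bincount(labels, minlength=num_classes).float().clamp(min=1)
    w = 1.0 / counts               # rarer feedback classes get larger weights
    w = w / w.sum() * num_classes  # normalize so the weights average to 1
    return w[labels]

logits = torch.randn(6, 2, requires_grad=True)
labels = torch.tensor([1, 1, 1, 1, 1, 0])   # heavily imbalanced user feedback
per_sample = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
loss = (class_balanced_weights(labels, 2) * per_sample).mean()
loss.backward()
print(float(loss))
```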
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Context-aware multi-head self-attentional neural network model for next + location prediction + + +
+ Accurate activity location prediction is a crucial component of many mobility +applications and is particularly required to develop personalized, sustainable +transportation systems. Despite the widespread adoption of deep learning +models, next location prediction models lack a comprehensive discussion and +integration of mobility-related spatio-temporal contexts. Here, we utilize a +multi-head self-attentional (MHSA) neural network that learns location +transition patterns from historical location visits, their visit time and +activity duration, as well as their surrounding land use functions, to infer an +individual's next location. Specifically, we adopt point-of-interest data and +latent Dirichlet allocation for representing locations' land use contexts at +multiple spatial scales, generate embedding vectors of the spatio-temporal +features, and learn to predict the next location with an MHSA network. Through +experiments on two large-scale GNSS tracking datasets, we demonstrate that the +proposed model outperforms other state-of-the-art prediction models, and reveal +the contribution of various spatio-temporal contexts to the model's +performance. Moreover, we find that the model trained on population data +achieves higher prediction performance with fewer parameters than +individual-level models due to learning from collective movement patterns. We +also reveal mobility conducted in the recent past and one week before has the +largest influence on the current prediction, showing that learning from a +subset of the historical mobility is sufficient to obtain an accurate location +prediction result. We believe that the proposed model is vital for +context-aware mobility prediction. The gained insights will help to understand +location prediction models and promote their implementation for mobility +applications. + +
+
+ comment: updated Discussion section; accepted by Transportation Research Part + C +
+
+
+
+
+ + ♻ ☆ DeepCut: Unsupervised Segmentation using Graph Neural Networks + Clustering + + +
+ Image segmentation is a fundamental task in computer vision. Data annotation +for training supervised methods can be labor-intensive, motivating unsupervised +methods. Current approaches often rely on extracting deep features from +pre-trained networks to construct a graph, and classical clustering methods +like k-means and normalized-cuts are then applied as a post-processing step. +However, this approach reduces the high-dimensional information encoded in the +features to pair-wise scalar affinities. To address this limitation, this study +introduces a lightweight Graph Neural Network (GNN) to replace classical +clustering methods while optimizing for the same clustering objective function. +Unlike existing methods, our GNN takes both the pair-wise affinities between +local image features and the raw features as input. This direct connection +between the raw features and the clustering objective enables us to implicitly +perform classification of the clusters between different graphs, resulting in +part semantic segmentation without the need for additional post-processing +steps. We demonstrate how classical clustering objectives can be formulated as +self-supervised loss functions for training an image segmentation GNN. +Furthermore, we employ the Correlation-Clustering (CC) objective to perform +clustering without defining the number of clusters, allowing for k-less +clustering. We apply the proposed method for object localization, segmentation, +and semantic part segmentation tasks, surpassing state-of-the-art performance +on multiple benchmarks. + +
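For context, a widely used differentiable relaxation of the normalized-cut objective can serve as a self-supervised loss for the soft cluster assignments produced by a GNN. The sketch below illustrates this family of losses; it is not necessarily the exact objective optimized by DeepCut.

```python
# A common differentiable relaxation of normalized cut, usable as a
# self-supervised clustering loss (illustrative; not DeepCut's exact loss).
import torch

def soft_ncut_loss(S, A):
    """S: (n, k) soft cluster assignments (rows sum to 1); A: (n, n) affinities."""
    D = torch.diag(A.sum(dim=1))
    cut = torch.trace(S.t() @ A @ S)    # within-cluster affinity
    assoc = torch.trace(S.t() @ D @ S)  # total cluster degree
    return -cut / assoc                 # minimize => maximize normalized association

n, k = 10, 3
A = torch.rand(n, n)
A = (A + A.t()) / 2                     # symmetric affinity matrix
logits = torch.randn(n, k, requires_grad=True)
loss = soft_ncut_loss(torch.softmax(logits, dim=1), A)
loss.backward()
print(float(loss))
```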
+
+
+
+
+ + ♻ ☆ A Survey of Self-supervised Learning from Multiple Perspectives: + Algorithms, Applications and Future Trends + + +
+ Deep supervised learning algorithms generally require large numbers of +labeled examples to achieve satisfactory performance. However, collecting and +labeling too many examples can be costly and time-consuming. As a subset of +unsupervised learning, self-supervised learning (SSL) aims to learn useful +features from unlabeled examples without any human-annotated labels. SSL has +recently attracted much attention and many related algorithms have been +developed. However, there are few comprehensive studies that explain the +connections and evolution of different SSL variants. In this paper, we provide +a review of various SSL methods from the perspectives of algorithms, +applications, three main trends, and open questions. First, the motivations of +most SSL algorithms are introduced in detail, and their commonalities and +differences are compared. Second, typical applications of SSL in domains such +as image processing and computer vision (CV), as well as natural language +processing (NLP), are discussed. Finally, the three main trends of SSL and the +open research questions are discussed. A collection of useful materials is +available at https://github.com/guijiejie/SSL. + +
+
+
+
+
+ + ♻ ☆ Generalized Sum Pooling for Metric Learning ICCV + + +
+ A common architectural choice for deep metric learning is a convolutional +neural network followed by global average pooling (GAP). Albeit simple, GAP is +a highly effective way to aggregate information. One possible explanation for +the effectiveness of GAP is considering each feature vector as representing a +different semantic entity and GAP as a convex combination of them. Following +this perspective, we generalize GAP and propose a learnable generalized sum +pooling method (GSP). GSP improves GAP with two distinct abilities: i) the +ability to choose a subset of semantic entities, effectively learning to ignore +nuisance information, and ii) learning the weights corresponding to the +importance of each entity. Formally, we propose an entropy-smoothed optimal +transport problem and show that it is a strict generalization of GAP, i.e., a +specific realization of the problem gives back GAP. We show that this +optimization problem enjoys analytical gradients enabling us to use it as a +direct learnable replacement for GAP. We further propose a zero-shot loss to +ease the learning of GSP. We show the effectiveness of our method with +extensive evaluations on 4 popular metric learning benchmarks. Code is +available at: GSP-DML Framework + +
+
+ comment: Accepted as a conference paper at International Conference on + Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ♻ ☆ IVP-VAE: Modeling EHR Time Series with Initial Value Problem Solvers + + +
+ Continuous-time models such as Neural ODEs and Neural Flows have shown +promising results in analyzing irregularly sampled time series frequently +encountered in electronic health records. Based on these models, time series +are typically processed with a hybrid of an initial value problem (IVP) solver +and a recurrent neural network within the variational autoencoder architecture. +Sequentially solving IVPs makes such models computationally less efficient. In +this paper, we propose to model time series purely with continuous processes +whose state evolution can be approximated directly by IVPs. This eliminates the +need for recurrent computation and enables multiple states to evolve in +parallel. We further fuse the encoder and decoder with one IVP solver utilizing +its invertibility, which leads to fewer parameters and faster convergence. +Experiments on three real-world datasets show that the proposed method can +systematically outperform its predecessors, achieve state-of-the-art results, +and have significant advantages in terms of data efficiency. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ CT Perfusion is All We Need: 4D CNN Segmentation of Penumbra and Core in + Patients With Suspected Ischemic Stroke + + +
+ Precise and fast prediction methods for ischemic areas comprised of dead +tissue, core, and salvageable tissue, penumbra, in acute ischemic stroke (AIS) +patients are of significant clinical interest. They play an essential role in +improving diagnosis and treatment planning. Computed Tomography (CT) scan is +one of the primary modalities for early assessment in patients with suspected +AIS. CT Perfusion (CTP) is often used as a primary assessment to determine +stroke location, severity, and volume of ischemic lesions. Current automatic +segmentation methods for CTP mostly use already processed 3D parametric maps +conventionally used for clinical interpretation by radiologists as input. +Alternatively, the raw CTP data is used on a slice-by-slice basis as 2D+time +input, where the spatial information over the volume is ignored. In addition, +these methods are only interested in segmenting core regions, while predicting +penumbra can be essential for treatment planning. This paper investigates +different methods to utilize the entire 4D CTP as input to fully exploit the +spatio-temporal information, leading us to propose a novel 4D convolution +layer. Our comprehensive experiments on a local dataset of 152 patients divided +into three groups show that our proposed models generate more precise results +than other methods explored. Adopting the proposed 4D mJ-Net, a Dice +Coefficient of 0.53 and 0.23 is achieved for segmenting penumbra and core +areas, respectively. The code is available on +https://github.com/Biomedical-Data-Analysis-Laboratory/4D-mJ-Net.git. + +
+
+
+
+
+ + ♻ ☆ Towards Top-Down Automated Development in Limited Scopes: A + Neuro-Symbolic Framework from Expressibles to Executables + + +
+ Deep code generation is a topic of deep learning for software engineering +(DL4SE), which adopts neural models to generate code for the intended +functions. Since end-to-end neural methods lack domain knowledge and software +hierarchy awareness, they tend to perform poorly w.r.t project-level tasks. To +systematically explore the potential improvements of code generation, we let it +participate in the whole top-down development from \emph{expressibles} to +\emph{executables}, which is possible in limited scopes. In the process, it +benefits from massive samples, features, and knowledge. As the foundation, we +suggest building a taxonomy on code data, namely code taxonomy, leveraging the +categorization of code information. Moreover, we introduce a three-layer +semantic pyramid (SP) to associate text data and code data. It identifies the +information of different abstraction levels, and thus introduces the domain +knowledge on development and reveals the hierarchy of software. Furthermore, we +propose a semantic pyramid framework (SPF) as the approach, focusing on +software of high modularity and low complexity. SPF divides the code generation +process into stages and reserves spots for potential interactions. In addition, +we conceived preliminary applications in software development to confirm the +neuro-symbolic framework. + +
+
+ comment: 5 pages, 3 figures, 2 tables, accepted by ESEC/FSE 2023, the + camera-ready version +
+
+
+
+
+ + ♻ ☆ A Profit-Maximizing Strategy for Advertising on the e-Commerce Platforms + + +
+ The online advertising management platform has become increasingly popular
+among e-commerce vendors/advertisers, offering a streamlined approach to reach
+target customers. Despite its advantages, configuring advertising strategies
+correctly remains a challenge for online vendors, particularly those with
+limited resources. Ineffective strategies often result in a surge of
+unproductive ``just looking'' clicks, leading to disproportionately high
+advertising expenses compared to the growth of sales. In this paper, we
+present a novel profit-maximizing strategy for targeting options of online
+advertising. The proposed model aims to find the optimal set of features to
+maximize the probability of converting targeted audiences into actual buyers.
+We address the optimization challenge by reformulating it as a multiple-choice
+knapsack problem (MCKP). We conduct an empirical study featuring real-world
+data from Tmall to show that our proposed method can effectively optimize the
+advertising strategy with budgetary constraints.
+ 
+
+
+ comment: Online advertising campaigns +
+
+
+
+
+ + ♻ Consciousness in Artificial Intelligence: Insights from the Science of + Consciousness + + +
+ Whether current or near-term AI systems could be conscious is a topic of +scientific interest and increasing public concern. This report argues for, and +exemplifies, a rigorous and empirically grounded approach to AI consciousness: +assessing existing AI systems in detail, in light of our best-supported +neuroscientific theories of consciousness. We survey several prominent +scientific theories of consciousness, including recurrent processing theory, +global workspace theory, higher-order theories, predictive processing, and +attention schema theory. From these theories we derive "indicator properties" +of consciousness, elucidated in computational terms that allow us to assess AI +systems for these properties. We use these indicator properties to assess +several recent AI systems, and we discuss how future systems might implement +them. Our analysis suggests that no current AI systems are conscious, but also +shows that there are no obvious barriers to building conscious AI systems. + +
+
+
+
+
+ + ♻ ☆ Towards Balanced Active Learning for Multimodal Classification + + +
+ Training multimodal networks requires a vast amount of data due to their +larger parameter space compared to unimodal networks. Active learning is a +widely used technique for reducing data annotation costs by selecting only +those samples that could contribute to improving model performance. However, +current active learning strategies are mostly designed for unimodal tasks, and +when applied to multimodal data, they often result in biased sample selection +from the dominant modality. This unfairness hinders balanced multimodal +learning, which is crucial for achieving optimal performance. To address this +issue, we propose three guidelines for designing a more balanced multimodal +active learning strategy. Following these guidelines, a novel approach is +proposed to achieve more fair data selection by modulating the gradient +embedding with the dominance degree among modalities. Our studies demonstrate +that the proposed method achieves more balanced multimodal learning by avoiding +greedy sample selection from the dominant modality. Our approach outperforms +existing active learning strategies on a variety of multimodal classification +tasks. Overall, our work highlights the importance of balancing sample +selection in multimodal active learning and provides a practical solution for +achieving more balanced active learning for multimodal classification. + +
+
+ comment: 12 pages, accepted by ACMMM 2023 +
+
+
+
+
+ + ♻ ☆ iPLAN: Intent-Aware Planning in Heterogeneous Traffic via Distributed + Multi-Agent Reinforcement Learning + + +
+ Navigating safely and efficiently in dense and heterogeneous traffic +scenarios is challenging for autonomous vehicles (AVs) due to their inability +to infer the behaviors or intentions of nearby drivers. In this work, we +introduce a distributed multi-agent reinforcement learning (MARL) algorithm +that can predict trajectories and intents in dense and heterogeneous traffic +scenarios. Our approach for intent-aware planning, iPLAN, allows agents to +infer nearby drivers' intents solely from their local observations. We model +two distinct incentives for agents' strategies: Behavioral Incentive for +high-level decision-making based on their driving behavior or personality and +Instant Incentive for motion planning for collision avoidance based on the +current traffic state. Our approach enables agents to infer their opponents' +behavior incentives and integrate this inferred information into their +decision-making and motion-planning processes. We perform experiments on two +simulation environments, Non-Cooperative Navigation and Heterogeneous Highway. +In Heterogeneous Highway, results show that, compared with centralized training +decentralized execution (CTDE) MARL baselines such as QMIX and MAPPO, our +method yields a 4.3% and 38.4% higher episodic reward in mild and chaotic +traffic, with 48.1% higher success rate and 80.6% longer survival time in +chaotic traffic. We also compare with a decentralized training decentralized +execution (DTDE) baseline IPPO and demonstrate a higher episodic reward of +12.7% and 6.3% in mild traffic and chaotic traffic, 25.3% higher success rate, +and 13.7% longer survival time. + +
+
+
+
+
+ + ♻ ☆ Optimal Bandwidth Selection for DENCLUE Algorithm + + +
+ In modern-day industry, clustering algorithms are a daily routine for algorithm
+engineers. Although clustering algorithms experienced rapid growth before 2010,
+innovation related to the research topic has stagnated since deep learning
+became the de facto industrial standard for machine learning applications. In
+2007, a density-based clustering algorithm named DENCLUE was invented to solve
+the clustering problem for nonlinear data structures. However, its parameter
+selection problem was largely neglected until 2011. In this paper, we propose a
+new approach to compute the optimal parameters for the DENCLUE algorithm, and
+discuss its performance in the experiment section.
+ 
+
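DENCLUE clusters by following the gradient of a kernel density estimate, so its critical parameter is the kernel bandwidth. As a baseline point of reference only (not the paper's proposed selection method), Silverman's classic rule of thumb for a Gaussian kernel looks like this:

```python
# Classic data-driven bandwidth for a 1-D Gaussian kernel density estimate
# (a reference baseline, not the paper's optimal-parameter method).
import numpy as np

def silverman_bandwidth(x):
    """Rule of thumb: h = 0.9 * min(std, IQR/1.34) * n^(-1/5)."""
    n = len(x)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    return 0.9 * min(np.std(x, ddof=1), iqr / 1.34) * n ** (-0.2)

rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(0, 1, 200), rng.normal(6, 1, 200)])
h = silverman_bandwidth(data)

# Density estimate at a few grid points: mean of Gaussian kernels on the data.
grid = np.linspace(-4, 10, 5)
density = np.exp(-0.5 * ((grid[:, None] - data[None, :]) / h) ** 2).mean(axis=1) \
          / (h * np.sqrt(2 * np.pi))
print(h, density.round(3))
```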
+
+
+
+
+ + ♻ ☆ Dataset Distillation Using Parameter Pruning + + +
+ In this study, we propose a novel dataset distillation method based on +parameter pruning. The proposed method can synthesize more robust distilled +datasets and improve distillation performance by pruning difficult-to-match +parameters during the distillation process. Experimental results on two +benchmark datasets show the superiority of the proposed method. + +
+
+ comment: Published as a journal paper at IEICE Trans. Fund +
+
+
+
+
+ + ♻ ☆ Adaptive Preferential Attached kNN Graph with Distribution-Awareness + + +
+ Graph-based kNN algorithms have garnered widespread popularity for machine +learning tasks due to their simplicity and effectiveness. However, as factual +data often inherit complex distributions, the conventional kNN graph's reliance +on a unified k-value can hinder its performance. A crucial factor behind this +challenge is the presence of ambiguous samples along decision boundaries that +are inevitably more prone to incorrect classifications. To address the +situation, we propose the Preferential Attached k-Nearest Neighbors Graph +(paNNG), which adopts distribution-aware adaptive-k into graph construction. By +incorporating distribution information as a cohesive entity, paNNG can +significantly improve performance on ambiguous samples by "pulling" them +towards their original classes and hence enhance overall generalization +capability. Through rigorous evaluations on diverse datasets, paNNG outperforms +state-of-the-art algorithms, showcasing its adaptability and efficacy across +various real-world scenarios. + +
+
+
+
+
+ + ♻ ☆ AudioFormer: Audio Transformer learns audio feature representations from + discrete acoustic codes + + +
+ We propose a method named AudioFormer, which learns audio feature
+representations through the acquisition of discrete acoustic codes and
+subsequently fine-tunes them for audio classification tasks. Initially, we
+introduce a novel perspective by considering the audio classification task as a
+form of natural language understanding (NLU). Leveraging an existing neural
+audio codec model, we generate discrete acoustic codes and utilize them to train
+a masked language model (MLM), thereby obtaining audio feature representations.
+Furthermore, we pioneer the integration of a Multi-Positive sample Contrastive
+(MPC) learning approach. This method enables the learning of joint
+representations among multiple discrete acoustic codes within the same audio
+input. In our experiments, we treat discrete acoustic codes as textual data and
+train a masked language model using a cloze-like methodology, ultimately
+deriving high-quality audio representations. Notably, the MPC learning technique
+effectively captures collaborative representations among distinct positive
+samples. Our research outcomes demonstrate that AudioFormer attains
+significantly improved performance compared to prevailing monomodal audio
+classification models across multiple datasets, and even outperforms
+audio-visual multimodal classification models on select datasets.
+Specifically, our approach achieves remarkable results on datasets including
+AudioSet (2M, 20K), and FSD50K, with performance scores of 53.9, 45.1, and
+65.6, respectively. We have openly shared both the code and models:
+https://github.com/LZH-0225/AudioFormer.git.
+ 
+
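The cloze-style training on discrete acoustic codes can be pictured with the small sketch below, where codec codes are masked exactly like text tokens in a masked language model. The codebook size, mask ratio, and mask token ID are illustrative assumptions rather than the paper's settings.

```python
# Sketch of masked-language-model corruption applied to discrete acoustic codes
# (token IDs and mask ratio are illustrative, not the paper's configuration).
import torch

MASK_ID = 1024            # assume a codec codebook of size 1024, so 1024 is spare
mask_ratio = 0.15

codes = torch.randint(0, 1024, (2, 20))          # (batch, sequence of acoustic codes)
mask = torch.rand(codes.shape) < mask_ratio
inputs = codes.clone()
inputs[mask] = MASK_ID                            # cloze-style corruption
targets = torch.where(mask, codes, torch.full_like(codes, -100))  # -100 ignored by CE

# A transformer encoder trained with cross-entropy on the masked positions would
# then learn audio representations from these (inputs, targets) pairs.
print(inputs[0].tolist())
print(int(mask.sum()), "positions masked")
```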
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Neural Implicit Surface Evolution + + +
+ This work investigates the use of smooth neural networks for modeling dynamic
+variations of implicit surfaces under the level set equation (LSE). For this,
+it extends the representation of neural implicit surfaces to the space-time
+$\mathbb{R}^3\times \mathbb{R}$, which opens up mechanisms for continuous
+geometric transformations. Examples include evolving an initial surface towards
+general vector fields, smoothing and sharpening using the mean curvature
+equation, and interpolations of initial conditions.
+ The network training considers two constraints. A data term is responsible
+for fitting the initial condition to the corresponding time instant, usually
+$\mathbb{R}^3 \times \{0\}$. Then, an LSE term forces the network to approximate
+the underlying geometric evolution given by the LSE, without any supervision.
+The network can also be initialized based on previously trained initial
+conditions, resulting in faster convergence compared to the standard approach.
+ 
+
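In symbols, the two training constraints described above can be written roughly as follows. The notation ($f_\theta$ for the network, $g$ for the initial implicit surface, $v$ for the velocity field) is ours, and the specific norms and weights are omitted.

```latex
% Schematic form of the two constraints (data term at t=0, LSE residual term).
\begin{align*}
  \mathcal{L}_{\text{data}} &= \int_{\mathbb{R}^3}
      \bigl| f_\theta(x, 0) - g(x) \bigr| \, dx, \\
  \mathcal{L}_{\text{LSE}}  &= \int_{\mathbb{R}^3 \times \mathbb{R}}
      \Bigl| \tfrac{\partial f_\theta}{\partial t}(x,t)
             + v(x,t) \cdot \nabla_x f_\theta(x,t) \Bigr| \, dx \, dt .
\end{align*}
```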
+
+
+
+
+ + ♻ ☆ SelfDocSeg: A Self-Supervised vision-based Approach towards Document + Segmentation ICDAR 2023 + + +
+ Document layout analysis is a well-known problem in the document research
+community and has been extensively explored, yielding a multitude of solutions
+ranging from text mining and recognition to graph-based representation, visual
+feature extraction, etc. However, most of the existing works have ignored the
+crucial issue of the scarcity of labeled data. With growing internet
+connectivity to personal life, an enormous number of documents has become
+available in the public domain, making data annotation a tedious task.
+We address this challenge using self-supervision and, unlike the few existing
+self-supervised document segmentation approaches which use text mining and
+textual labels, we use a complete vision-based approach in pre-training without
+any ground-truth label or its derivative. Instead, we generate pseudo-layouts
+from the document images to pre-train an image encoder to learn the document
+object representation and localization in a self-supervised framework before
+fine-tuning it with an object detection model. We show that our pipeline sets a
+new benchmark in this context and performs on par with the existing methods and
+the supervised counterparts, if not outperforming them. The code is made publicly
+available at: https://github.com/MaitySubhajit/SelfDocSeg
+ 
+
+
+ comment: Accepted at The 17th International Conference on Document Analysis + and Recognition (ICDAR 2023) +
+
+
+
+
+ + ♻ ☆ PCT-CycleGAN: Paired Complementary Temporal Cycle-Consistent Adversarial + Networks for Radar-Based Precipitation Nowcasting CIKM 2023 + + +
+ Precipitation nowcasting methods have been elaborated over the centuries
+because rain has a crucial impact on human life. Not only quantitative
+precipitation forecast (QPF) models and convolutional long short-term memory
+(ConvLSTM), but also various sophisticated methods such as the latest MetNet-2
+are emerging. In this paper, we propose paired complementary temporal
+cycle-consistent adversarial networks (PCT-CycleGAN) for radar-based
+precipitation nowcasting, inspired by cycle-consistent adversarial networks
+(CycleGAN), which shows strong performance in image-to-image translation.
+PCT-CycleGAN generates temporal causality using two generator networks with
+forward and backward temporal dynamics in paired complementary cycles. Each
+generator network learns a huge number of one-to-one mappings about
+time-dependent radar-based precipitation data to approximate a mapping function
+representing the temporal dynamics in each direction. To create robust temporal
+causality between paired complementary cycles, a novel connection loss is
+proposed. A torrential loss to cover exceptional heavy rain events is also
+proposed. The generator network learning forward temporal dynamics in
+PCT-CycleGAN generates radar-based precipitation data 10 minutes from the
+current time. Also, it provides a reliable prediction of up to 2 hours with
+iterative forecasting. The superiority of PCT-CycleGAN is demonstrated through
+qualitative and quantitative comparisons with several previous methods.
+ 
+
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Walking Out of the Weisfeiler Leman Hierarchy: Graph Learning Beyond + Message Passing + + +
+ We propose CRaWl, a novel neural network architecture for graph learning.
+Like graph neural networks, CRaWl layers update node features on a graph and
+thus can freely be combined or interleaved with GNN layers. Yet CRaWl operates
+fundamentally differently from message passing graph neural networks. CRaWl
+layers extract and aggregate information on subgraphs appearing along random
+walks through a graph using 1D convolutions. Thereby it detects long-range
+interactions and computes non-local features. As the theoretical basis for our
+approach, we prove a theorem stating that the expressiveness of CRaWl is
+incomparable with that of the Weisfeiler Leman algorithm and hence with graph
+neural networks. That is, there are functions expressible by CRaWl, but not by
+GNNs and vice versa. This result extends to higher levels of the Weisfeiler
+Leman hierarchy and thus to higher-order GNNs. Empirically, we show that CRaWl
+matches state-of-the-art GNN architectures across a multitude of benchmark
+datasets for classification and regression on graphs.
+ 
+
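A toy sketch of the walk-then-convolve idea follows: sample random walks, gather the features of the visited nodes into sequences, and apply a 1D convolution along each walk. The graph, feature sizes, and convolution hyperparameters are made up, and the real CRaWl layers also encode structural information about the walks, which is omitted here.

```python
# Minimal sketch of the walk-then-convolve idea (not the authors' implementation).
import random
import torch
import torch.nn as nn

def random_walks(adj_list, num_walks, walk_len):
    walks = []
    for _ in range(num_walks):
        v = random.randrange(len(adj_list))
        walk = [v]
        for _ in range(walk_len - 1):
            v = random.choice(adj_list[v])   # step to a uniformly random neighbor
            walk.append(v)
        walks.append(walk)
    return torch.tensor(walks)               # (num_walks, walk_len)

adj_list = {i: [(i - 1) % 6, (i + 1) % 6] for i in range(6)}   # toy 6-cycle
x = torch.randn(6, 4)                                          # 4-dim node features
walks = random_walks(adj_list, num_walks=8, walk_len=10)
walk_feats = x[walks].permute(0, 2, 1)       # (walks, channels=4, length=10)
conv = nn.Conv1d(in_channels=4, out_channels=16, kernel_size=3, padding=1)
print(conv(walk_feats).shape)                # torch.Size([8, 16, 10])
```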
+
+
+
+
+ + ♻ ☆ Task Relation-aware Continual User Representation Learning KDD 2023 + + +
+ User modeling, which learns to represent users into a low-dimensional +representation space based on their past behaviors, got a surge of interest +from the industry for providing personalized services to users. Previous +efforts in user modeling mainly focus on learning a task-specific user +representation that is designed for a single task. However, since learning +task-specific user representations for every task is infeasible, recent studies +introduce the concept of universal user representation, which is a more +generalized representation of a user that is relevant to a variety of tasks. +Despite their effectiveness, existing approaches for learning universal user +representations are impractical in real-world applications due to the data +requirement, catastrophic forgetting and the limited learning capability for +continually added tasks. In this paper, we propose a novel continual user +representation learning method, called TERACON, whose learning capability is +not limited as the number of learned tasks increases while capturing the +relationship between the tasks. The main idea is to introduce an embedding for +each task, i.e., task embedding, which is utilized to generate task-specific +soft masks that not only allow the entire model parameters to be updated until +the end of training sequence, but also facilitate the relationship between the +tasks to be captured. Moreover, we introduce a novel knowledge retention module +with pseudo-labeling strategy that successfully alleviates the long-standing +problem of continual learning, i.e., catastrophic forgetting. Extensive +experiments on public and proprietary real-world datasets demonstrate the +superiority and practicality of TERACON. Our code is available at +https://github.com/Sein-Kim/TERACON. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ Low Rank Matrix Completion via Robust Alternating Minimization in Nearly + Linear Time + + +
+ Given a matrix $M\in \mathbb{R}^{m\times n}$, the low rank matrix completion +problem asks us to find a rank-$k$ approximation of $M$ as $UV^\top$ for $U\in +\mathbb{R}^{m\times k}$ and $V\in \mathbb{R}^{n\times k}$ by only observing a +few entries specified by a set of entries $\Omega\subseteq [m]\times [n]$. In +particular, we examine an approach that is widely used in practice -- the +alternating minimization framework. Jain, Netrapalli and Sanghavi~\cite{jns13} +showed that if $M$ has incoherent rows and columns, then alternating +minimization provably recovers the matrix $M$ by observing a nearly linear in +$n$ number of entries. While the sample complexity has been subsequently +improved~\cite{glz17}, alternating minimization steps are required to be +computed exactly. This hinders the development of more efficient algorithms and +fails to depict the practical implementation of alternating minimization, where +the updates are usually performed approximately in favor of efficiency. + In this paper, we take a major step towards a more efficient and error-robust +alternating minimization framework. To this end, we develop an analytical +framework for alternating minimization that can tolerate moderate amount of +errors caused by approximate updates. Moreover, our algorithm runs in time +$\widetilde O(|\Omega| k)$, which is nearly linear in the time to verify the +solution while preserving the sample complexity. This improves upon all prior +known alternating minimization approaches which require $\widetilde O(|\Omega| +k^2)$ time. + +
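As background for the discussion above, a textbook alternating-minimization loop with exact least-squares updates looks like the sketch below; the paper's contribution is an approximate, error-tolerant variant with a better running time, which this sketch does not implement.

```python
# Textbook alternating minimization for low-rank matrix completion
# (exact ridge-regularized least-squares updates, for intuition only).
import numpy as np

def altmin_complete(M_obs, mask, k, iters=50, lam=1e-3):
    m, n = M_obs.shape
    rng = np.random.default_rng(0)
    U, V = rng.normal(size=(m, k)), rng.normal(size=(n, k))
    for _ in range(iters):
        for i in range(m):                    # update each row of U
            cols = np.where(mask[i])[0]
            Vc = V[cols]
            U[i] = np.linalg.solve(Vc.T @ Vc + lam * np.eye(k), Vc.T @ M_obs[i, cols])
        for j in range(n):                    # update each row of V
            rows = np.where(mask[:, j])[0]
            Uc = U[rows]
            V[j] = np.linalg.solve(Uc.T @ Uc + lam * np.eye(k), Uc.T @ M_obs[rows, j])
    return U @ V.T

rng = np.random.default_rng(1)
M = rng.normal(size=(30, 3)) @ rng.normal(size=(3, 20))   # ground truth, rank 3
mask = rng.uniform(size=M.shape) < 0.5                     # observe ~50% of entries
M_hat = altmin_complete(M * mask, mask, k=3)
print(np.abs(M_hat - M).mean())                            # small reconstruction error
```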
+
+ comment: Improve the runtime from $O(mnk)$ to $O(|\Omega| k)$
+
+
+
+
+
+ + ♻ ☆ Gradient Descent for Deep Matrix Factorization: Dynamics and Implicit + Bias towards Low Rank + + +
+ In deep learning, it is common to use more network parameters than training
+points. In such a scenario of over-parameterization, there are usually multiple
+networks that achieve zero training error so that the training algorithm induces
+an implicit bias on the computed solution. In practice, (stochastic)
+gradient descent tends to prefer solutions which generalize well, which provides
+a possible explanation of the success of deep learning. In this paper we analyze
+the dynamics of gradient descent in the simplified setting of linear networks
+and of an estimation problem. Although we are not in an
+overparameterized scenario, our analysis nevertheless provides insights into the
+phenomenon of implicit bias. In fact, we derive a rigorous analysis of the
+dynamics of vanilla gradient descent, and characterize the dynamical convergence
+of the spectrum. We are able to accurately locate time intervals where the
+effective rank of the iterates is close to the effective rank of a low-rank
+projection of the ground-truth matrix. In practice, those intervals can be used
+as criteria for early stopping if a certain regularity is desired. We also
+provide empirical evidence for implicit bias in more general scenarios, such as
+matrix sensing and random initialization. This suggests that deep learning
+prefers trajectories whose complexity (measured in terms of effective rank) is
+monotonically increasing, which we believe is a fundamental concept for
+the theoretical understanding of deep learning.
+ 
+
+
+
+
+
+ + ♻ ☆ Accelerating Antimicrobial Peptide Discovery with Latent Structure KDD 2023 + + +
+ Antimicrobial peptides (AMPs) are promising therapeutic approaches against +drug-resistant pathogens. Recently, deep generative models are used to discover +new AMPs. However, previous studies mainly focus on peptide sequence attributes +and do not consider crucial structure information. In this paper, we propose a +latent sequence-structure model for designing AMPs (LSSAMP). LSSAMP exploits +multi-scale vector quantization in the latent space to represent secondary +structures (e.g. alpha helix and beta sheet). By sampling in the latent space, +LSSAMP can simultaneously generate peptides with ideal sequence attributes and +secondary structures. Experimental results show that the peptides generated by +LSSAMP have a high probability of antimicrobial activity. Our wet laboratory +experiments verified that two of the 21 candidates exhibit strong antimicrobial +activity. The code is released at https://github.com/dqwang122/LSSAMP. + +
+
+ comment: KDD 2023 +
+
+
+
+
+ + ♻ ☆ ISEE.U: Distributed online active target localization with unpredictable + targets + + +
+ This paper addresses target localization with an online active learning
+algorithm defined by distributed, simple and fast computations at each node,
+with no parameters to tune and where the estimate of the target position at
+each agent is asymptotically equal in expectation to the centralized
+maximum-likelihood estimator. ISEE.U takes noisy distances at each agent and
+finds a control that maximizes localization accuracy. We do not assume specific
+target dynamics and, thus, our method is robust when facing unpredictable
+targets. Each agent computes the control that maximizes overall target position
+accuracy via a local estimate of the Fisher Information Matrix. We compared the
+proposed method with a state-of-the-art algorithm and outperformed it when the
+target movements do not follow a prescribed trajectory, with 100x less
+computation time, even when our method runs on a single central CPU.
+ 
+
+
+
+
+
+ + ♻ ☆ Distributed Black-box Attack against Image Classification Cloud Services + + +
+ Black-box adversarial attacks can fool image classifiers into misclassifying +images without requiring access to model structure and weights. Recent studies +have reported attack success rates of over 95% with less than 1,000 queries. +The question then arises of whether black-box attacks have become a real threat +against IoT devices that rely on cloud APIs to achieve image classification. To +shed some light on this, note that prior research has primarily focused on +increasing the success rate and reducing the number of queries. However, +another crucial factor for black-box attacks against cloud APIs is the time +required to perform the attack. This paper applies black-box attacks directly +to cloud APIs rather than to local models, thereby avoiding mistakes made in +prior research that applied the perturbation before image encoding and +pre-processing. Further, we exploit load balancing to enable distributed +black-box attacks that can reduce the attack time by a factor of about five for +both local search and gradient estimation methods. + +
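The time saving from distribution comes from overlapping query latency across workers. The sketch below only illustrates that idea with a simulated endpoint; it is not the paper's attack code, and `query_cloud_api` is a stand-in for a real cloud classification API.

```python
# Concurrency sketch: dispatch query batches to a (simulated) cloud classifier in
# parallel so round-trip latency overlaps and total attack wall-clock time drops.
from concurrent.futures import ThreadPoolExecutor
import time

def query_cloud_api(image_batch):
    """Stand-in for a cloud classification endpoint (a network call in practice)."""
    time.sleep(0.2)                    # simulated round-trip latency
    return [hash(bytes(img)) % 10 for img in image_batch]

batches = [[bytes([i, j]) for j in range(4)] for i in range(10)]

start = time.time()
with ThreadPoolExecutor(max_workers=5) as pool:   # 5 load-balanced workers
    results = list(pool.map(query_cloud_api, batches))
print(f"{len(results)} batches in {time.time() - start:.2f}s")  # ~0.4s vs ~2s serially
```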
+
+ comment: 8 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Differential Privacy Amplification in Quantum and Quantum-inspired + Algorithms + + +
+ Differential privacy provides a theoretical framework for processing a +dataset about $n$ users, in a way that the output reveals a minimal information +about any single user. Such notion of privacy is usually ensured by +noise-adding mechanisms and amplified by several processes, including +subsampling, shuffling, iteration, mixing and diffusion. In this work, we +provide privacy amplification bounds for quantum and quantum-inspired +algorithms. In particular, we show for the first time, that algorithms running +on quantum encoding of a classical dataset or the outcomes of quantum-inspired +classical sampling, amplify differential privacy. Moreover, we prove that a +quantum version of differential privacy is amplified by the composition of +quantum channels, provided that they satisfy some mixing conditions. + +
+
+ comment: This article is superseded by arXiv:2307.04733 +
+
+
+
+
+ + ♻ ☆ Quantum Local Differential Privacy and Quantum Statistical Query Model + + +
+ Quantum statistical queries provide a theoretical framework for investigating +the computational power of a learner with limited quantum resources. This model +is particularly relevant in the current context, where available quantum +devices are subject to severe noise and have limited quantum memory. On the +other hand, the framework of quantum differential privacy demonstrates that +noise can, in some cases, benefit the computation, enhancing robustness and +statistical security. In this work, we establish an equivalence between quantum +statistical queries and quantum differential privacy in the local model, +extending a celebrated classical result to the quantum setting. Furthermore, we +derive strong data processing inequalities for the quantum relative entropy +under local differential privacy and apply this result to the task of +asymmetric hypothesis testing with restricted measurements. Finally, we +consider the task of quantum multi-party computation under local differential +privacy. As a proof of principle, we demonstrate that the parity function is +efficiently learnable in this model, whereas the corresponding classical task +requires exponentially many samples. + +
+
+ comment: This version significantly extends the previous one with new entropic + inequalities under local privacy, a private version of the quantum Stein's + Lemma and an application to private multi-party quantum computation +
+
+
+
+
+ + ♻ ☆ CVRecon: Rethinking 3D Geometric Feature Learning For Neural + Reconstruction ICCV 2023 + + +
+ Recent advances in neural reconstruction using posed image sequences have +made remarkable progress. However, due to the lack of depth information, +existing volumetric-based techniques simply duplicate 2D image features of the +object surface along the entire camera ray. We contend this duplication +introduces noise in empty and occluded spaces, posing challenges for producing +high-quality 3D geometry. Drawing inspiration from traditional multi-view +stereo methods, we propose an end-to-end 3D neural reconstruction framework +CVRecon, designed to exploit the rich geometric embedding in the cost volumes +to facilitate 3D geometric feature learning. Furthermore, we present +Ray-contextual Compensated Cost Volume (RCCV), a novel 3D geometric feature +representation that encodes view-dependent information with improved integrity +and robustness. Through comprehensive experiments, we demonstrate that our +approach significantly improves the reconstruction quality in various metrics +and recovers clear fine details of the 3D geometries. Our extensive ablation +studies provide insights into the development of effective 3D geometric feature +learning schemes. Project page: https://cvrecon.ziyue.cool/ + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Risk of re-identification for shared clinical speech recordings + + +
+ Large, curated datasets are required to leverage speech-based tools in +healthcare. These are costly to produce, resulting in increased interest in +data sharing. As speech can potentially identify speakers (i.e., voiceprints), +sharing recordings raises privacy concerns. We examine the re-identification +risk for speech recordings, without reference to demographic or metadata, using +a state-of-the-art speaker recognition system. We demonstrate that the risk is +inversely related to the number of comparisons an adversary must consider, +i.e., the search space. Risk is high for a small search space but drops as the +search space grows ($precision >0.85$ for $<1*10^{6}$ comparisons, $precision +<0.5$ for $>3*10^{6}$ comparisons). Next, we show that the nature of a speech +recording influences re-identification risk, with non-connected speech (e.g., +vowel prolongation) being harder to identify. Our findings suggest that speaker +recognition systems can be used to re-identify participants in specific +circumstances, but in practice, the re-identification risk appears low. + +
+
+ comment: 24 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Designing Discontinuities ICML + + +
+ Discontinuities can be fairly arbitrary but also cause a significant impact +on outcomes in social systems. Indeed, their arbitrariness is why they have +been used to infer causal relationships among variables in numerous settings. +Regression discontinuity from econometrics assumes the existence of a +discontinuous variable that splits the population into distinct partitions to +estimate the causal effects of a given phenomenon. Here we consider the design +of partitions for a given discontinuous variable to optimize a certain effect +previously studied using regression discontinuity. To do so, we propose a +quantization-theoretic approach to optimize the effect of interest, first +learning the causal effect size of a given discontinuous variable and then +applying dynamic programming for optimal quantization design of discontinuities +that balance the gain and loss in the effect size. We also develop a +computationally-efficient reinforcement learning algorithm for the dynamic +programming formulation of optimal quantization. We demonstrate our approach by +designing optimal time zone borders for counterfactuals of social capital, +social mobility, and health. This is based on regression discontinuity analyses +we perform on novel data, which may be of independent empirical interest in +showing a causal relationship between sunset time and social capital. + +
+
+ comment: A short version is accepted at the Neural Compression ICML Workshop, July
+ 19th, 2023
+
+
+
+
+
+ + ♻ ☆ FairDP: Certified Fairness with Differential Privacy + + +
+ This paper introduces FairDP, a novel mechanism designed to achieve certified +fairness with differential privacy (DP). FairDP independently trains models for +distinct individual groups, using group-specific clipping terms to assess and +bound the disparate impacts of DP. Throughout the training process, the +mechanism progressively integrates knowledge from group models to formulate a +comprehensive model that balances privacy, utility, and fairness in downstream +tasks. Extensive theoretical and empirical analyses validate the efficacy of +FairDP and improved trade-offs between model utility, privacy, and fairness +compared with existing methods. + +
+
+
+
+
+ + ♻ ☆ Average-Hard Attention Transformers are Constant-Depth Uniform Threshold + Circuits + + +
+ Transformers have emerged as a widely used neural network model for various +natural language processing tasks. Previous research explored their +relationship with constant-depth threshold circuits, making two assumptions: +average-hard attention and logarithmic precision for internal computations +relative to input length. Merrill et al. (2022) prove that average-hard +attention transformers recognize languages that fall within the complexity +class TC0, denoting the set of languages that can be recognized by +constant-depth polynomial-size threshold circuits. Likewise, Merrill and +Sabharwal (2023) show that log-precision transformers recognize languages +within the class of uniform TC0. This shows that both transformer models can be +simulated by constant-depth threshold circuits, with the latter being more +robust due to generating a uniform circuit family. Our paper shows that the +first result can be extended to yield uniform circuits as well. + +
+
+
+
+
+ + ♻ ☆ Empirical Analysis of a Segmentation Foundation Model in Prostate + Imaging + + +
+ Most state-of-the-art techniques for medical image segmentation rely on +deep-learning models. These models, however, are often trained on +narrowly-defined tasks in a supervised fashion, which requires expensive +labeled datasets. Recent advances in several machine learning domains, such as +natural language generation have demonstrated the feasibility and utility of +building foundation models that can be customized for various downstream tasks +with little to no labeled data. This likely represents a paradigm shift for +medical imaging, where we expect that foundation models may shape the future of +the field. In this paper, we consider a recently developed foundation model for +medical image segmentation, UniverSeg. We conduct an empirical evaluation study +in the context of prostate imaging and compare it against the conventional +approach of training a task-specific segmentation model. Our results and +discussion highlight several important factors that will likely be important in +the development and adoption of foundation models for medical image +segmentation. + +
+
+ comment: Accepted +
+
+
+
+
+ + ♻ ☆ Critical Points ++: An Agile Point Cloud Importance Measure for Robust + Classification, Adversarial Defense and Explainable AI + + +
+ The ability to cope accurately and fast with Out-Of-Distribution (OOD) +samples is crucial in real-world safety demanding applications. In this work we +first study the interplay between critical points of 3D point clouds and OOD +samples. Our findings are that common corruptions and outliers are often +interpreted as critical points. We generalize the notion of critical points +into importance measures. We show that training a classification network based +only on less important points dramatically improves robustness, at a cost of +minor performance loss on the clean set. We observe that normalized entropy is +highly informative for corruption analysis. An adaptive threshold based on +normalized entropy is suggested for selecting the set of uncritical points. Our +proposed importance measure is extremely fast to compute. We show it can be +used for a variety of applications, such as Explainable AI (XAI), Outlier +Removal, Uncertainty Estimation, Robust Classification and Adversarial Defense. +We reach SOTA results on the two latter tasks. Code is available at: +https://github.com/yossilevii100/critical_points2 + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ PACS: Prediction and analysis of cancer subtypes from multi-omics data + based on a multi-head attention mechanism model + + +
+ Due to the high heterogeneity and clinical characteristics of cancer, there
+are significant differences in multi-omics data and clinical characteristics
+among different cancer subtypes. Therefore, accurate classification of cancer
+subtypes can help doctors choose the most appropriate treatment options,
+improve treatment outcomes, and provide more accurate patient survival
+predictions. In this study, we propose a supervised multi-head attention
+mechanism model (SMA) to classify cancer subtypes successfully. The attention
+mechanism and feature sharing module of the SMA model can successfully learn
+the global and local feature information of multi-omics data. Second, it
+enriches the parameters of the model by deeply fusing the Siamese multi-head
+attention encoders through the fusion module. Validated by extensive
+experiments, the SMA model achieves the highest accuracy, macro F1, and weighted
+F1, and accurately classifies cancer subtypes in simulated, single-cell, and
+cancer multi-omics datasets compared to AE-, CNN-, and GNN-based
+models. Therefore, we contribute to future research on multi-omics data using
+our attention-based approach.
+ 
+
+
+ comment: Submitted to BIBM2023 +
+
+
+
+
+ + ♻ ☆ TACOformer:Token-channel compounded Cross Attention for Multimodal + Emotion Recognition IJCAI 2023 + + +
+ Recently, emotion recognition based on physiological signals has emerged as a
+field with intensive research. The utilization of multi-modal, multi-channel
+physiological signals has significantly improved the performance of emotion
+recognition systems, due to their complementarity. However, effectively
+integrating emotion-related semantic information from different modalities and
+capturing inter-modal dependencies remains a challenging issue. Many existing
+multimodal fusion methods ignore either token-to-token or channel-to-channel
+correlations of multichannel signals from different modalities, which limits
+the classification capability of the models to some extent. In this paper, we
+propose a comprehensive perspective of multimodal fusion that integrates
+channel-level and token-level cross-modal interactions. Specifically, we
+introduce a unified cross attention module called Token-chAnnel COmpound (TACO)
+Cross Attention to perform multimodal fusion, which simultaneously models
+channel-level and token-level dependencies between modalities. Additionally, we
+propose a 2D position encoding method to preserve information about the spatial
+distribution of EEG signal channels, and we use two transformer encoders ahead
+of the fusion module to capture long-term temporal dependencies from the EEG
+signal and the peripheral physiological signal, respectively.
+Subject-independent experiments on the emotional datasets DEAP and Dreamer
+demonstrate that the proposed model achieves state-of-the-art performance.
+
+
+
+ comment: Accepted by IJCAI 2023- AI4TS workshop +
+
+
+
+
+ + ♻ ☆ Multi-scale Target-Aware Framework for Constrained Image Splicing + Detection and Localization + + +
+ Constrained image splicing detection and localization (CISDL) is a +fundamental task of multimedia forensics, which detects splicing operation +between two suspected images and localizes the spliced region on both images. +Recent works regard it as a deep matching problem and have made significant +progress. However, existing frameworks typically perform feature extraction and +correlation matching as separate processes, which may hinder the model's +ability to learn discriminative features for matching and can be susceptible to +interference from ambiguous background pixels. In this work, we propose a +multi-scale target-aware framework to couple feature extraction and correlation +matching in a unified pipeline. In contrast to previous methods, we design a +target-aware attention mechanism that jointly learns features and performs +correlation matching between the probe and donor images. Our approach can +effectively promote the collaborative learning of related patches, and perform +mutual promotion of feature learning and correlation matching. Additionally, in +order to handle scale transformations, we introduce a multi-scale projection +method, which can be readily integrated into our target-aware framework that +enables the attention process to be conducted between tokens containing +information of varying scales. Our experiments demonstrate that our model, +which uses a unified pipeline, outperforms state-of-the-art methods on several +benchmark datasets and is robust against scale transformations. + +
+
+ comment: accepted by ACMMM2023 +
+
+
+
+
+ + ♻ ☆ AMD: Autoregressive Motion Diffusion + + +
+ Human motion generation aims to produce plausible human motion sequences +according to various conditional inputs, such as text or audio. Despite the +feasibility of existing methods in generating motion based on short prompts and +simple motion patterns, they encounter difficulties when dealing with long +prompts or complex motions. The challenges are two-fold: 1) the scarcity of +human motion-captured data for long prompts and complex motions. 2) the high +diversity of human motions in the temporal domain and the substantial +divergence of distributions from conditional modalities, leading to a +many-to-many mapping problem when generating motion with complex and long +texts. In this work, we address these gaps by 1) elaborating the first dataset +pairing long textual descriptions and 3D complex motions (HumanLong3D), and 2) +proposing an autoregressive motion diffusion model (AMD). Specifically, AMD +integrates the text prompt at the current timestep with the text prompt and +action sequences at the previous timestep as conditional information to predict +the current action sequences in an iterative manner. Furthermore, we present +its generalization for X-to-Motion with "No Modality Left Behind", enabling the +generation of high-definition and high-fidelity human motions based on +user-defined modality input. + +
+
+
+
+
+ + ♻ ☆ V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by + Connecting Foundation Models + + +
+ Building artificial intelligence (AI) systems on top of a set of foundation +models (FMs) is becoming a new paradigm in AI research. Their representative +and generative abilities learnt from vast amounts of data can be easily adapted +and transferred to a wide range of downstream tasks without extra training from +scratch. However, leveraging FMs in cross-modal generation remains +under-researched when audio modality is involved. On the other hand, +automatically generating semantically-relevant sound from visual input is an +important problem in cross-modal generation studies. To solve this +vision-to-audio (V2A) generation problem, existing methods tend to design and +build complex systems from scratch using modestly sized datasets. In this +paper, we propose a lightweight solution to this problem by leveraging +foundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate +the domain gap between the latent space of the visual CLIP and the auditory +CLAP models. Then we propose a simple yet effective mapper mechanism +(V2A-Mapper) to bridge the domain gap by translating the visual input between +CLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained +audio generative FM AudioLDM is adopted to produce high-fidelity and +visually-aligned sound. Compared to previous approaches, our method only +requires a quick training of the V2A-Mapper. We further analyze and conduct +extensive experiments on the choice of the V2A-Mapper and show that a +generative mapper is better at fidelity and variability (FD) while a regression +mapper is slightly better at relevance (CS). Both objective and subjective +evaluation on two V2A datasets demonstrate the superiority of our proposed +method compared to current state-of-the-art approaches - trained with 86% fewer +parameters but achieving 53% and 19% improvement in FD and CS, respectively. + +
+
+ comment: 13 pages, 10 figures. Demo page: https://v2a-mapper.github.io/ +
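+
+ A rough sketch of the regression-style mapper idea: a small network that
+translates frozen CLIP image embeddings toward the CLAP audio-embedding space.
+The dimensions, architecture, and training loop are illustrative assumptions,
+not the paper's configuration.
+
+```python
+import torch
+import torch.nn as nn
+
+CLIP_DIM, CLAP_DIM = 512, 512   # assumed sizes; they depend on the chosen checkpoints
+
+class V2AMapper(nn.Module):
+    """Regression mapper from CLIP image-embedding space to CLAP audio-embedding
+    space (the paper also studies a generative variant)."""
+    def __init__(self, hidden: int = 1024):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(CLIP_DIM, hidden), nn.GELU(),
+            nn.Linear(hidden, hidden), nn.GELU(),
+            nn.Linear(hidden, CLAP_DIM),
+        )
+
+    def forward(self, clip_emb):
+        return self.net(clip_emb)
+
+mapper = V2AMapper()
+opt = torch.optim.AdamW(mapper.parameters(), lr=1e-4)
+clip_emb = torch.randn(32, CLIP_DIM)   # stand-in for frozen CLIP image embeddings
+clap_emb = torch.randn(32, CLAP_DIM)   # stand-in for frozen CLAP audio embeddings
+for _ in range(5):
+    loss = nn.functional.mse_loss(mapper(clip_emb), clap_emb)
+    opt.zero_grad(); loss.backward(); opt.step()
+# The predicted CLAP embedding would then condition a pretrained AudioLDM model.
+```
+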
+
+
+
+
+ + ♻ ☆ Towards Balanced Active Learning for Multimodal Classification + + +
+ Training multimodal networks requires a vast amount of data due to their +larger parameter space compared to unimodal networks. Active learning is a +widely used technique for reducing data annotation costs by selecting only +those samples that could contribute to improving model performance. However, +current active learning strategies are mostly designed for unimodal tasks, and +when applied to multimodal data, they often result in biased sample selection +from the dominant modality. This unfairness hinders balanced multimodal +learning, which is crucial for achieving optimal performance. To address this +issue, we propose three guidelines for designing a more balanced multimodal +active learning strategy. Following these guidelines, a novel approach is +proposed to achieve more fair data selection by modulating the gradient +embedding with the dominance degree among modalities. Our studies demonstrate +that the proposed method achieves more balanced multimodal learning by avoiding +greedy sample selection from the dominant modality. Our approach outperforms +existing active learning strategies on a variety of multimodal classification +tasks. Overall, our work highlights the importance of balancing sample +selection in multimodal active learning and provides a practical solution for +achieving more balanced active learning for multimodal classification. + +
+
+ comment: 12 pages, accepted by ACMMM 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 27 + +
+
+
+ + ☆ LibriSQA: Pioneering Free-form and Open-ended Spoken Question Answering + with a Novel Dataset and Framework + + +
+ While Large Language Models (LLMs) have demonstrated commendable performance +across a myriad of domains and tasks, existing LLMs still exhibit a palpable +deficit in handling multimodal functionalities, especially for the Spoken +Question Answering (SQA) task which necessitates precise alignment and deep +interaction between speech and text features. To address the SQA challenge on +LLMs, we initially curated the free-form and open-ended LibriSQA dataset from +Librispeech, comprising Part I with natural conversational formats and Part II +encompassing multiple-choice questions followed by answers and analytical +segments. Both parts collectively include 107k SQA pairs that cover various +topics. Given the evident paucity of existing speech-text LLMs, we propose a +lightweight, end-to-end framework to execute the SQA task on the LibriSQA, +witnessing significant results. By reforming ASR into the SQA format, we +further substantiate our framework's capability in handling ASR tasks. Our +empirical findings bolster the LLMs' aptitude for aligning and comprehending +multimodal information, paving the way for the development of universal +multimodal LLMs. The dataset and demo can be found at +https://github.com/ZihanZhaoSJTU/LibriSQA. + +
+
+
+
+
+ + ☆ A Human-on-the-Loop Optimization Autoformalism Approach for + Sustainability + + +
+ This paper outlines a natural conversational approach to solving personalized +energy-related problems using large language models (LLMs). We focus on +customizable optimization problems that necessitate repeated solving with +slight variations in modeling and are user-specific, hence posing a challenge +to devising a one-size-fits-all model. We put forward a strategy that augments +an LLM with an optimization solver, enhancing its proficiency in understanding +and responding to user specifications and preferences while providing nonlinear +reasoning capabilities. Our approach pioneers the novel concept of human-guided +optimization autoformalism, translating a natural language task specification +automatically into an optimization instance. This enables LLMs to analyze, +explain, and tackle a variety of instance-specific energy-related problems, +pushing beyond the limits of current prompt-based techniques. + Our research encompasses various commonplace tasks in the energy sector, from +electric vehicle charging and Heating, Ventilation, and Air Conditioning (HVAC) +control to long-term planning problems such as cost-benefit evaluations for +installing rooftop solar photovoltaics (PVs) or heat pumps. This pilot study +marks an essential stride towards the context-based formulation of optimization +using LLMs, with the potential to democratize optimization processes. As a +result, stakeholders are empowered to optimize their energy consumption, +promoting sustainable energy practices customized to personal needs and +preferences. + +
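+
+ To make the autoformalism idea concrete, here is a toy sketch: a structured
+optimization instance of the kind an LLM-based autoformalizer might emit for an
+EV-charging request, handed to an off-the-shelf linear-programming solver. The
+spec format, field names, and numbers are hypothetical and not from the paper.
+
+```python
+import numpy as np
+from scipy.optimize import linprog
+
+# Hypothetical structured instance for: "charge my EV with 30 kWh overnight as
+# cheaply as possible, drawing at most 7 kW per hour".
+spec = {
+    "horizon_hours": 8,
+    "energy_needed_kwh": 30.0,
+    "max_rate_kw": 7.0,
+    "price_per_kwh": [0.30, 0.28, 0.22, 0.18, 0.17, 0.19, 0.25, 0.31],
+}
+
+T = spec["horizon_hours"]
+c = np.array(spec["price_per_kwh"])            # minimize total charging cost
+A_eq = np.ones((1, T))                         # delivered energy must sum to the target
+b_eq = [spec["energy_needed_kwh"]]
+bounds = [(0.0, spec["max_rate_kw"])] * T      # per-hour charging limits
+
+res = linprog(c, A_eq=A_eq, b_eq=b_eq, bounds=bounds, method="highs")
+print("schedule (kWh/h):", np.round(res.x, 2), "cost:", round(res.fun, 2))
+```
+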
+
+
+
+
+ + ☆ Algorithm of Thoughts: Enhancing Exploration of Ideas in Large Language + Models + + +
+ Current literature, aiming to surpass the "Chain-of-Thought" approach, often +resorts to an external modus operandi involving halting, modifying, and then +resuming the generation process to boost Large Language Models' (LLMs) +reasoning capacities. This mode escalates the number of query requests, leading +to increased costs, memory, and computational overheads. Addressing this, we +propose the Algorithm of Thoughts -- a novel strategy that propels LLMs through +algorithmic reasoning pathways, pioneering a new mode of in-context learning. +By employing algorithmic examples, we exploit the innate recurrence dynamics of +LLMs, expanding their idea exploration with merely one or a few queries. Our +technique outperforms earlier single-query methods and stands on par with a +recent multi-query strategy that employs an extensive tree search algorithm. +Intriguingly, our results suggest that instructing an LLM using an algorithm +can lead to performance surpassing that of the algorithm itself, hinting at +LLM's inherent ability to weave its intuition into optimized searches. We probe +into the underpinnings of our method's efficacy and its nuances in application. + +
+
+
+
+
+ + ☆ cantnlp@LT-EDI@RANLP-2023: Homophobia/Transphobia Detection in Social + Media Comments using Spatio-Temporally Retrained Language Models + + +
+ This paper describes our multiclass classification system developed as part
+of the LTEDI@RANLP-2023 shared task. We used a BERT-based language model to
+detect homophobic and transphobic content in social media comments across five
+language conditions: English, Spanish, Hindi, Malayalam, and Tamil. We
+retrained a transformer-based cross-language pretrained language model,
+XLMRoBERTa, with spatially and temporally relevant social media language data.
+We also retrained a subset of models with simulated script-mixed social media
+language data with varied performance. We developed the best-performing
+seven-label classification system for Malayalam based on weighted macro
+averaged F1 score (ranked first out of six) with variable performance for other
+language and class-label conditions. We found the inclusion of this
+spatio-temporal data improved the classification performance for all language
+and task conditions when compared with the baseline. The results suggest that
+transformer-based language classification systems are sensitive to
+register-specific and language-specific retraining.
+
+
+
+
+
+
+ + ☆ Imaginations of WALL-E : Reconstructing Experiences with an + Imagination-Inspired Module for Advanced AI Systems + + +
+ In this paper, we introduce a novel Artificial Intelligence (AI) system
+inspired by the philosophical and psychoanalytical concept of imagination as a
+``Re-construction of Experiences". Our AI system is equipped with an
+imagination-inspired module that bridges the gap between textual inputs and
+other modalities, enriching the derived information based on previously learned
+experiences. A unique feature of our system is its ability to formulate
+independent perceptions of inputs. This leads to unique interpretations of a
+concept that may differ from human interpretations but are equally valid, a
+phenomenon we term ``Interpretable Misunderstanding". We employ large-scale
+models, specifically a Multimodal Large Language Model (MLLM), enabling our
+proposed system to extract meaningful information across modalities while
+primarily remaining unimodal. We evaluated our system against other large
+language models across multiple tasks, including emotion recognition and
+question-answering, using a zero-shot methodology to avoid the bias that
+fine-tuning could introduce. Significantly, our system outperformed the best
+Large Language Models (LLM) on the MELD, IEMOCAP, and CoQA datasets, achieving
+Weighted F1 (WF1) scores of 46.74% and 25.23%, and an Overall F1 (OF1) score of
+17%, respectively, compared to 22.89%, 12.28%, and 7% from the well-performing
+LLM. The goal is to go beyond the statistical view of language processing and
+tie it to human concepts such as philosophy and psychoanalysis. This work
+represents a significant advancement in the development of imagination-inspired
+AI systems, opening new possibilities for AI to generate deep and interpretable
+information across modalities, thereby enhancing human-AI interaction.
+
+
+
+ comment: 18 pages, +
+
+
+
+
+ + ☆ A Study on Robustness and Reliability of Large Language Model Code + Generation + + +
+ Recently, large language models (LLMs) have shown an extraordinary ability to
+understand natural language and generate programming code. It has become common
+practice for software engineers to consult LLMs when encountering coding
+questions. Although efforts have been made to avoid syntax errors and align the
+code with the intended semantics, the reliability and robustness of code
+generation from LLMs have not yet been thoroughly studied. Executable code is
+not equivalent to reliable and robust code, especially in the context of
+real-world software development. The misuse of APIs in the generated code could
+lead to severe problems, such as resource leaks and program crashes. To make
+things worse, the users of LLM code generation services are actually the
+developers most vulnerable to such seemingly correct code -- they are often
+novice developers who are not familiar with the APIs for which the LLMs
+generate code. Therefore, they can hardly spot the API misuse in the code
+generated by LLMs, which further facilitates incorrect code being applied in
+real-world software. Existing code evaluation benchmarks and datasets focus on
+crafting small tasks, such as programming questions in coding interviews, which
+deviate from the problems for which developers ask LLMs for real-world coding
+help. To fill this missing piece, in this work, we propose RobustAPI, a dataset
+for evaluating the reliability and robustness of code generated by LLMs. We
+collect 1208 coding questions from StackOverflow on 24 representative Java
+APIs. We summarize the common misuse patterns of these APIs and evaluate them
+on current popular LLMs. The evaluation results show that even for GPT-4, 62%
+of the generated code contains API misuses, which would cause unexpected
+consequences if the code is introduced into real-world software.
+
+
+
+
+
+
+ + ☆ Economic Policy Uncertainty: A Review on Applications and Measurement + Methods with Focus on Text Mining Methods + + +
+ Economic Policy Uncertainty (EPU) represents the uncertainty realized by the +investors during economic policy alterations. EPU is a critical indicator in +economic studies to predict future investments, the unemployment rate, and +recessions. EPU values can be estimated based on financial parameters directly +or implied uncertainty indirectly using the text mining methods. Although EPU +is a well-studied topic within the economy, the methods utilized to measure it +are understudied. In this article, we define the EPU briefly and review the +methods used to measure the EPU, and survey the areas influenced by the changes +in EPU level. We divide the EPU measurement methods into three major groups +with respect to their input data. Examples of each group of methods are +enlisted, and the pros and cons of the groups are discussed. Among the EPU +measures, text mining-based ones are dominantly studied. These methods measure +the realized uncertainty by taking into account the uncertainty represented in +the news and publicly available sources of financial information. Finally, we +survey the research areas that rely on measuring the EPU index with the hope +that studying the impacts of uncertainty would attract further attention of +researchers from various research fields. In addition, we propose a list of +future research approaches focusing on measuring EPU using textual material. + +
+
+ comment: JEL Classification: C53, C38, A13, O38, H50 +
+
+
+
+
+ + ☆ CharacterChat: Learning towards Conversational AI with Personalized + Social Support + + +
+ In our modern, fast-paced, and interconnected world, the importance of mental
+well-being has grown into a matter of great urgency. However, traditional
+methods such as Emotional Support Conversations (ESC) face challenges in
+effectively addressing a diverse range of individual personalities. In
+response, we introduce the Social Support Conversation (S2Conv) framework. It
+comprises a series of support agents and an interpersonal matching mechanism,
+linking individuals with persona-compatible virtual supporters. Utilizing
+persona decomposition based on the MBTI (Myers-Briggs Type Indicator), we have
+created the MBTI-1024 Bank, a group of virtual characters with distinct
+profiles. Through improved role-playing prompts with behavior presets and
+dynamic memory, we facilitate the development of the MBTI-S2Conv dataset, which
+contains conversations between the characters in the MBTI-1024 Bank. Building
+upon these foundations, we present CharacterChat, a comprehensive S2Conv
+system, which includes a conversational model driven by personas and memories,
+along with an interpersonal matching plugin model that dispatches the optimal
+supporters from the MBTI-1024 Bank for individuals with specific personas.
+Empirical results indicate the remarkable efficacy of CharacterChat in
+providing personalized social support and highlight the substantial advantages
+derived from interpersonal matching. The source code is available at
+\url{https://github.com/morecry/CharacterChat}.
+
+
+
+ comment: 10 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ Scaled-up Discovery of Latent Concepts in Deep NLP Models + + +
+ Pre-trained language models (pLMs) learn intricate patterns and contextual +dependencies via unsupervised learning on vast text data, driving breakthroughs +across NLP tasks. Despite these achievements, these models remain black boxes, +necessitating research into understanding their decision-making processes. +Recent studies explore representation analysis by clustering latent spaces +within pre-trained models. However, these approaches are limited in terms of +scalability and the scope of interpretation because of high computation costs +of clustering algorithms. This study focuses on comparing clustering algorithms +for the purpose of scaling encoded concept discovery of representations from +pLMs. Specifically, we compare three algorithms in their capacity to unveil the +encoded concepts through their alignment to human-defined ontologies: +Agglomerative Hierarchical Clustering, Leaders Algorithm, and K-Means +Clustering. Our results show that K-Means has the potential to scale to very +large datasets, allowing rich latent concept discovery, both on the word and +phrase level. + +
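+
+ A minimal sketch of the scalable clustering step, using MiniBatchKMeans on
+stand-in token representations; the embedding source, dimensionality, and
+number of clusters are illustrative assumptions rather than the study's setup.
+
+```python
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+# Stand-in for contextual token representations extracted from a pre-trained LM
+# (in practice: hidden states of token occurrences, e.g. 768-d for BERT-base).
+rng = np.random.default_rng(0)
+token_embeddings = rng.normal(size=(20_000, 768)).astype(np.float32)
+
+# Mini-batch K-Means keeps memory and compute manageable at large scale, which
+# is the scalability argument made for K-Means in the abstract.
+kmeans = MiniBatchKMeans(n_clusters=200, batch_size=4096, n_init=3, random_state=0)
+labels = kmeans.fit_predict(token_embeddings)
+
+# Each cluster is a candidate "latent concept"; its member tokens would then be
+# aligned against human-defined ontology categories.
+print("largest concept cluster:", np.bincount(labels, minlength=200).max(), "tokens")
+```
+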
+
+
+
+
+ + ☆ How Good Are Large Language Models at Out-of-Distribution Detection? + + +
+ Out-of-distribution (OOD) detection plays a vital role in enhancing the +reliability of machine learning (ML) models. The emergence of large language +models (LLMs) has catalyzed a paradigm shift within the ML community, +showcasing their exceptional capabilities across diverse natural language +processing tasks. While existing research has probed OOD detection with smaller +encoder-based Transformers like BERT and RoBERTa, the stark differences in +scales, pre-training objectives, and inference paradigms call into question the +applicability of these findings to LLMs. This paper embarks on a pioneering +empirical investigation of OOD detection in the domain of LLMs, focusing on +LLaMA series ranging from 7B to 65B in size. We thoroughly evaluate +commonly-used OOD detectors, scrutinizing their performance in both zero-grad +and fine-tuning scenarios. Notably, we alter previous discriminative +in-distribution fine-tuning into generative fine-tuning, aligning the +pre-training objective of LLMs with downstream tasks. Our findings unveil that +a simple cosine distance OOD detector demonstrates superior efficacy, +outperforming other OOD detectors. We provide an intriguing explanation for +this phenomenon by highlighting the isotropic nature of the embedding spaces of +LLMs, which distinctly contrasts with the anisotropic property observed in +smaller BERT family models. The new insight enhances our understanding of how +LLMs detect OOD data, thereby enhancing their adaptability and reliability in +dynamic environments. + +
+
+ comment: Work in progress +
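+
+ One common way to instantiate a cosine-distance OOD detector of the kind
+evaluated here is to score each test embedding by its distance to the closest
+in-distribution embedding; a toy sketch follows (the paper's exact scoring
+protocol may differ).
+
+```python
+import numpy as np
+
+def cosine_ood_scores(id_feats: np.ndarray, test_feats: np.ndarray) -> np.ndarray:
+    """OOD score = 1 - max cosine similarity to the in-distribution features;
+    higher scores mean 'more out-of-distribution'."""
+    id_norm = id_feats / np.linalg.norm(id_feats, axis=1, keepdims=True)
+    test_norm = test_feats / np.linalg.norm(test_feats, axis=1, keepdims=True)
+    sims = test_norm @ id_norm.T          # pairwise cosine similarities
+    return 1.0 - sims.max(axis=1)
+
+# Toy stand-ins for sentence embeddings taken from an LLM's hidden states.
+rng = np.random.default_rng(0)
+id_feats = rng.normal(size=(500, 64))
+ood_feats = rng.normal(loc=3.0, size=(100, 64))          # shifted distribution
+print(cosine_ood_scores(id_feats, id_feats[:100]).mean(),   # near 0 for ID data
+      cosine_ood_scores(id_feats, ood_feats).mean())        # larger for OOD data
+```
+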
+
+
+
+
+ + ☆ StableLLaVA: Enhanced Visual Instruction Tuning with Synthesized + Image-Dialogue Data + + +
+ The remarkable multimodal capabilities demonstrated by OpenAI's GPT-4 have
+sparked significant interest in the development of multimodal Large Language
+Models (LLMs). A primary research objective of such models is to align visual
+and textual modalities effectively while comprehending human instructions.
+Current methodologies often rely on annotations derived from benchmark datasets
+to construct image-dialogue datasets for training purposes, akin to instruction
+tuning in LLMs. However, these datasets often exhibit domain bias, potentially
+constraining the generative capabilities of the models. In an effort to
+mitigate these limitations, we propose a novel data collection methodology that
+synchronously synthesizes images and dialogues for visual instruction tuning.
+This approach harnesses the power of generative models, marrying the abilities
+of ChatGPT and text-to-image generative models to yield a diverse and
+controllable dataset with varied image content. This not only provides greater
+flexibility compared to existing methodologies but also significantly enhances
+several model capabilities. Our research includes comprehensive experiments
+conducted on various datasets using the open-source LLAVA model as a testbed
+for our proposed pipeline. Our results underscore marked enhancements across
+more than ten commonly assessed capabilities.
+
+
+
+ comment: Project page: https://github.com/icoz69/StableLLAVA +
+
+
+
+
+ + ☆ LMTuner: An user-friendly and highly-integrable Training Framework for + fine-tuning Large Language Models + + +
+ With the burgeoning development in the realm of large language models (LLMs),
+the demand for efficient incremental training tailored to specific industries
+and domains continues to increase. Currently, the predominantly employed
+frameworks lack a modular design, so it often takes a lot of coding work to
+kickstart the training of an LLM. To address this, we present "LMTuner", a
+highly usable, integrable, and scalable system for training LLMs expeditiously
+and with minimal user input. LMTuner comprises three main modules - the
+Interaction, Training, and Inference Modules. We advocate that LMTuner's
+usability and integrality alleviate the complexities in training large language
+models. Remarkably, even a novice user could commence training large language
+models within five minutes. Furthermore, it integrates the DeepSpeed framework
+and supports Efficient Fine-Tuning methodologies like Low Rank Adaptation
+(LoRA), Quantized LoRA (QLoRA), etc., enabling the training of language models
+scaling from 300M to a whopping 130B parameters using a single server.
+LMTuner's homepage (https://wengsyx.github.io/LMTuner/) and screencast video
+(https://youtu.be/nsXmWOmN3rE) are now publicly available.
+
+
+
+
+
+
+ + ☆ Activation Addition: Steering Language Models Without Optimization + + +
+ Reliably controlling the behavior of large language models (LLMs) is a +pressing open problem. Existing methods include supervised finetuning, +reinforcement learning from human feedback (RLHF), prompt engineering and +guided decoding. We instead investigate activation engineering: modifying +activations at inference time to predictably alter model behavior. In +particular, we bias the forward pass with an added 'steering vector' implicitly +specified through natural language. + Unlike past work which learned these steering vectors (Subramani, Suresh, and +Peters 2022; Hernandez, Li, and Andreas 2023), our Activation Addition (ActAdd) +method computes them by taking the activation differences that result from +pairs of prompts. We demonstrate ActAdd on GPT-2 on OpenWebText and ConceptNet. +Our inference-time approach yields control over high-level properties of output +and preserves off-target model performance. It involves far less compute and +implementation effort compared to finetuning or RLHF, allows users to provide +natural language specifications, and its overhead scales naturally with model +size. + +
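+
+ A simplified sketch of the activation-addition idea on GPT-2 with Hugging Face
+transformers: build a steering vector from the activation difference of a
+prompt pair at one block, then add it during generation. The layer index,
+coefficient, and prompt pair are illustrative choices, and the paper's
+injection procedure (e.g. prompt padding and position handling) is more careful
+than this sketch.
+
+```python
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+tok = GPT2Tokenizer.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
+block = model.transformer.h[6]        # illustrative injection layer
+COEFF = 4.0                           # illustrative scaling coefficient
+
+captured = {}
+def capture(module, inputs, output):
+    captured["h"] = output[0][0, -1].detach()   # residual stream at the last token
+
+def run(prompt):
+    ids = tok(prompt, return_tensors="pt").input_ids
+    with torch.no_grad():
+        model(ids)
+
+handle = block.register_forward_hook(capture)
+run("Love"); h_pos = captured["h"].clone()
+run("Hate"); h_neg = captured["h"].clone()
+handle.remove()
+steer = COEFF * (h_pos - h_neg)       # steering vector from the prompt pair
+
+def add_steering(module, inputs, output):
+    return (output[0] + steer,) + output[1:]    # add the vector on every forward pass
+
+handle = block.register_forward_hook(add_steering)
+ids = tok("I think dogs are", return_tensors="pt").input_ids
+out = model.generate(ids, max_new_tokens=20, do_sample=False)
+handle.remove()
+print(tok.decode(out[0], skip_special_tokens=True))
+```
+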
+
+
+
+
+ + ☆ WMFormer++: Nested Transformer for Visible Watermark Removal via Implict + Joint Learning + + +
+ Watermarking serves as a widely adopted approach to safeguard media
+copyright. In parallel, the research focus has extended to watermark removal
+techniques, offering an adversarial means to enhance watermark robustness and
+foster advancements in the watermarking field. Existing watermark removal
+methods often rely on UNet architectures with multiple decoder branches -- one
+for watermark localization and the other for background image restoration.
+These methods involve complex module designs to guide information flow for
+respective tasks, which can lead to suboptimal performance and an overly
+cumbersome model. To simplify the existing framework, we propose a novel
+Transformer-based approach with a unified decoder branch, treating watermark
+extraction and background restoration as a single task and allowing the network
+to learn information flow between them without artificial design patterns.
+Additionally, we utilize nested structures to facilitate multi-scale feature
+fusion, forming a parallel ensemble of nested structures that constitutes the
+UNet. Supervision is applied to UNets with varying depths to facilitate
+knowledge learning across all levels. Extensive experiments are conducted on
+various challenging benchmarks to validate the effectiveness of our proposed
+method. The results demonstrate that our approach achieves state-of-the-art
+performance and produces high-quality images.
+
+
+
+
+
+
+ + ☆ FoodGPT: A Large Language Model in Food Testing Domain with Incremental + Pre-training and Knowledge Graph Prompt + + +
+ Currently, the construction of large language models in specific domains is
+done by fine-tuning on a base model. Some models also incorporate knowledge
+bases without the need for pre-training. This is because the base model already
+contains domain-specific knowledge acquired during the pre-training process. We
+build a large language model for food testing. Unlike the above approaches, a
+significant amount of data in this domain exists as scanned domain-standard
+documents. In addition, there is a large amount of untrained structured
+knowledge. Therefore, we introduce an incremental pre-training step to inject
+this knowledge into a large language model. In this paper, we propose a method
+for handling structured knowledge and scanned documents in incremental
+pre-training. To overcome the problem of machine hallucination, we construct a
+knowledge graph to serve as an external knowledge base for supporting retrieval
+in the large language model. It is worth mentioning that this paper is a
+technical report of our pre-release version, and we will report our specific
+experimental data in future versions.
+
+
+
+
+
+
+ + ☆ FashionNTM: Multi-turn Fashion Image Retrieval via Cascaded Memory ICCV-2023 + + +
+ Multi-turn textual feedback-based fashion image retrieval focuses on a +real-world setting, where users can iteratively provide information to refine +retrieval results until they find an item that fits all their requirements. In +this work, we present a novel memory-based method, called FashionNTM, for such +a multi-turn system. Our framework incorporates a new Cascaded Memory Neural +Turing Machine (CM-NTM) approach for implicit state management, thereby +learning to integrate information across all past turns to retrieve new images, +for a given turn. Unlike vanilla Neural Turing Machine (NTM), our CM-NTM +operates on multiple inputs, which interact with their respective memories via +individual read and write heads, to learn complex relationships. Extensive +evaluation results show that our proposed method outperforms the previous +state-of-the-art algorithm by 50.5%, on Multi-turn FashionIQ -- the only +existing multi-turn fashion dataset currently, in addition to having a relative +improvement of 12.6% on Multi-turn Shoes -- an extension of the single-turn +Shoes dataset that we created in this work. Further analysis of the model in a +real-world interactive setting demonstrates two important capabilities of our +model -- memory retention across turns, and agnosticity to turn order for +non-contradictory feedback. Finally, user study results show that images +retrieved by FashionNTM were favored by 83.1% over other multi-turn models. +Project page: https://sites.google.com/eng.ucsd.edu/fashionntm + +
+
+ comment: Paper accepted at ICCV-2023 +
+
+
+
+
+ + ☆ Head-to-Tail: How Knowledgeable are Large Language Models (LLM)? A.K.A. + Will LLMs Replace Knowledge Graphs? + + +
+ Since the recent prosperity of Large Language Models (LLMs), there have been +interleaved discussions regarding how to reduce hallucinations from LLM +responses, how to increase the factuality of LLMs, and whether Knowledge Graphs +(KGs), which store the world knowledge in a symbolic form, will be replaced +with LLMs. In this paper, we try to answer these questions from a new angle: +How knowledgeable are LLMs? + To answer this question, we constructed Head-to-Tail, a benchmark that +consists of 18K question-answer (QA) pairs regarding head, torso, and tail +facts in terms of popularity. We designed an automated evaluation method and a +set of metrics that closely approximate the knowledge an LLM confidently +internalizes. Through a comprehensive evaluation of 14 publicly available LLMs, +we show that existing LLMs are still far from being perfect in terms of their +grasp of factual knowledge, especially for facts of torso-to-tail entities. + +
+
+
+
+
+ + ☆ A Survey on Fairness in Large Language Models + + +
+ Large language models (LLMs) have shown powerful performance and promising
+development prospects and are widely deployed in the real world. However, LLMs
+can capture social biases from unprocessed training data and propagate the
+biases to downstream tasks. Unfair LLM systems have undesirable social impacts
+and potential harms. In this paper, we provide a comprehensive review of
+related research on fairness in LLMs. First, for medium-scale LLMs, we
+introduce evaluation metrics and debiasing methods from the perspectives of
+intrinsic bias and extrinsic bias, respectively. Then, for large-scale LLMs, we
+introduce recent fairness research, including fairness evaluation, reasons for
+bias, and debiasing methods. Finally, we discuss and provide insights into the
+challenges and future directions for the development of fairness in LLMs.
+
+
+
+ comment: 12 pages, 2 figures, 101 references +
+
+
+
+
+ + ☆ ExpeL: LLM Agents Are Experiential Learners + + +
+ The recent surge in research interest in applying large language models +(LLMs) to decision-making tasks has flourished by leveraging the extensive +world knowledge embedded in LLMs. While there is a growing demand to tailor +LLMs for custom decision-making tasks, finetuning them for specific tasks is +resource-intensive and may diminish the model's generalization capabilities. +Moreover, state-of-the-art language models like GPT-4 and Claude are primarily +accessible through API calls, with their parametric weights remaining +proprietary and unavailable to the public. This scenario emphasizes the growing +need for new methodologies that allow learning from agent experiences without +requiring parametric updates. To address these problems, we introduce the +Experiential Learning (ExpeL) agent. Our agent autonomously gathers experiences +and extracts knowledge using natural language from a collection of training +tasks. At inference, the agent recalls its extracted insights and past +experiences to make informed decisions. Our empirical results highlight the +robust learning efficacy of the ExpeL agent, indicating a consistent +enhancement in its performance as it accumulates experiences. We further +explore the emerging capabilities and transfer learning potential of the ExpeL +agent through qualitative observations and additional experiments. + +
+
+
+
+
+ + ☆ LegalBench: A Collaboratively Built Benchmark for Measuring Legal + Reasoning in Large Language Models + + +
+ The advent of large language models (LLMs) and their adoption by the legal +community has given rise to the question: what types of legal reasoning can +LLMs perform? To enable greater study of this question, we present LegalBench: +a collaboratively constructed legal reasoning benchmark consisting of 162 tasks +covering six different types of legal reasoning. LegalBench was built through +an interdisciplinary process, in which we collected tasks designed and +hand-crafted by legal professionals. Because these subject matter experts took +a leading role in construction, tasks either measure legal reasoning +capabilities that are practically useful, or measure reasoning skills that +lawyers find interesting. To enable cross-disciplinary conversations about LLMs +in the law, we additionally show how popular legal frameworks for describing +legal reasoning -- which distinguish between its many forms -- correspond to +LegalBench tasks, thus giving lawyers and LLM developers a common vocabulary. +This paper describes LegalBench, presents an empirical evaluation of 20 +open-source and commercial LLMs, and illustrates the types of research +explorations LegalBench enables. + +
+
+ comment: 143 pages, 79 tables, 4 figures +
+
+
+
+
+ + ☆ Indonesian Automatic Speech Recognition with XLSR-53 + + +
+ This study focuses on the development of Indonesian Automatic Speech
+Recognition (ASR) using the XLSR-53 pre-trained model, where XLSR stands for
+cross-lingual speech representations. This pre-trained model is used to
+significantly reduce the amount of training data in non-English languages
+required to achieve a competitive Word Error Rate (WER). The total amount of
+data used in this study is 24 hours, 18 minutes, and 1 second: (1) TITML-IDN 14
+hours and 31 minutes; (2) Magic Data 3 hours and 33 minutes; and (3) Common
+Voice 6 hours, 14 minutes, and 1 second. With a WER of 20%, the model built in
+this study can compete with similar models using the Common Voice dataset test
+split. The WER can be decreased by around 8 percentage points using a language
+model, from 20% to 12%. Thus, the results of this study improve upon previous
+research and contribute to the creation of a better Indonesian ASR with a
+smaller amount of data.
+
+
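+
+ For reference, the Word Error Rate used above can be computed as a word-level
+edit distance; a self-contained implementation with a toy example follows (the
+sentences are made up, not taken from the study's data).
+
+```python
+def wer(reference: str, hypothesis: str) -> float:
+    """Word Error Rate: (substitutions + insertions + deletions) divided by the
+    number of reference words, via dynamic-programming edit distance."""
+    r, h = reference.split(), hypothesis.split()
+    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
+    for i in range(len(r) + 1):
+        d[i][0] = i
+    for j in range(len(h) + 1):
+        d[0][j] = j
+    for i in range(1, len(r) + 1):
+        for j in range(1, len(h) + 1):
+            cost = 0 if r[i - 1] == h[j - 1] else 1
+            d[i][j] = min(d[i - 1][j] + 1,         # deletion
+                          d[i][j - 1] + 1,         # insertion
+                          d[i - 1][j - 1] + cost)  # substitution
+    return d[len(r)][len(h)] / max(len(r), 1)
+
+ref = "saya pergi ke pasar pagi ini"
+hyp = "saya pergi pasar pagi ini"
+print(round(wer(ref, hyp), 3))   # one deletion over six reference words ~ 0.167
+```
+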
+
+
+
+
+ + ♻ ☆ Exploring the Landscape of Natural Language Processing Research + + +
+ As an efficient approach to understand, generate, and process natural +language texts, research in natural language processing (NLP) has exhibited a +rapid spread and wide adoption in recent years. Given the increasing research +work in this area, several NLP-related approaches have been surveyed in the +research community. However, a comprehensive study that categorizes established +topics, identifies trends, and outlines areas for future research remains +absent. Contributing to closing this gap, we have systematically classified and +analyzed research papers in the ACL Anthology. As a result, we present a +structured overview of the research landscape, provide a taxonomy of fields of +study in NLP, analyze recent developments in NLP, summarize our findings, and +highlight directions for future work. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
+
+
+
+
+ + ♻ ☆ Efficient Domain Adaptation of Sentence Embeddings Using Adapters + + +
+ Sentence embeddings enable us to capture the semantic similarity of short +texts. Most sentence embedding models are trained for general semantic textual +similarity tasks. Therefore, to use sentence embeddings in a particular domain, +the model must be adapted to it in order to achieve good results. Usually, this +is done by fine-tuning the entire sentence embedding model for the domain of +interest. While this approach yields state-of-the-art results, all of the +model's weights are updated during fine-tuning, making this method +resource-intensive. Therefore, instead of fine-tuning entire sentence embedding +models for each target domain individually, we propose to train lightweight +adapters. These domain-specific adapters do not require fine-tuning all +underlying sentence embedding model parameters. Instead, we only train a small +number of additional parameters while keeping the weights of the underlying +sentence embedding model fixed. Training domain-specific adapters allows always +using the same base model and only exchanging the domain-specific adapters to +adapt sentence embeddings to a specific domain. We show that using adapters for +parameter-efficient domain adaptation of sentence embeddings yields competitive +performance within 1% of a domain-adapted, entirely fine-tuned sentence +embedding model while only training approximately 3.6% of the parameters. + +
+
+ comment: Accepted to the 14th International Conference on Recent Advances in + Natural Language Processing (RANLP 2023) +
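+
+ A generic sketch of the bottleneck-adapter idea: the base encoder stays frozen
+and only small residual adapters are trained per target domain. The layer type,
+sizes, and pooling below are illustrative assumptions, not the paper's exact
+setup.
+
+```python
+import torch
+import torch.nn as nn
+
+class BottleneckAdapter(nn.Module):
+    """Small residual bottleneck; only these parameters are trained per domain."""
+    def __init__(self, hidden_dim: int = 768, bottleneck_dim: int = 48):
+        super().__init__()
+        self.down = nn.Linear(hidden_dim, bottleneck_dim)
+        self.up = nn.Linear(bottleneck_dim, hidden_dim)
+        self.act = nn.GELU()
+
+    def forward(self, x):
+        return x + self.up(self.act(self.down(x)))   # residual keeps base behaviour
+
+# Freeze a stand-in base encoder and attach one adapter after each layer.
+layer = nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True)
+base = nn.TransformerEncoder(layer, num_layers=2)
+for p in base.parameters():
+    p.requires_grad = False                           # base weights stay fixed
+adapters = nn.ModuleList([BottleneckAdapter() for _ in range(2)])
+
+x = torch.randn(4, 16, 768)                           # (batch, tokens, hidden) stand-in
+for enc_layer, adapter in zip(base.layers, adapters):
+    x = adapter(enc_layer(x))
+sentence_emb = x.mean(dim=1)                          # mean pooling into sentence embeddings
+trainable = sum(p.numel() for p in adapters.parameters())
+print(sentence_emb.shape, trainable, "trainable adapter parameters")
+```
+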
+
+
+
+
+ + ♻ ☆ Data Augmentation using Transformers and Similarity Measures for + Improving Arabic Text Classification + + +
+ The performance of learning models heavily relies on the availability and +adequacy of training data. To address the dataset adequacy issue, researchers +have extensively explored data augmentation (DA) as a promising approach. DA +generates new data instances through transformations applied to the available +data, thereby increasing dataset size and variability. This approach has +enhanced model performance and accuracy, particularly in addressing class +imbalance problems in classification tasks. However, few studies have explored +DA for the Arabic language, relying on traditional approaches such as +paraphrasing or noising-based techniques. In this paper, we propose a new +Arabic DA method that employs the recent powerful modeling technique, namely +the AraGPT-2, for the augmentation process. The generated sentences are +evaluated in terms of context, semantics, diversity, and novelty using the +Euclidean, cosine, Jaccard, and BLEU distances. Finally, the AraBERT +transformer is used on sentiment classification tasks to evaluate the +classification performance of the augmented Arabic dataset. The experiments +were conducted on four sentiment Arabic datasets: AraSarcasm, ASTD, ATT, and +MOVIE. The selected datasets vary in size, label number, and unbalanced +classes. The results show that the proposed methodology enhanced the Arabic +sentiment text classification on all datasets with an increase in F1 score by +4% in AraSarcasm, 6% in ASTD, 9% in ATT, and 13% in MOVIE. + +
+
+ comment: 15 pages, 16 Figures, this work has been submitted to the IEEE Access + Journal for possible publication +
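+
+ A small sketch of the kind of surface-level similarity measures mentioned
+above (cosine, Euclidean, Jaccard) between an original sentence and a generated
+augmentation; the bag-of-words representation and the English placeholder pair
+are assumptions for illustration (the study works on Arabic text, with BLEU as
+a fourth measure).
+
+```python
+from collections import Counter
+import math
+
+def bow(s):
+    return Counter(s.split())
+
+def cosine(a, b):
+    va, vb = bow(a), bow(b)
+    dot = sum(va[w] * vb[w] for w in va)
+    na = math.sqrt(sum(c * c for c in va.values()))
+    nb = math.sqrt(sum(c * c for c in vb.values()))
+    return dot / (na * nb) if na and nb else 0.0
+
+def euclidean(a, b):
+    va, vb = bow(a), bow(b)
+    return math.sqrt(sum((va[w] - vb[w]) ** 2 for w in set(va) | set(vb)))
+
+def jaccard(a, b):
+    sa, sb = set(a.split()), set(b.split())
+    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0
+
+original = "the movie was wonderful and the acting was great"
+augmented = "the film was wonderful and the performances were great"
+print(cosine(original, augmented), euclidean(original, augmented),
+      jaccard(original, augmented))
+```
+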
+
+
+
+
+ + ♻ ☆ Reducing Sensitivity on Speaker Names for Text Generation from Dialogues ACL'23 + + +
+ Changing speaker names consistently throughout a dialogue should not affect +its meaning and corresponding outputs for text generation from dialogues. +However, pre-trained language models, serving as the backbone for +dialogue-processing tasks, have shown to be sensitive to nuances. This may +result in unfairness in real-world applications. No comprehensive analysis of +this problem has been done in the past. In this work, we propose to +quantitatively measure a model's sensitivity on speaker names, and +comprehensively evaluate a number of known methods for reducing speaker name +sensitivity, including a novel approach of our own. Extensive experiments on +multiple datasets provide a benchmark for this problem and show the favorable +performance of our approach in sensitivity reduction and quality of generation. + +
+
+ comment: findings of ACL'23 +
+
+
+
+
+ + ♻ ☆ MPI-rical: Data-Driven MPI Distributed Parallelism Assistance with + Transformers + + +
+ Automatic source-to-source parallelization of serial code for shared and +distributed memory systems is a challenging task in high-performance computing. +While many attempts were made to translate serial code into parallel code for a +shared memory environment (usually using OpenMP), none has managed to do so for +a distributed memory environment. In this paper, we propose a novel approach, +called MPI-rical, for automated MPI code generation using a transformer-based +model trained on approximately 25,000 serial code snippets and their +corresponding parallelized MPI code out of more than 50,000 code snippets in +our corpus (MPICodeCorpus). To evaluate the performance of the model, we first +break down the serial code to MPI-based parallel code translation problem into +two sub-problems and develop two research objectives: code completion defined +as given a location in the source code, predict the MPI function for that +location, and code translation defined as predicting an MPI function as well as +its location in the source code. We evaluate MPI-rical on MPICodeCorpus dataset +and on real-world scientific code benchmarks and compare its performance +between the code completion and translation tasks. Our experimental results +show that while MPI-rical performs better on the code completion task than the +code translation task, the latter is better suited for real-world programming +assistance, in which the tool suggests the need for an MPI function regardless +of prior knowledge. Overall, our approach represents a significant step forward +in automating the parallelization of serial code for distributed memory +systems, which can save valuable time and resources for software developers and +researchers. The source code used in this work, as well as other relevant +sources, are available at: +https://github.com/Scientific-Computing-Lab-NRCN/MPI-rical + +
+
+
+
+
+ + ♻ ☆ PreSTU: Pre-Training for Scene-Text Understanding ICCV 2023 + + +
+ The ability to recognize and reason about text embedded in visual inputs is +often lacking in vision-and-language (V&L) models, perhaps because V&L +pre-training methods have often failed to include such an ability in their +training objective. In this paper, we propose PreSTU, a novel pre-training +recipe dedicated to scene-text understanding (STU). PreSTU introduces OCR-aware +pre-training objectives that encourage the model to recognize text from an +image and connect it to the rest of the image content. We implement PreSTU +using a simple transformer-based encoder-decoder architecture, combined with +large-scale image-text datasets with scene text obtained from an off-the-shelf +OCR system. We empirically demonstrate the effectiveness of this pre-training +approach on eight visual question answering and four image captioning +benchmarks. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 12 + +
+
+
+ + ☆ Towards Generalizable Morph Attack Detection with Consistency + Regularization + + +
+ Though recent studies have made significant progress in morph attack +detection by virtue of deep neural networks, they often fail to generalize well +to unseen morph attacks. With numerous morph attacks emerging frequently, +generalizable morph attack detection has gained significant attention. This +paper focuses on enhancing the generalization capability of morph attack +detection from the perspective of consistency regularization. Consistency +regularization operates under the premise that generalizable morph attack +detection should output consistent predictions irrespective of the possible +variations that may occur in the input space. In this work, to reach this +objective, two simple yet effective morph-wise augmentations are proposed to +explore a wide space of realistic morph transformations in our consistency +regularization. Then, the model is regularized to learn consistently at the +logit as well as embedding levels across a wide range of morph-wise augmented +images. The proposed consistency regularization aligns the abstraction in the +hidden layers of our model across the morph attack images which are generated +from diverse domains in the wild. Experimental results demonstrate the superior +generalization and robustness performance of our proposed method compared to +the state-of-the-art studies. + +
+
+ comment: Accepted to the IEEE International Joint Conference on Biometrics + (IJCB), 2023 +
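+
+ A generic sketch of logit- and embedding-level consistency regularization
+across two augmented views of the same input; the augmentation, model
+interface, and loss weighting are placeholders rather than the paper's
+morph-wise design.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def consistency_loss(model, x, augment, w_logit=1.0, w_embed=1.0):
+    """Encourage consistent predictions across two augmented views at both the
+    embedding and the logit level; `model` is assumed to return (embedding, logits)."""
+    emb1, logit1 = model(augment(x))
+    emb2, logit2 = model(augment(x))
+    loss_logit = F.mse_loss(F.softmax(logit1, dim=-1), F.softmax(logit2, dim=-1))
+    loss_embed = 1.0 - F.cosine_similarity(emb1, emb2, dim=-1).mean()
+    return w_logit * loss_logit + w_embed * loss_embed
+
+class TinyDetector(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.backbone = torch.nn.Linear(3 * 32 * 32, 128)
+        self.head = torch.nn.Linear(128, 2)            # bona fide vs. morph attack
+
+    def forward(self, x):
+        emb = self.backbone(x.flatten(1))
+        return emb, self.head(emb)
+
+augment = lambda x: x + 0.05 * torch.randn_like(x)     # placeholder augmentation
+x = torch.randn(8, 3, 32, 32)
+print(consistency_loss(TinyDetector(), x, augment))
+```
+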
+
+
+
+
+ + ☆ False Negative/Positive Control for SAM on Noisy Medical Images + + +
+ The Segment Anything Model (SAM) is a recently developed all-range foundation +model for image segmentation. It can use sparse manual prompts such as bounding +boxes to generate pixel-level segmentation in natural images but struggles in +medical images such as low-contrast, noisy ultrasound images. We propose a +refined test-phase prompt augmentation technique designed to improve SAM's +performance in medical image segmentation. The method couples multi-box prompt +augmentation and an aleatoric uncertainty-based false-negative (FN) and +false-positive (FP) correction (FNPC) strategy. We evaluate the method on two +ultrasound datasets and show improvement in SAM's performance and robustness to +inaccurate prompts, without the necessity for further training or tuning. +Moreover, we present the Single-Slice-to-Volume (SS2V) method, enabling 3D +pixel-level segmentation using only the bounding box annotation from a single +2D slice. Our results allow efficient use of SAM in even noisy, low-contrast +medical images. The source code will be released soon. + +
+
+
+
+
+ + ☆ HoSNN: Adversarially-Robust Homeostatic Spiking Neural Networks with + Adaptive Firing Thresholds + + +
+ Spiking neural networks (SNNs) offer promise for efficient and powerful +neurally inspired computation. Common to other types of neural networks, +however, SNNs face the severe issue of vulnerability to adversarial attacks. We +present the first study that draws inspiration from neural homeostasis to +develop a bio-inspired solution that counters the susceptibilities of SNNs to +adversarial onslaughts. At the heart of our approach is a novel +threshold-adapting leaky integrate-and-fire (TA-LIF) neuron model, which we +adopt to construct the proposed adversarially robust homeostatic SNN (HoSNN). +Distinct from traditional LIF models, our TA-LIF model incorporates a +self-stabilizing dynamic thresholding mechanism, curtailing adversarial noise +propagation and safeguarding the robustness of HoSNNs in an unsupervised +manner. Theoretical analysis is presented to shed light on the stability and +convergence properties of the TA-LIF neurons, underscoring their superior +dynamic robustness under input distributional shifts over traditional LIF +neurons. Remarkably, without explicit adversarial training, our HoSNNs +demonstrate inherent robustness on CIFAR-10, with accuracy improvements to +72.6% and 54.19% against FGSM and PGD attacks, up from 20.97% and 0.6%, +respectively. Furthermore, with minimal FGSM adversarial training, our HoSNNs +surpass previous models by 29.99% under FGSM and 47.83% under PGD attacks on +CIFAR-10. Our findings offer a new perspective on harnessing biological +principles for bolstering SNNs adversarial robustness and defense, paving the +way to more resilient neuromorphic computing. + +
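+
+ To illustrate the flavor of a threshold-adapting LIF neuron, here is a toy
+simulation in which the firing threshold jumps after each spike and decays back
+toward its baseline; this generic homeostatic rule is an assumption for
+illustration and not the paper's TA-LIF equations.
+
+```python
+import numpy as np
+
+def ta_lif_simulate(inputs, tau_v=0.9, tau_th=0.99, th0=1.0, th_jump=0.5):
+    """Leaky integrate-and-fire neuron whose threshold adapts homeostatically,
+    damping the effect of (possibly adversarial) input perturbations."""
+    v, th = 0.0, th0
+    spikes = []
+    for x in inputs:
+        v = tau_v * v + x                                   # leaky integration
+        spike = float(v >= th)
+        v = v * (1.0 - spike)                               # reset after a spike
+        th = tau_th * th + (1 - tau_th) * th0 + th_jump * spike   # threshold adaptation
+        spikes.append(spike)
+    return np.array(spikes)
+
+rng = np.random.default_rng(0)
+clean = 0.4 * np.ones(100)                       # clean input current
+noisy = clean + 0.6 * rng.standard_normal(100)   # perturbed input current
+print("clean spikes:", ta_lif_simulate(clean).sum(),
+      "noisy spikes:", ta_lif_simulate(noisy).sum())
+```
+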
+
+
+
+
+ + ☆ Developing a Machine Learning-Based Clinical Decision Support Tool for + Uterine Tumor Imaging + + +
+ Uterine leiomyosarcoma (LMS) is a rare but aggressive malignancy. On imaging,
+it is difficult to differentiate LMS from, for example, degenerated leiomyoma
+(LM), a prevalent but benign condition. We curated a data set of 115 axial
+T2-weighted MRI images from 110 patients (mean [range] age=45 [17-81] years)
+with uterine tumors (UTs) that included five different tumor types. These data
+were randomly split, stratifying on tumor volume, into training (n=85) and test
+sets (n=30). An independent second reader (reader 2) provided manual
+segmentations for all test set images. To automate segmentation, we applied
+nnU-Net and explored the effect of training set size on performance by randomly
+generating subsets with 25, 45, 65 and 85 training set images. We evaluated the
+ability of radiomic features to distinguish between types of UT individually
+and when combined through feature selection and machine learning. Using the
+entire training set, the mean [95% CI] fibroid DSC was measured as 0.87
+[0.59-1.00] and the agreement between the two readers was 0.89 [0.77-1.0] on
+the test set. When classifying degenerated LM versus LMS, we achieve a test set
+F1-score of 0.80. Classifying UTs based on radiomic features, we identify
+classifiers achieving F1-scores of 0.53 [0.45, 0.61] and 0.80 [0.80, 0.80] on
+the test set for the benign versus malignant, and degenerated LM versus LMS
+tasks. We show that it is possible to develop an automated method for 3D
+segmentation of the uterus and UT that is close to human-level performance with
+fewer than 150 annotated images. For distinguishing UT types, while we train
+models that merit further investigation with additional data, reliable
+automatic differentiation of UTs remains a challenge.
+
+
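+
+ For reference, the Dice similarity coefficient (DSC) reported above can be
+computed from two binary masks as follows (toy random masks, not the study's
+data).
+
+```python
+import numpy as np
+
+def dice_coefficient(pred: np.ndarray, truth: np.ndarray) -> float:
+    """Dice similarity coefficient between two binary segmentation masks."""
+    pred, truth = pred.astype(bool), truth.astype(bool)
+    intersection = np.logical_and(pred, truth).sum()
+    denom = pred.sum() + truth.sum()
+    return 2.0 * intersection / denom if denom else 1.0
+
+# Toy 3D masks standing in for a predicted and a manual segmentation.
+rng = np.random.default_rng(0)
+truth = rng.random((32, 64, 64)) > 0.7
+pred = np.logical_and(truth, rng.random(truth.shape) > 0.1)   # slightly under-segmented
+print(round(dice_coefficient(pred, truth), 3))
+```
+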
+
+
+
+
+ + ☆ Prediction of Pneumonia and COVID-19 Using Deep Neural Networks + + +
+ Pneumonia, caused by bacteria or viruses, is a rapidly spreading infection
+with global implications. Prompt identification of infected individuals is
+crucial for containing its transmission. This study explores the potential of
+medical image analysis to address this challenge. We propose machine-learning
+techniques for predicting pneumonia from chest X-ray images. Chest X-ray
+imaging is vital for pneumonia diagnosis due to its accessibility and
+cost-effectiveness. However, interpreting X-rays for pneumonia detection can be
+complex, as radiographic features can overlap with other respiratory
+conditions. We evaluate the performance of different machine learning models,
+including DenseNet121, Inception Resnet-v2, Inception Resnet-v3, Resnet50, and
+Xception, using chest X-ray images of pneumonia patients. Performance measures
+and confusion matrices are employed to assess and compare the models. The
+findings reveal that DenseNet121 outperforms other models, achieving an
+accuracy rate of 99.58%. This study underscores the significance of machine
+learning in the accurate detection of pneumonia, leveraging chest X-ray images.
+Our study offers insights into the potential of technology to mitigate the
+spread of pneumonia through precise diagnostics.
+
+
+
+
+
+
+ + ☆ Vehicle Cameras Guide mmWave Beams: Approach and Real-World V2V + Demonstration + + +
+ Accurately aligning millimeter-wave (mmWave) and terahertz (THz) narrow beams +is essential to satisfy reliability and high data rates of 5G and beyond +wireless communication systems. However, achieving this objective is difficult, +especially in vehicle-to-vehicle (V2V) communication scenarios, where both +transmitter and receiver are constantly mobile. Recently, additional sensing +modalities, such as visual sensors, have attracted significant interest due to +their capability to provide accurate information about the wireless +environment. To that end, in this paper, we develop a deep learning solution +for V2V scenarios to predict future beams using images from a 360 camera +attached to the vehicle. The developed solution is evaluated on a real-world +multi-modal mmWave V2V communication dataset comprising co-existing 360 camera +and mmWave beam training data. The proposed vision-aided solution achieves +$\approx 85\%$ top-5 beam prediction accuracy while significantly reducing the +beam training overhead. This highlights the potential of utilizing vision for +enabling highly-mobile V2V communications. + +
+
+ comment: Dataset and code files are available on the DeepSense 6G website + https://deepsense6g.net/ +
+
+
+
+
+ + ♻ ☆ CircNet: Meshing 3D Point Clouds with Circumcenter Detection ICLR2023 + + +
+ Reconstructing 3D point clouds into triangle meshes is a key problem in +computational geometry and surface reconstruction. Point cloud triangulation +solves this problem by providing edge information to the input points. Since no +vertex interpolation is involved, it is beneficial to preserve sharp details on +the surface. Taking advantage of learning-based techniques in triangulation, +existing methods enumerate the complete combinations of candidate triangles, +which is both complex and inefficient. In this paper, we leverage the duality +between a triangle and its circumcenter, and introduce a deep neural network +that detects the circumcenters to achieve point cloud triangulation. +Specifically, we introduce multiple anchor priors to divide the neighborhood +space of each point. The neural network then learns to predict the presences +and locations of circumcenters under the guidance of those anchors. We extract +the triangles dual to the detected circumcenters to form a primitive mesh, from +which an edge-manifold mesh is produced via simple post-processing. Unlike +existing learning-based triangulation methods, the proposed method bypasses an +exhaustive enumeration of triangle combinations and local surface +parameterization. We validate the efficiency, generalization, and robustness of +our method on prominent datasets of both watertight and open surfaces. The code +and trained models are provided at https://github.com/EnyaHermite/CircNet. + +
+
+ comment: accepted to ICLR2023 +
+
+
+
+
+ + ♻ ☆ DiffFacto: Controllable Part-Based 3D Point Cloud Generation with Cross + Diffusion + + +
+ While the community of 3D point cloud generation has grown considerably in recent years, it still lacks an effective way to enable intuitive user control over the generation process, which limits the general utility of such methods. Since an intuitive way of decomposing a shape is through its parts, we propose to tackle the task of controllable part-based point cloud generation. We introduce DiffFacto, a novel probabilistic generative model that learns the distribution of shapes with part-level control. We propose a factorization that models independent part style and part configuration distributions, and present a novel cross-diffusion network that enables us to generate coherent and plausible shapes under the proposed factorization. Experiments show that our method is able to generate novel shapes with multiple axes of control. It achieves state-of-the-art part-level generation quality and generates plausible and coherent shapes, while enabling various downstream editing applications such as shape interpolation, mixing, and transformation editing. Project website: https://difffacto.github.io/ +
+
+
+
+
+ + ♻ ☆ Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced + Perception based on Joint-Embedding & Contextual Label Affinity + + +
+ Traditional computer vision models often necessitate extensive data +acquisition, annotation, and validation. These models frequently struggle in +real-world applications, resulting in high false positive and negative rates, +and exhibit poor adaptability to new scenarios, often requiring costly +retraining. To address these issues, we present Ethosight, a flexible and +adaptable zero-shot video analytics system. Ethosight begins from a clean slate +based on user-defined video analytics, specified through natural language or +keywords, and leverages joint embedding models and reasoning mechanisms +informed by ontologies such as WordNet and ConceptNet. Ethosight operates +effectively on low-cost edge devices and supports enhanced runtime adaptation, +thereby offering a new approach to continuous learning without catastrophic +forgetting. We provide empirical validation of Ethosight's promising +effectiveness across diverse and complex use cases, while highlighting areas +for further improvement. A significant contribution of this work is the release +of all source code and datasets to enable full reproducibility and to foster +further innovation in both the research and commercial domains. + +
+
+
+
+
+ + ♻ ☆ NeSyFOLD: Neurosymbolic Framework for Interpretable Image Classification + + +
+ Deep learning models such as CNNs have surpassed human performance in +computer vision tasks such as image classification. However, despite their +sophistication, these models lack interpretability which can lead to biased +outcomes reflecting existing prejudices in the data. We aim to make predictions +made by a CNN interpretable. Hence, we present a novel framework called +NeSyFOLD to create a neurosymbolic (NeSy) model for image classification tasks. +The model is a CNN with all layers following the last convolutional layer +replaced by a stratified answer set program (ASP). A rule-based machine +learning algorithm called FOLD-SE-M is used to derive the stratified answer set +program from binarized filter activations of the last convolutional layer. The +answer set program can be viewed as a rule-set, wherein the truth value of each +predicate depends on the activation of the corresponding kernel in the CNN. The +rule-set serves as a global explanation for the model and is interpretable. A +justification for the predictions made by the NeSy model can be obtained using +an ASP interpreter. We also use our NeSyFOLD framework with a CNN that is +trained using a sparse kernel learning technique called Elite BackProp (EBP). +This leads to a significant reduction in rule-set size without compromising +accuracy or fidelity thus improving scalability of the NeSy model and +interpretability of its rule-set. Evaluation is done on datasets with varied +complexity and sizes. To make the rule-set more intuitive to understand, we +propose a novel algorithm for labelling each kernel's corresponding predicate +in the rule-set with the semantic concept(s) it learns. We evaluate the +performance of our "semantic labelling algorithm" to quantify the efficacy of +the semantic labelling for both the NeSy model and the NeSy-EBP model. + +
+
+
+
+
+ + ♻ ☆ Towards Explainable Land Cover Mapping: a Counterfactual-based Strategy + + +
+ Counterfactual explanations are an emerging tool to enhance interpretability +of deep learning models. Given a sample, these methods seek to find and display +to the user similar samples across the decision boundary. In this paper, we +propose a generative adversarial counterfactual approach for satellite image +time series in a multi-class setting for the land cover classification task. +One of the distinctive features of the proposed approach is the lack of prior +assumption on the targeted class for a given counterfactual explanation. This +inherent flexibility allows for the discovery of interesting information on the +relationship between land cover classes. The other feature consists of +encouraging the counterfactual to differ from the original sample only in a +small and compact temporal segment. These time-contiguous perturbations allow +for a much sparser and, thus, interpretable solution. Furthermore, +plausibility/realism of the generated counterfactual explanations is enforced +via the proposed adversarial learning strategy. + +
+
+
+
+
+ + ♻ ☆ High-performance Data Management for Whole Slide Image Analysis in + Digital Pathology + + +
+ When dealing with giga-pixel digital pathology in whole-slide imaging, a notable proportion of data records holds relevance during each analysis operation. For instance, when deploying an image analysis algorithm on whole-slide images (WSI), the computational bottleneck often lies in the input-output (I/O) system. This is particularly notable as patch-level processing introduces a considerable I/O load onto the computer system. However, this data management process can be further parallelized, given the typical independence of patch-level image processes across different patches. This paper details our endeavors in tackling this data access challenge by implementing the Adaptable IO System version 2 (ADIOS2). Our focus has been constructing and releasing a digital pathology-centric pipeline using ADIOS2, which facilitates streamlined data management across WSIs. Additionally, we have developed strategies aimed at curtailing data retrieval times. The performance evaluation encompasses two key scenarios: (1) a pure CPU-based image analysis scenario ("CPU scenario"), and (2) a GPU-based deep learning framework scenario ("GPU scenario"). Our findings reveal noteworthy outcomes. Under the CPU scenario, ADIOS2 showcases an impressive two-fold speed-up compared to the brute-force approach. In the GPU scenario, its performance stands on par with the cutting-edge GPU I/O acceleration framework, NVIDIA Magnum IO GPU Direct Storage (GDS). To the best of our knowledge, this is among the first applications of ADIOS2 in the field of digital pathology. The source code has been made publicly available at https://github.com/hrlblab/adios. +
+
+
+
+
+
+
+
+ + Information Retrieval 3 + +
+
+
+ + ☆ Enhancing Transformers without Self-supervised Learning: A Loss + Landscape Perspective in Sequential Recommendation + + +
+ Transformer and its variants are a powerful class of architectures for sequential recommendation, owing to their ability to capture a user's dynamic interests from their past interactions. Despite their success, Transformer-based models often require the optimization of a large number of parameters, making them difficult to train from sparse data in sequential recommendation. To address the problem of data sparsity, previous studies have utilized self-supervised learning to enhance Transformers, such as pre-training embeddings from item attributes or contrastive data augmentations. However, these approaches encounter several training issues, including initialization sensitivity, manual data augmentations, and large batch-size memory bottlenecks. In this work, we investigate Transformers from the perspective of loss geometry, aiming to enhance the models' data efficiency and generalization in sequential recommendation. We observe that Transformers (e.g., SASRec) can converge to extremely sharp local minima if not adequately regularized. Inspired by the recent Sharpness-Aware Minimization (SAM), we propose SAMRec, which significantly improves the accuracy and robustness of sequential recommendation. SAMRec performs comparably to state-of-the-art self-supervised Transformers, such as S$^3$Rec and CL4SRec, without the need for pre-training or strong data augmentations. +
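+ A minimal sketch of the SAM-style two-step update that SAMRec builds on is shown below (generic Sharpness-Aware Minimization; the exact regularization and hyperparameters used by SAMRec may differ):
+ ```python
+ import torch
+
+ def sam_step(model, loss_fn, batch, base_optimizer, rho=0.05):
+     # 1) gradient at the current weights
+     loss_fn(model, batch).backward()
+     grad_norm = torch.norm(torch.stack(
+         [p.grad.norm() for p in model.parameters() if p.grad is not None]))
+
+     # 2) ascend to the nearby "sharpest" weights within an L2 ball of radius rho
+     eps = []
+     with torch.no_grad():
+         for p in model.parameters():
+             if p.grad is None:
+                 continue
+             e = rho * p.grad / (grad_norm + 1e-12)
+             p.add_(e)
+             eps.append((p, e))
+     model.zero_grad()
+
+     # 3) gradient at the perturbed weights, undo the perturbation, apply the update
+     loss_fn(model, batch).backward()
+     with torch.no_grad():
+         for p, e in eps:
+             p.sub_(e)
+     base_optimizer.step()
+     base_optimizer.zero_grad()
+ ```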
+
+
+
+
+ + ☆ Designing and Evaluating Presentation Strategies for Fact-Checked + Content CIKM '23 + + +
+ With the rapid growth of online misinformation, it is crucial to have +reliable fact-checking methods. Recent research on finding check-worthy claims +and automated fact-checking have made significant advancements. However, +limited guidance exists regarding the presentation of fact-checked content to +effectively convey verified information to users. We address this research gap +by exploring the critical design elements in fact-checking reports and +investigating whether credibility and presentation-based design improvements +can enhance users' ability to interpret the report accurately. We co-developed +potential content presentation strategies through a workshop involving +fact-checking professionals, communication experts, and researchers. The +workshop examined the significance and utility of elements such as veracity +indicators and explored the feasibility of incorporating interactive components +for enhanced information disclosure. Building on the workshop outcomes, we +conducted an online experiment involving 76 crowd workers to assess the +efficacy of different design strategies. The results indicate that proposed +strategies significantly improve users' ability to accurately interpret the +verdict of fact-checking articles. Our findings underscore the critical role of +effective presentation of fact reports in addressing the spread of +misinformation. By adopting appropriate design enhancements, the effectiveness +of fact-checking reports can be maximized, enabling users to make informed +judgments. + +
+
+ comment: Accepted to the 32nd ACM International Conference on Information and + Knowledge Management (CIKM '23) +
+
+
+
+
+ + ☆ Offline Pseudo Relevance Feedback for Efficient and Effective + Single-pass Dense Retrieval SIGIR2023 + + +
+ Dense retrieval has made significant advancements in information retrieval +(IR) by achieving high levels of effectiveness while maintaining online +efficiency during a single-pass retrieval process. However, the application of +pseudo relevance feedback (PRF) to further enhance retrieval effectiveness +results in a doubling of online latency. To address this challenge, this paper +presents a single-pass dense retrieval framework that shifts the PRF process +offline through the utilization of pre-generated pseudo-queries. As a result, +online retrieval is reduced to a single matching with the pseudo-queries, hence +providing faster online retrieval. The effectiveness of the proposed approach +is evaluated on the standard TREC DL and HARD datasets, and the results +demonstrate its promise. Our code is openly available at +https://github.com/Rosenberg37/OPRF. + +
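+ One way to read the offline-PRF idea (a hedged sketch; the paper's exact fusion of pre-generated pseudo-queries may differ) is that the feedback signal is folded into the index ahead of time, so online retrieval becomes a single dot-product pass:
+ ```python
+ import numpy as np
+
+ def build_oprf_index(doc_embs: np.ndarray, pseudo_query_embs: np.ndarray, alpha: float = 0.5):
+     """Offline: fuse each document embedding with its pre-generated pseudo-query embedding."""
+     fused = (1.0 - alpha) * doc_embs + alpha * pseudo_query_embs
+     return fused / np.linalg.norm(fused, axis=1, keepdims=True)
+
+ def single_pass_search(query_emb: np.ndarray, index: np.ndarray, k: int = 10):
+     """Online: one matching pass, no second (feedback) retrieval round."""
+     scores = index @ query_emb
+     return np.argsort(-scores)[:k]
+ ```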
+
+ comment: Accepted at SIGIR2023 +
+
+
+
+
+
+
+
+ + Machine Learning 17 + +
+
+
+ + ☆ Unsupervised Opinion Aggregation -- A Statistical Perspective + + +
+ Complex decision-making systems rarely have direct access to the current state of the world and they instead rely on opinions to form an understanding of what the ground truth could be. Even in problems where experts provide opinions without any intention to manipulate the decision maker, it is challenging to decide which expert's opinion is more reliable -- a challenge that is further amplified when the decision maker has limited, delayed, or no access to the ground truth after the fact. This paper explores a statistical approach to infer the competence of each expert based on their opinions without any need for the ground truth. Echoing the logic behind what is commonly referred to as \textit{the wisdom of crowds}, we propose measuring the competence of each expert by their likeliness to agree with their peers. We further show that the more reliable an expert is, the more likely they are to agree with their peers. We leverage this fact to propose a completely unsupervised version of the na\"{i}ve Bayes classifier and show that the proposed technique is asymptotically optimal for a large class of problems. In addition to aggregating a large block of opinions, we further apply our technique to online opinion aggregation and to decision-making based on a limited number of opinions. +
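+ The peer-agreement idea admits a very small numerical sketch (illustrative only; the paper's estimator and optimality analysis are more general than this binary-label toy):
+ ```python
+ import numpy as np
+
+ def aggregate(opinions: np.ndarray) -> np.ndarray:
+     """opinions: (num_experts, num_items) matrix of binary opinions in {0, 1}."""
+     majority = (opinions.mean(axis=0) > 0.5).astype(int)
+     # competence proxy: how often each expert agrees with the peer majority
+     p = np.clip((opinions == majority).mean(axis=1), 1e-3, 1 - 1e-3)
+     # naive-Bayes-style log-odds weights derived from the estimated competences
+     w = np.log(p / (1.0 - p))
+     scores = w @ (2 * opinions - 1)        # signed, weighted vote per item
+     return (scores > 0).astype(int)
+ ```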
+
+ comment: This research was conducted during Noyan Sevuktekin's time at + University of Illinois at Urbana-Champaign and the results were first + presented in Chapter 3 of his dissertation, entitled "Learning From + Opinions". Permalink: https://hdl.handle.net/2142/110814 +
+
+
+
+
+ + ☆ HoSNN: Adversarially-Robust Homeostatic Spiking Neural Networks with + Adaptive Firing Thresholds + + +
+ Spiking neural networks (SNNs) offer promise for efficient and powerful neurally inspired computation. In common with other types of neural networks, however, SNNs face the severe issue of vulnerability to adversarial attacks. We present the first study that draws inspiration from neural homeostasis to develop a bio-inspired solution that counters the susceptibilities of SNNs to adversarial onslaughts. At the heart of our approach is a novel threshold-adapting leaky integrate-and-fire (TA-LIF) neuron model, which we adopt to construct the proposed adversarially robust homeostatic SNN (HoSNN). Distinct from traditional LIF models, our TA-LIF model incorporates a self-stabilizing dynamic thresholding mechanism, curtailing adversarial noise propagation and safeguarding the robustness of HoSNNs in an unsupervised manner. Theoretical analysis is presented to shed light on the stability and convergence properties of the TA-LIF neurons, underscoring their superior dynamic robustness under input distributional shifts over traditional LIF neurons. Remarkably, without explicit adversarial training, our HoSNNs demonstrate inherent robustness on CIFAR-10, with accuracy improvements to 72.6% and 54.19% against FGSM and PGD attacks, up from 20.97% and 0.6%, respectively. Furthermore, with minimal FGSM adversarial training, our HoSNNs surpass previous models by 29.99% under FGSM and 47.83% under PGD attacks on CIFAR-10. Our findings offer a new perspective on harnessing biological principles for bolstering SNNs' adversarial robustness and defense, paving the way to more resilient neuromorphic computing. +
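+ For intuition, a generic leaky integrate-and-fire neuron with a self-adapting threshold can be written in a few lines (the constants and update rule below are illustrative and are not the TA-LIF equations from the paper):
+ ```python
+ import numpy as np
+
+ def adaptive_threshold_lif(inputs, v_decay=0.9, theta0=1.0, theta_decay=0.95, theta_jump=0.3):
+     v, theta, spikes = 0.0, theta0, []
+     for x in inputs:
+         v = v_decay * v + x                              # leaky membrane integration
+         if v >= theta:
+             spikes.append(1)
+             v = 0.0                                      # hard reset after a spike
+             theta += theta_jump                          # homeostasis: raise the threshold
+         else:
+             spikes.append(0)
+         theta = theta0 + theta_decay * (theta - theta0)  # relax threshold back to baseline
+     return spikes
+
+ print(adaptive_threshold_lif(np.random.rand(20)))
+ ```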
+
+
+
+
+ + ☆ Developing a Machine Learning-Based Clinical Decision Support Tool for + Uterine Tumor Imaging + + +
+ Uterine leiomyosarcoma (LMS) is a rare but aggressive malignancy. On imaging, +it is difficult to differentiate LMS from, for example, degenerated leiomyoma +(LM), a prevalent but benign condition. We curated a data set of 115 axial +T2-weighted MRI images from 110 patients (mean [range] age=45 [17-81] years) +with UTs that included five different tumor types. These data were randomly +split stratifying on tumor volume into training (n=85) and test sets (n=30). An +independent second reader (reader 2) provided manual segmentations for all test +set images. To automate segmentation, we applied nnU-Net and explored the +effect of training set size on performance by randomly generating subsets with +25, 45, 65 and 85 training set images. We evaluated the ability of radiomic +features to distinguish between types of UT individually and when combined +through feature selection and machine learning. Using the entire training set +the mean [95% CI] fibroid DSC was measured as 0.87 [0.59-1.00] and the +agreement between the two readers was 0.89 [0.77-1.0] on the test set. When +classifying degenerated LM from LMS we achieve a test set F1-score of 0.80. +Classifying UTs based on radiomic features we identify classifiers achieving +F1-scores of 0.53 [0.45, 0.61] and 0.80 [0.80, 0.80] on the test set for the +benign versus malignant, and degenerated LM versus LMS tasks. We show that it +is possible to develop an automated method for 3D segmentation of the uterus +and UT that is close to human-level performance with fewer than 150 annotated +images. For distinguishing UT types, while we train models that merit further +investigation with additional data, reliable automatic differentiation of UTs +remains a challenge. + +
+
+
+
+
+ + ☆ SE(3) Equivariant Augmented Coupling Flows + + +
+ Coupling normalizing flows allow for fast sampling and density evaluation, +making them the tool of choice for probabilistic modeling of physical systems. +However, the standard coupling architecture precludes endowing flows that +operate on the Cartesian coordinates of atoms with the SE(3) and permutation +invariances of physical systems. This work proposes a coupling flow that +preserves SE(3) and permutation equivariance by performing coordinate splits +along additional augmented dimensions. At each layer, the flow maps atoms' +positions into learned SE(3) invariant bases, where we apply standard flow +transformations, such as monotonic rational-quadratic splines, before returning +to the original basis. Crucially, our flow preserves fast sampling and density +evaluation, and may be used to produce unbiased estimates of expectations with +respect to the target distribution via importance sampling. When trained on the +DW4, LJ13 and QM9-positional datasets, our flow is competitive with equivariant +continuous normalizing flows, while allowing sampling two orders of magnitude +faster. Moreover, to the best of our knowledge, we are the first to learn the +full Boltzmann distribution of alanine dipeptide by only modeling the Cartesian +positions of its atoms. Lastly, we demonstrate that our flow can be trained to +approximately sample from the Boltzmann distribution of the DW4 and LJ13 +particle systems using only their energy functions. + +
+
+
+
+
+ + ☆ Can Large Language Models Find And Fix Vulnerable Software? + + +
+ In this study, we evaluated the capability of Large Language Models (LLMs), particularly OpenAI's GPT-4, in detecting software vulnerabilities, comparing their performance against traditional static code analyzers like Snyk and Fortify. Our analysis covered numerous repositories, including those from NASA and the Department of Defense. GPT-4 identified approximately four times as many vulnerabilities as its counterparts. Furthermore, it provided viable fixes for each vulnerability, demonstrating a low rate of false positives. Our tests encompassed 129 code samples across eight programming languages, revealing the highest vulnerabilities in PHP and JavaScript. GPT-4's code corrections led to a 90% reduction in vulnerabilities, requiring only an 11% increase in code lines. A critical insight was LLMs' ability to self-audit, suggesting fixes for their identified vulnerabilities and underscoring their precision. Future research should explore system-level vulnerabilities and integrate multiple static code analyzers for a holistic perspective on LLMs' potential. +
+
+
+
+
+ + ☆ A Comprehensive Empirical Evaluation on Online Continual Learning ICCV + + +
+ Online continual learning aims to get closer to a live learning experience by +learning directly on a stream of data with temporally shifting distribution and +by storing a minimum amount of data from that stream. In this empirical +evaluation, we evaluate various methods from the literature that tackle online +continual learning. More specifically, we focus on the class-incremental +setting in the context of image classification, where the learner must learn +new classes incrementally from a stream of data. We compare these methods on +the Split-CIFAR100 and Split-TinyImagenet benchmarks, and measure their average +accuracy, forgetting, stability, and quality of the representations, to +evaluate various aspects of the algorithm at the end but also during the whole +training period. We find that most methods suffer from stability and +underfitting issues. However, the learned representations are comparable to +i.i.d. training under the same computational budget. No clear winner emerges +from the results and basic experience replay, when properly tuned and +implemented, is a very strong baseline. We release our modular and extensible +codebase at https://github.com/AlbinSou/ocl_survey based on the avalanche +framework to reproduce our results and encourage future research. + +
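+ The "properly tuned experience replay" baseline mentioned above typically rests on a reservoir-sampled memory; a minimal sketch (not tied to any specific method in the survey) is:
+ ```python
+ import random
+
+ class ReservoirBuffer:
+     """Keeps a uniform sample of the stream: each seen example survives with prob capacity/seen."""
+     def __init__(self, capacity: int):
+         self.capacity, self.data, self.seen = capacity, [], 0
+
+     def add(self, example):
+         self.seen += 1
+         if len(self.data) < self.capacity:
+             self.data.append(example)
+         else:
+             j = random.randrange(self.seen)
+             if j < self.capacity:
+                 self.data[j] = example
+
+     def sample(self, batch_size: int):
+         return random.sample(self.data, min(batch_size, len(self.data)))
+ ```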
+
+ comment: ICCV Visual Continual Learning Workshop 2023 accepted paper +
+
+
+
+
+ + ☆ Quantum State Tomography using Quantum Machine Learning + + +
+ Quantum State Tomography (QST) is a fundamental technique in Quantum +Information Processing (QIP) for reconstructing unknown quantum states. +However, the conventional QST methods are limited by the number of measurements +required, which makes them impractical for large-scale quantum systems. To +overcome this challenge, we propose the integration of Quantum Machine Learning +(QML) techniques to enhance the efficiency of QST. In this paper, we conduct a +comprehensive investigation into various approaches for QST, encompassing both +classical and quantum methodologies; We also implement different QML approaches +for QST and demonstrate their effectiveness on various simulated and +experimental quantum systems, including multi-qubit networks. Our results show +that our QML-based QST approach can achieve high fidelity (98%) with +significantly fewer measurements than conventional methods, making it a +promising tool for practical QIP applications. + +
+
+ comment: 18 pages, 19 figures +
+
+
+
+
+ + ☆ Homogenising SoHO/EIT and SDO/AIA 171Å$~$ Images: A Deep Learning + Approach + + +
+ Extreme Ultraviolet images of the Sun are becoming an integral part of space +weather prediction tasks. However, having different surveys requires the +development of instrument-specific prediction algorithms. As an alternative, it +is possible to combine multiple surveys to create a homogeneous dataset. In +this study, we utilize the temporal overlap of SoHO/EIT and SDO/AIA 171~\AA +~surveys to train an ensemble of deep learning models for creating a single +homogeneous survey of EUV images for 2 solar cycles. Prior applications of deep +learning have focused on validating the homogeneity of the output while +overlooking the systematic estimation of uncertainty. We use an approach called +`Approximate Bayesian Ensembling' to generate an ensemble of models whose +uncertainty mimics that of a fully Bayesian neural network at a fraction of the +cost. We find that ensemble uncertainty goes down as the training set size +increases. Additionally, we show that the model ensemble adds immense value to +the prediction by showing higher uncertainty in test data that are not well +represented in the training data. + +
+
+ comment: 20 pages, 8 figures, accepted for publication in ApJS +
+
+
+
+
+ + ☆ Towards Sustainable Development: A Novel Integrated Machine Learning + Model for Holistic Environmental Health Monitoring + + +
+ Urbanization enables economic growth but also harms the environment through +degradation. Traditional methods of detecting environmental issues have proven +inefficient. Machine learning has emerged as a promising tool for tracking +environmental deterioration by identifying key predictive features. Recent +research focused on developing a predictive model using pollutant levels and +particulate matter as indicators of environmental state in order to outline +challenges. Machine learning was employed to identify patterns linking areas +with worse conditions. This research aims to assist governments in identifying +intervention points, improving planning and conservation efforts, and +ultimately contributing to sustainable development. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A Trainable Approach to Zero-delay Smoothing Spline Interpolation + + +
+ The task of reconstructing smooth signals from streamed data in the form of +signal samples arises in various applications. This work addresses such a task +subject to a zero-delay response; that is, the smooth signal must be +reconstructed sequentially as soon as a data sample is available and without +having access to subsequent data. State-of-the-art approaches solve this +problem by interpolating consecutive data samples using splines. Here, each +interpolation step yields a piece that ensures a smooth signal reconstruction +while minimizing a cost metric, typically a weighted sum between the squared +residual and a derivative-based measure of smoothness. As a result, a +zero-delay interpolation is achieved in exchange for an almost certainly higher +cumulative cost as compared to interpolating all data samples together. This +paper presents a novel approach to further reduce this cumulative cost on +average. First, we formulate a zero-delay smoothing spline interpolation +problem from a sequential decision-making perspective, allowing us to model the +future impact of each interpolated piece on the average cumulative cost. Then, +an interpolation method is proposed to exploit the temporal dependencies +between the streamed data samples. Our method is assisted by a recurrent neural +network and accordingly trained to reduce the accumulated cost on average over +a set of example data samples collected from the same signal source generating +the signal to be reconstructed. Finally, we present extensive experimental +results for synthetic and real data showing how our approach outperforms the +abovementioned state-of-the-art. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ NeSyFOLD: Neurosymbolic Framework for Interpretable Image Classification + + +
+ Deep learning models such as CNNs have surpassed human performance in +computer vision tasks such as image classification. However, despite their +sophistication, these models lack interpretability which can lead to biased +outcomes reflecting existing prejudices in the data. We aim to make predictions +made by a CNN interpretable. Hence, we present a novel framework called +NeSyFOLD to create a neurosymbolic (NeSy) model for image classification tasks. +The model is a CNN with all layers following the last convolutional layer +replaced by a stratified answer set program (ASP). A rule-based machine +learning algorithm called FOLD-SE-M is used to derive the stratified answer set +program from binarized filter activations of the last convolutional layer. The +answer set program can be viewed as a rule-set, wherein the truth value of each +predicate depends on the activation of the corresponding kernel in the CNN. The +rule-set serves as a global explanation for the model and is interpretable. A +justification for the predictions made by the NeSy model can be obtained using +an ASP interpreter. We also use our NeSyFOLD framework with a CNN that is +trained using a sparse kernel learning technique called Elite BackProp (EBP). +This leads to a significant reduction in rule-set size without compromising +accuracy or fidelity thus improving scalability of the NeSy model and +interpretability of its rule-set. Evaluation is done on datasets with varied +complexity and sizes. To make the rule-set more intuitive to understand, we +propose a novel algorithm for labelling each kernel's corresponding predicate +in the rule-set with the semantic concept(s) it learns. We evaluate the +performance of our "semantic labelling algorithm" to quantify the efficacy of +the semantic labelling for both the NeSy model and the NeSy-EBP model. + +
+
+
+
+
+ + ♻ ☆ Towards Explainable Land Cover Mapping: a Counterfactual-based Strategy + + +
+ Counterfactual explanations are an emerging tool to enhance interpretability +of deep learning models. Given a sample, these methods seek to find and display +to the user similar samples across the decision boundary. In this paper, we +propose a generative adversarial counterfactual approach for satellite image +time series in a multi-class setting for the land cover classification task. +One of the distinctive features of the proposed approach is the lack of prior +assumption on the targeted class for a given counterfactual explanation. This +inherent flexibility allows for the discovery of interesting information on the +relationship between land cover classes. The other feature consists of +encouraging the counterfactual to differ from the original sample only in a +small and compact temporal segment. These time-contiguous perturbations allow +for a much sparser and, thus, interpretable solution. Furthermore, +plausibility/realism of the generated counterfactual explanations is enforced +via the proposed adversarial learning strategy. + +
+
+
+
+
+ + ♻ ☆ Rapid-INR: Storage Efficient CPU-free DNN Training Using Implicit Neural + Representation + + +
+ Implicit Neural Representation (INR) is an innovative approach for +representing complex shapes or objects without explicitly defining their +geometry or surface structure. Instead, INR represents objects as continuous +functions. Previous research has demonstrated the effectiveness of using neural +networks as INR for image compression, showcasing comparable performance to +traditional methods such as JPEG. However, INR holds potential for various +applications beyond image compression. This paper introduces Rapid-INR, a novel +approach that utilizes INR for encoding and compressing images, thereby +accelerating neural network training in computer vision tasks. Our methodology +involves storing the whole dataset directly in INR format on a GPU, mitigating +the significant data communication overhead between the CPU and GPU during +training. Additionally, the decoding process from INR to RGB format is highly +parallelized and executed on-the-fly. To further enhance compression, we +propose iterative and dynamic pruning, as well as layer-wise quantization, +building upon previous work. We evaluate our framework on the image +classification task, utilizing the ResNet-18 backbone network and three +commonly used datasets with varying image sizes. Rapid-INR reduces memory +consumption to only 5% of the original dataset size and achieves a maximum +6$\times$ speedup over the PyTorch training pipeline, as well as a maximum 1.2x +speedup over the DALI training pipeline, with only a marginal decrease in +accuracy. Importantly, Rapid-INR can be readily applied to other computer +vision tasks and backbone networks with reasonable engineering efforts. Our +implementation code is publicly available at +https://github.com/sharc-lab/Rapid-INR. + +
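+ The core of an INR is just a coordinate MLP mapping pixel locations to colors; a minimal sketch is below (Rapid-INR's encoder, pruning, and quantization stages are not reproduced, and the layer sizes are assumptions):
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ImageINR(nn.Module):
+     """Map normalized (x, y) coordinates to RGB via a sinusoidal positional encoding + MLP."""
+     def __init__(self, hidden=256, freqs=10):
+         super().__init__()
+         self.freqs = freqs
+         self.net = nn.Sequential(
+             nn.Linear(2 * 2 * freqs, hidden), nn.ReLU(),
+             nn.Linear(hidden, hidden), nn.ReLU(),
+             nn.Linear(hidden, 3),
+         )
+
+     def forward(self, xy):                                   # xy in [-1, 1], shape (N, 2)
+         k = (2.0 ** torch.arange(self.freqs, device=xy.device).float()) * torch.pi
+         ang = xy.unsqueeze(-1) * k                            # (N, 2, freqs)
+         enc = torch.cat([ang.sin(), ang.cos()], dim=-1).flatten(1)
+         return self.net(enc)
+
+ model = ImageINR()
+ rgb = model(torch.rand(1024, 2) * 2 - 1)                      # decode 1024 pixels on the fly
+ ```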
+
+ comment: Accepted by ICCAD 2023 +
+
+
+
+
+ + ♻ ☆ Continual Learning as Computationally Constrained Reinforcement Learning + + +
+ An agent that efficiently accumulates knowledge to develop increasingly +sophisticated skills over a long lifetime could advance the frontier of +artificial intelligence capabilities. The design of such agents, which remains +a long-standing challenge of artificial intelligence, is addressed by the +subject of continual learning. This monograph clarifies and formalizes concepts +of continual learning, introducing a framework and set of tools to stimulate +further research. + +
+
+
+
+
+ + ♻ ☆ Investigating Conversion from Mild Cognitive Impairment to Alzheimer's + Disease using Latent Space Manipulation + + +
+ Alzheimer's disease is the most common cause of dementia that affects +millions of lives worldwide. Investigating the underlying causes and risk +factors of Alzheimer's disease is essential to prevent its progression. Mild +Cognitive Impairment (MCI) is considered an intermediate stage before +Alzheimer's disease. Early prediction of the conversion from the MCI to +Alzheimer's is crucial to take necessary precautions for decelerating the +progression and developing suitable treatments. In this study, we propose a +deep learning framework to discover the variables which are identifiers of the +conversion from MCI to Alzheimer's disease. In particular, the latent space of +a variational auto-encoder network trained with the MCI and Alzheimer's +patients is manipulated to obtain the significant attributes and decipher their +behavior that leads to the conversion from MCI to Alzheimer's disease. By +utilizing a generative decoder and the dimensions that lead to the Alzheimer's +diagnosis, we generate synthetic dementia patients from MCI patients in the +dataset. Experimental results show promising quantitative and qualitative +results on one of the most extensive and commonly used Alzheimer's disease +neuroimaging datasets in literature. + +
+
+
+
+
+ + ♻ ☆ Non-separable Covariance Kernels for Spatiotemporal Gaussian Processes + based on a Hybrid Spectral Method and the Harmonic Oscillator + + +
+ Gaussian processes provide a flexible, non-parametric framework for the +approximation of functions in high-dimensional spaces. The covariance kernel is +the main engine of Gaussian processes, incorporating correlations that underpin +the predictive distribution. For applications with spatiotemporal datasets, +suitable kernels should model joint spatial and temporal dependence. Separable +space-time covariance kernels offer simplicity and computational efficiency. +However, non-separable kernels include space-time interactions that better +capture observed correlations. Most non-separable kernels that admit explicit +expressions are based on mathematical considerations (admissibility conditions) +rather than first-principles derivations. We present a hybrid spectral approach +for generating covariance kernels which is based on physical arguments. We use +this approach to derive a new class of physically motivated, non-separable +covariance kernels which have their roots in the stochastic, linear, damped, +harmonic oscillator (LDHO). The new kernels incorporate functions with both +monotonic and oscillatory decay of space-time correlations. The LDHO covariance +kernels involve space-time interactions which are introduced by dispersion +relations that modulate the oscillator coefficients. We derive explicit +relations for the spatiotemporal covariance kernels in the three oscillator +regimes (underdamping, critical damping, overdamping) and investigate their +properties. + +
+
+ comment: 56 pages, 12 figures, five appendices +
+
+
+
+
+ + ♻ ☆ FedAVO: Improving Communication Efficiency in Federated Learning with + African Vultures Optimizer + + +
+ Federated Learning (FL), a distributed machine learning technique, has recently experienced tremendous growth in popularity due to its emphasis on user data privacy. However, the distributed computations of FL can result in constrained communication and drawn-out learning processes, necessitating client-server communication cost optimization. The ratio of chosen clients and the quantity of local training passes are two hyperparameters that have a significant impact on FL performance. Due to different training preferences across various applications, it can be difficult for FL practitioners to manually select such hyperparameters. In our research paper, we introduce FedAVO, a novel FL algorithm that enhances communication effectiveness by selecting the best hyperparameters leveraging the African Vulture Optimizer (AVO). Our research demonstrates that the communication costs associated with FL operations can be substantially reduced by adopting AVO for FL hyperparameter adjustment. Through extensive evaluations of FedAVO on benchmark datasets, we show that FedAVO achieves significant improvement in terms of model accuracy and communication rounds, particularly with realistic cases of Non-IID datasets. Our extensive evaluation of the FedAVO algorithm identifies the optimal hyperparameters that are appropriately fitted for the benchmark datasets, eventually increasing global model accuracy by 6% in comparison to state-of-the-art FL algorithms (such as FedAvg, FedProx, FedPSO, etc.). +
+
+ comment: 19 pages +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Neural Architectures Learning Fourier Transforms, Signal Processing and + Much More.... + + +
+ This report will explore and answer fundamental questions about taking Fourier Transforms and tying them to recent advances in AI and neural architecture. One interpretation of the Fourier Transform is decomposing a signal into its constituent components by projecting them onto complex exponentials. Variants exist, such as the discrete cosine transform, which does not operate in the complex domain and projects an input signal onto only cosine functions oscillating at different frequencies. However, fixing all kernels to be sinusoidal is a fundamental limitation and can be suboptimal: what if we could have some kernels adapted or learned according to the problem? What if we can use neural architectures for this? We show how one can learn these kernels from scratch for audio signal processing applications. We find that the neural architecture not only learns sinusoidal kernel shapes but discovers all kinds of incredible signal-processing properties, e.g., windowing functions, onset detectors, high-pass filters, low-pass filters, modulations, etc. Further, upon analysis of the filters, we find that the neural architecture has a comb-filter-like structure on top of the learned kernels. Comb filters, which allow harmonic frequencies to pass through, are one of the core building blocks of traditional signal processing, alongside high-pass, low-pass, and band-pass filters. Further, the convolution operation with a signal can also be learned from scratch, and we explore papers in the literature that combine this with robust Transformer architectures. Finally, we also explore making the learned kernels content-adaptive, i.e., learning different kernels for different inputs. +
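+ A simple way to realize "learnable Fourier-like kernels" is a 1-D convolutional front end whose filters are initialized as sinusoids and then trained end to end; the sketch below is our illustration of that idea (kernel count, length, and stride are assumptions, and the report's filter analyses are not reproduced):
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class LearnableFourierFrontEnd(nn.Module):
+     def __init__(self, n_kernels=64, kernel_size=400, stride=160):
+         super().__init__()
+         self.conv = nn.Conv1d(1, n_kernels, kernel_size, stride=stride, bias=False)
+         with torch.no_grad():          # initialize each kernel as a cosine of increasing frequency
+             t = torch.arange(kernel_size, dtype=torch.float32)
+             for k in range(n_kernels):
+                 self.conv.weight[k, 0] = torch.cos(2 * torch.pi * (k + 1) * t / kernel_size)
+
+     def forward(self, waveform):       # waveform: (batch, 1, samples)
+         return self.conv(waveform)     # kernels are free to drift away from pure sinusoids
+
+ frontend = LearnableFourierFrontEnd()
+ print(frontend(torch.randn(2, 1, 16000)).shape)   # one second of 16 kHz audio -> (2, 64, 98)
+ ```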
+
+ comment: 12 pages, 6 figures. Technical Report at Stanford University; + Presented on 14th August 2023 +
+
+
+
+
+ + ☆ WMFormer++: Nested Transformer for Visible Watermark Removal via Implicit Joint Learning + + +
+ Watermarking serves as a widely adopted approach to safeguard media copyright. In parallel, the research focus has extended to watermark removal techniques, offering an adversarial means to enhance watermark robustness and foster advancements in the watermarking field. Existing watermark removal methods often rely on UNet architectures with multiple decoder branches -- one for watermark localization and the other for background image restoration. These methods involve complex module designs to guide information flow for respective tasks, which can lead to suboptimal performance and an overly cumbersome model. To simplify the existing framework, we propose a novel Transformer-based approach with a unified decoder branch, treating watermark extraction and background restoration as a single task and allowing the network to learn information flow between them without artificial design patterns. Additionally, we utilize nested structures to facilitate multi-scale feature fusion, forming a parallel ensemble of nested structures that constitute the UNet. Supervision is applied to UNets with varying depths to facilitate knowledge learning across all levels. Extensive experiments are conducted on various challenging benchmarks to validate the effectiveness of our proposed method. The results demonstrate that our approach achieves state-of-the-art performance and produces high-quality images. +
+
+
+
+
+ + ♻ ☆ VoxBlink: X-Large Speaker Verification Dataset on Camera ICASSP2023 + + +
+ In this paper, we contribute a novel and extensive dataset for speaker +verification, which contains noisy 38k identities/1.45M utterances (VoxBlink) +and relatively cleaned 18k identities/1.02M (VoxBlink-Clean) utterances for +training. Firstly, we accumulate a 60K+ users' list with their avatars and +download their short videos on YouTube. We then established an automatic and +scalable pipeline to extract relevant speech and video segments from these +videos. To our knowledge, the VoxBlink dataset is one of the largest speaker +recognition datasets available. Secondly, we conduct a series of experiments +based on different backbones trained on a mix of the VoxCeleb2 and the +VoxBlink-Clean. Our findings highlight a notable performance improvement, +ranging from 13% to 30%, across different backbone architectures upon +integrating our dataset for training. The dataset will be made publicly +available shortly. + +
+
+ comment: submit to ICASSP2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 26 + +
+
+
+ + ☆ Bayes Risk Transducer: Transducer with Controllable Alignment Prediction + + +
+ Automatic speech recognition (ASR) based on transducers is widely used. In +training, a transducer maximizes the summed posteriors of all paths. The path +with the highest posterior is commonly defined as the predicted alignment +between the speech and the transcription. While the vanilla transducer does not +have a prior preference for any of the valid paths, this work intends to +enforce the preferred paths and achieve controllable alignment prediction. +Specifically, this work proposes Bayes Risk Transducer (BRT), which uses a +Bayes risk function to set lower risk values to the preferred paths so that the +predicted alignment is more likely to satisfy specific desired properties. We +further demonstrate that these predicted alignments with intentionally designed +properties can provide practical advantages over the vanilla transducer. +Experimentally, the proposed BRT saves inference cost by up to 46% for +non-streaming ASR and reduces overall system latency by 41% for streaming ASR. + +
+
+
+
+
+ + ☆ ASPIRE: Language-Guided Augmentation for Robust Image Classification + + +
+ Neural image classifiers can often learn to make predictions by overly +relying on non-predictive features that are spuriously correlated with the +class labels in the training data. This leads to poor performance in real-world +atypical scenarios where such features are absent. Supplementing the training +dataset with images without such spurious features can aid robust learning +against spurious correlations via better generalization. This paper presents +ASPIRE (Language-guided data Augmentation for SPurIous correlation REmoval), a +simple yet effective solution for expanding the training dataset with synthetic +images without spurious features. ASPIRE, guided by language, generates these +images without requiring any form of additional supervision or existing +examples. Precisely, we employ LLMs to first extract foreground and background +features from textual descriptions of an image, followed by advanced +language-guided image editing to discover the features that are spuriously +correlated with the class label. Finally, we personalize a text-to-image +generation model to generate diverse in-domain images without spurious +features. We demonstrate the effectiveness of ASPIRE on 4 datasets, including +the very challenging Hard ImageNet dataset, and 9 baselines and show that +ASPIRE improves the classification accuracy of prior methods by 1% - 38%. Code +soon at: https://github.com/Sreyan88/ASPIRE. + +
+
+ comment: Pre-print Under Review +
+
+
+
+
+ + ☆ Open, Closed, or Small Language Models for Text Classification? + + +
+ Recent advancements in large language models have demonstrated remarkable capabilities across various NLP tasks. But many questions remain, including whether open-source models match closed ones, why these models excel or struggle with certain tasks, and what types of practical procedures can improve performance. We address these questions in the context of classification by evaluating three classes of models using eight datasets across three distinct tasks: named entity recognition, political party prediction, and misinformation detection. While larger LLMs often lead to improved performance, open-source models can rival their closed-source counterparts by fine-tuning. Moreover, supervised smaller models, like RoBERTa, can achieve similar or even greater performance in many datasets compared to generative LLMs. On the other hand, closed models maintain an advantage in hard tasks that demand the most generalizability. This study underscores the importance of model selection based on task requirements. +
+
+ comment: 14 pages, 15 Tables, 1 Figure +
+
+
+
+
+ + ☆ PACE: Improving Prompt with Actor-Critic Editing for Large Language + Model + + +
+ Large language models (LLMs) have showcased remarkable potential across various tasks by conditioning on prompts. However, the quality of different human-written prompts leads to substantial discrepancies in LLMs' performance, and improving prompts usually necessitates considerable human effort and expertise. To this end, this paper proposes Prompt with Actor-Critic Editing (PACE) for LLMs to enable automatic prompt editing. Drawing inspiration from the actor-critic algorithm in reinforcement learning, PACE leverages LLMs in the dual roles of actors and critics, conceptualizing the prompt as a type of policy. PACE refines the prompt, taking into account feedback from both actors executing the prompt and critics criticizing the responses. This process helps better align the prompt with a specific task, thanks to real responses and reasoning from the LLMs. We conduct extensive experiments on 24 instruction induction tasks and 21 big-bench tasks. Experimental results indicate that PACE elevates the relative performance of medium/low-quality human-written prompts by up to 98\%, which is comparable to the performance of high-quality human-written prompts. Moreover, PACE also exhibits notable efficacy for prompt generation. +
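+ In outline, the PACE loop alternates acting, critiquing, and editing; the following hedged pseudocode reflects our reading of that loop, where `call_llm` is a hypothetical wrapper and the meta-prompts are ours rather than the paper's:
+ ```python
+ def call_llm(prompt: str) -> str:
+     raise NotImplementedError          # wrap any LLM API here
+
+ def pace_edit(prompt: str, task_inputs: list, rounds: int = 3) -> str:
+     for _ in range(rounds):
+         # actor: execute the current prompt on task inputs
+         responses = [call_llm(f"{prompt}\nInput: {x}") for x in task_inputs]
+         # critic: judge how well the responses satisfy the intended task
+         critique = call_llm(
+             "Critique the following prompt given its responses and suggest improvements.\n"
+             f"Prompt: {prompt}\nResponses: {responses}")
+         # editor: rewrite the prompt conditioned on the critique
+         prompt = call_llm(
+             f"Rewrite this prompt to address the critique.\nPrompt: {prompt}\nCritique: {critique}")
+     return prompt
+ ```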
+
+
+
+
+ + ☆ An Empirical Study of CLIP for Text-based Person Search + + +
+ Text-based Person Search (TBPS) aims to retrieve person images using natural language descriptions. Recently, Contrastive Language Image Pretraining (CLIP), a universal large cross-modal vision-language pre-training model, has performed remarkably well over various cross-modal downstream tasks due to its powerful cross-modal semantic learning capacity. TBPS, as a fine-grained cross-modal retrieval task, is also seeing a rise of research on CLIP-based TBPS. In order to explore the potential of the vision-language pre-training model for downstream TBPS tasks, this paper makes the first attempt to conduct a comprehensive empirical study of CLIP for TBPS and thus contributes a straightforward, incremental, yet strong TBPS-CLIP baseline to the TBPS community. We revisit critical design considerations under CLIP, including data augmentation and loss function. The model, with the aforementioned designs and practical training tricks, can attain satisfactory performance without any sophisticated modules. We also conduct probing experiments of TBPS-CLIP in model generalization and model compression, demonstrating the effectiveness of TBPS-CLIP from various aspects. This work is expected to provide empirical insights and highlight future CLIP-based TBPS research. +
+
+ comment: 13 pages, 5 fiugres and 17 tables. Code is available at + https://github.com/Flame-Chasers/TBPS-CLIP +
+
+
+
+
+ + ☆ GameEval: Evaluating LLMs on Conversational Games + + +
+ The rapid advancements in large language models (LLMs) have presented challenges in evaluating those models. Existing evaluation methods are either reference-based or preference-based, which inevitably need human intervention or introduce test bias caused by evaluator models. In this paper, we propose GameEval, a novel approach to evaluating LLMs through goal-driven conversational games, overcoming the limitations of previous methods. GameEval treats LLMs as game players and assigns them distinct roles with specific goals achieved by launching conversations of various forms, including discussion, question answering, and voting. We design three unique games with cooperative or adversarial objectives, accompanied by corresponding evaluation metrics, to show how this new paradigm comprehensively evaluates model performance. Through extensive experiments, we show that GameEval can effectively differentiate the capabilities of various LLMs, providing a comprehensive assessment of their integrated abilities to solve complex problems. Our public anonymous code is available at https://github.com/GameEval/GameEval. +
+
+
+
+
+ + ☆ ControlRetriever: Harnessing the Power of Instructions for Controllable + Retrieval + + +
+ Recent studies have shown that dense retrieval models, lacking dedicated +training data, struggle to perform well across diverse retrieval tasks, as +different retrieval tasks often entail distinct search intents. To address this +challenge, in this work we introduce ControlRetriever, a generic and efficient +approach with a parameter isolated architecture, capable of controlling dense +retrieval models to directly perform varied retrieval tasks, harnessing the +power of instructions that explicitly describe retrieval intents in natural +language. Leveraging the foundation of ControlNet, which has proven powerful in +text-to-image generation, ControlRetriever imbues different retrieval models +with the new capacity of controllable retrieval, all while being guided by +task-specific instructions. Furthermore, we propose a novel LLM guided +Instruction Synthesizing and Iterative Training strategy, which iteratively +tunes ControlRetriever based on extensive automatically-generated retrieval +data with diverse instructions by capitalizing the advancement of large +language models. Extensive experiments show that in the BEIR benchmark, with +only natural language descriptions of specific retrieval intent for each task, +ControlRetriever, as a unified multi-task retrieval system without +task-specific tuning, significantly outperforms baseline methods designed with +task-specific retrievers and also achieves state-of-the-art zero-shot +performance. + +
+
+
+
+
+ + ☆ HICL: Hashtag-Driven In-Context Learning for Social Media Natural + Language Understanding + + +
+ Natural language understanding (NLU) is integral to various social media applications. However, existing NLU models rely heavily on context for semantic learning, resulting in compromised performance when faced with short and noisy social media content. To address this issue, we leverage in-context learning (ICL), wherein language models learn to make inferences by conditioning on a handful of demonstrations to enrich the context, and propose a novel hashtag-driven in-context learning (HICL) framework. Concretely, we pre-train a model #Encoder, which employs #hashtags (user-annotated topic labels) to drive BERT-based pre-training through contrastive learning. Our objective here is to enable #Encoder to incorporate topic-related semantic information, which allows it to retrieve topic-related posts to enrich contexts and enhance social media NLU with noisy contexts. To further integrate the retrieved context with the source text, we employ a gradient-based method to identify trigger terms useful in fusing information from both sources. For empirical studies, we collected 45M tweets to set up an in-context NLU benchmark, and the experimental results on seven downstream tasks show that HICL substantially advances the previous state-of-the-art results. Furthermore, we conducted extensive analyses and found that: (1) combining source input with a top-retrieved post from #Encoder is more effective than using semantically similar posts; (2) trigger words largely help in merging context from the source and retrieved posts. +
+
+ comment: https://github.com/albertan017/HICL +
+
+
+
+
+ + ☆ FinEval: A Chinese Financial Domain Knowledge Evaluation Benchmark for + Large Language Models + + +
+ Large language models (LLMs) have demonstrated exceptional performance in +various natural language processing tasks, yet their efficacy in more +challenging and domain-specific tasks remains largely unexplored. This paper +presents FinEval, a benchmark specifically designed for the financial domain +knowledge in the LLMs. FinEval is a collection of high-quality multiple-choice +questions covering Finance, Economy, Accounting, and Certificate. It includes +4,661 questions spanning 34 different academic subjects. To ensure a +comprehensive model performance evaluation, FinEval employs a range of prompt +types, including zero-shot and few-shot prompts, as well as answer-only and +chain-of-thought prompts. Evaluating state-of-the-art Chinese and English LLMs +on FinEval, the results show that only GPT-4 achieved an accuracy close to 70% +in different prompt settings, indicating significant growth potential for LLMs +in the financial domain knowledge. Our work offers a more comprehensive +financial knowledge evaluation benchmark, utilizing data of mock exams and +covering a wide range of evaluated LLMs. + +
+
+
+
+
+ + ☆ Tackling Vision Language Tasks Through Learning Inner Monologues + + +
+ Visual language tasks require AI models to comprehend and reason with both +visual and textual content. Driven by the power of Large Language Models +(LLMs), two prominent methods have emerged: (1) the hybrid integration between +LLMs and Vision-Language Models (VLMs), where visual inputs are firstly +converted into language descriptions by VLMs, serving as inputs for LLMs to +generate final answer(s); (2) visual feature alignment in language space, where +visual inputs are encoded as embeddings and projected to LLMs' language space +via further supervised fine-tuning. The first approach provides light training +costs and interpretability but is hard to be optimized in an end-to-end +fashion. The second approach presents decent performance, but feature alignment +usually requires large amounts of training data and lacks interpretability. To +tackle this dilemma, we propose a novel approach, Inner Monologue Multi-Modal +Optimization (IMMO), to solve complex vision language problems by simulating +inner monologue processes, a cognitive process in which an individual engages +in silent verbal communication with themselves. We enable LLMs and VLMs to +interact through natural language conversation and propose to use a two-stage +training process to learn how to do the inner monologue (self-asking questions +and answering questions). IMMO is evaluated on two popular tasks and the +results suggest by emulating the cognitive phenomenon of internal dialogue, our +approach can enhance reasoning and explanation abilities, contributing to the +more effective fusion of vision and language models. More importantly, instead +of using predefined human-crafted monologues, IMMO learns this process within +the deep learning models, promising wider applicability to many different AI +problems beyond vision language tasks. + +
+
+
+
+
+ + ☆ Data-to-text Generation for Severely Under-Resourced Languages with + GPT-3.5: A Bit of Help Needed from Google Translate + + +
+ LLMs like GPT are great at tasks involving English, which dominates their training data. In this paper, we look at how they cope with tasks involving languages that are severely under-represented in their training data, in the context of data-to-text generation for Irish, Maltese, Welsh and Breton. During the prompt-engineering phase we tested a range of prompt types and formats on GPT-3.5 and GPT-4 with a small sample of example input/output pairs. We then fully evaluated the two most promising prompts in two scenarios: (i) direct generation into the under-resourced language, and (ii) generation into English followed by translation into the under-resourced language. We find that few-shot prompting works better for direct generation into under-resourced languages, but that the difference disappears when pivoting via English. The few-shot + translation system variants were submitted to the WebNLG 2023 shared task where they outperformed competitor systems by substantial margins in all languages on all metrics. We conclude that good performance on under-resourced languages can be achieved out of the box with state-of-the-art LLMs. However, our best results (for Welsh) remain well below the lowest ranked English system at WebNLG'20. +
+
+
+
+
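+ <p>
+ A minimal sketch of the two scenarios compared above: direct generation
+ into the target language versus generation into English followed by machine
+ translation. The <code>llm</code> and <code>translate</code> functions are
+ hypothetical wrappers (e.g., around GPT-3.5 and an MT service), and the
+ few-shot example is a placeholder.
+ </p>
+ <pre><code>
+FEW_SHOT = [
+    # (input data, verbalisation) pairs included in the prompt
+    ("Dublin | country | Ireland", "Dublin is a city in Ireland."),
+]
+
+def build_prompt(triples, target_lang, examples=FEW_SHOT):
+    parts = [f"Verbalise the data as fluent {target_lang} text."]
+    for data, text in examples:
+        parts.append(f"Data: {data}\nText: {text}")
+    parts.append(f"Data: {triples}\nText:")
+    return "\n\n".join(parts)
+
+def direct_generation(triples, lang, llm):
+    return llm(build_prompt(triples, lang))
+
+def pivot_generation(triples, lang, llm, translate):
+    english = llm(build_prompt(triples, "English"))
+    return translate(english, source="en", target=lang)
+</code></pre>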
+ + ☆ Eva-KELLM: A New Benchmark for Evaluating Knowledge Editing of LLMs + + +
+ Large language models (LLMs) possess a wealth of knowledge encoded in their +parameters. However, this knowledge may become outdated or unsuitable over +time. As a result, there has been a growing interest in knowledge editing for +LLMs and evaluating its effectiveness. Existing studies primarily focus on +knowledge editing using factual triplets, which not only incur high costs for +collection but also struggle to express complex facts. Furthermore, these +studies are often limited in their evaluation perspectives. In this paper, we +propose Eva-KELLM, a new benchmark for evaluating knowledge editing of LLMs. +This benchmark includes an evaluation framework and a corresponding dataset. +Under our framework, we first ask the LLM to perform knowledge editing using +raw documents, which provides a more convenient and universal approach compared +to using factual triplets. We then evaluate the updated LLM from multiple +perspectives. In addition to assessing the effectiveness of knowledge editing +and the retention of unrelated knowledge from conventional studies, we further +test the LLM's ability in two aspects: 1) Reasoning with the altered knowledge, +aiming for the LLM to genuinely learn the altered knowledge instead of simply +memorizing it. 2) Cross-lingual knowledge transfer, where the LLM updated with +raw documents in one language should be capable of handling queries from +another language. To facilitate further research, we construct and release the +corresponding dataset. Using this benchmark, we investigate the effectiveness +of several commonly-used knowledge editing methods. Experimental results +indicate that the current methods for knowledge editing using raw documents are +not effective in yielding satisfactory results, particularly when it comes to +reasoning with altered knowledge and cross-lingual knowledge transfer. + +
+
+
+
+
+ + ☆ Utilizing Semantic Textual Similarity for Clinical Survey Data Feature + Selection + + +
+ Survey data can contain a high number of features while having a +comparatively low quantity of examples. Machine learning models that attempt to +predict outcomes from survey data under these conditions can overfit and result +in poor generalizability. One remedy to this issue is feature selection, which +attempts to select an optimal subset of features to learn upon. A relatively +unexplored source of information in the feature selection process is the usage +of textual names of features, which may be semantically indicative of which +features are relevant to a target outcome. The relationships between feature +names and target names can be evaluated using language models (LMs) to produce +semantic textual similarity (STS) scores, which can then be used to select +features. We examine the performance using STS to select features directly and +in the minimal-redundancy-maximal-relevance (mRMR) algorithm. The performance +of STS as a feature selection metric is evaluated against preliminary survey +data collected as a part of a clinical study on persistent post-surgical pain +(PPSP). The results suggest that features selected with STS can result in +higher performance models compared to traditional feature selection algorithms. + +
+
+
+
+
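+ <p>
+ A small sketch of STS-based feature ranking as described above: each survey
+ feature name is scored against the target outcome's name and the top-k
+ features are kept. The encoder name and k are illustrative assumptions, not
+ the paper's configuration.
+ </p>
+ <pre><code>
+from sentence_transformers import SentenceTransformer, util
+
+def rank_features_by_sts(feature_names, target_name, k=20):
+    model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed encoder choice
+    feat_emb = model.encode(feature_names, convert_to_tensor=True)
+    target_emb = model.encode(target_name, convert_to_tensor=True)
+    scores = util.cos_sim(feat_emb, target_emb).squeeze(1)  # one score per feature
+    order = scores.argsort(descending=True)[:k]
+    return [feature_names[int(i)] for i in order]
+
+# selected = rank_features_by_sts(survey_columns, "persistent post-surgical pain")
+</code></pre>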
+ + ☆ Inductive-bias Learning: Generating Code Models with Large Language + Model + + +
+ Large Language Models (LLMs) have been attracting attention due to an
+ability called in-context learning (ICL). With ICL, highly accurate
+inference based on rules ``in the context'' can be achieved by merely
+feeding training data into the prompt, without updating the parameters of
+the LLM. Although ICL is a developing field with many unanswered questions,
+the LLM itself serves as the inference model, seemingly realizing inference
+without an explicitly specified ``inductive bias''. Code generation is
+another highlighted application of LLMs. The accuracy of code generation has
+dramatically improved, enabling even non-engineers to generate code that
+performs the desired tasks by crafting appropriate prompts. In this paper,
+we propose a novel ``learning'' method called ``Inductive-Bias Learning
+(IBL)'', which combines the techniques of ICL and code generation. The idea
+of IBL is straightforward: like ICL, IBL feeds training data into the prompt
+and, from this ``contextual understanding'', outputs code with the structure
+necessary for inference (which we refer to as a ``Code Model''). Despite
+being a seemingly simple approach, IBL combines the ``inference without
+explicit inductive bias'' property inherent to ICL with the ``readability
+and explainability'' of code generation. Surprisingly, the generated Code
+Models achieve predictive accuracy comparable to, and in some cases
+surpassing, ICL and representative machine learning models. Our IBL code is
+open source: https://github.com/fuyu-quant/IBLM
+
+
+ comment: 17 pages, 8 figures +
+
+
+
+
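+ <p>
+ A hedged sketch of the loop described above: the training data is placed in
+ the prompt, the LLM is asked to emit a small "Code Model" (a predict
+ function), and that code is then compiled and used for inference. The
+ <code>llm</code> completion function and the prompt wording are assumptions,
+ not the authors' released implementation.
+ </p>
+ <pre><code>
+def fit_code_model(rows, llm):
+    table = "\n".join(", ".join(map(str, r)) for r in rows)
+    prompt = (
+        "Each line below is 'feature_1, ..., feature_n, label'.\n"
+        f"{table}\n"
+        "Write a Python function predict(features) that returns the label "
+        "for a new feature list. Return only code."
+    )
+    code = llm(prompt)
+    namespace = {}
+    exec(code, namespace)  # trusts the generated code; sandbox it in practice
+    return namespace["predict"]
+
+# predict = fit_code_model(train_rows, llm)
+# predictions = [predict(x) for x in test_features]
+</code></pre>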
+ + ☆ Breaking Language Barriers: A Question Answering Dataset for Hindi and + Marathi + + +
+ The recent advances in deep-learning have led to the development of highly +sophisticated systems with an unquenchable appetite for data. On the other +hand, building good deep-learning models for low-resource languages remains a +challenging task. This paper focuses on developing a Question Answering dataset +for two such languages- Hindi and Marathi. Despite Hindi being the 3rd most +spoken language worldwide, with 345 million speakers, and Marathi being the +11th most spoken language globally, with 83.2 million speakers, both languages +face limited resources for building efficient Question Answering systems. To +tackle the challenge of data scarcity, we have developed a novel approach for +translating the SQuAD 2.0 dataset into Hindi and Marathi. We release the +largest Question-Answering dataset available for these languages, with each +dataset containing 28,000 samples. We evaluate the dataset on various +architectures and release the best-performing models for both Hindi and +Marathi, which will facilitate further research in these languages. Leveraging +similarity tools, our method holds the potential to create datasets in diverse +languages, thereby enhancing the understanding of natural language across +varied linguistic contexts. Our fine-tuned models, code, and dataset will be +made publicly available. + +
+
+
+
+
+ + ☆ Black-box Adversarial Attacks against Dense Retrieval Models: A + Multi-view Contrastive Learning Method CIKM2023 + + +
+ Neural ranking models (NRMs) and dense retrieval (DR) models have given rise +to substantial improvements in overall retrieval performance. In addition to +their effectiveness, and motivated by the proven lack of robustness of deep +learning-based approaches in other areas, there is growing interest in the +robustness of deep learning-based approaches to the core retrieval problem. +Adversarial attack methods that have so far been developed mainly focus on +attacking NRMs, with very little attention being paid to the robustness of DR +models. In this paper, we introduce the adversarial retrieval attack (AREA) +task. The AREA task is meant to trick DR models into retrieving a target +document that is outside the initial set of candidate documents retrieved by +the DR model in response to a query. We consider the decision-based black-box +adversarial setting, which is realistic in real-world search engines. To +address the AREA task, we first employ existing adversarial attack methods +designed for NRMs. We find that the promising results that have previously been +reported on attacking NRMs, do not generalize to DR models: these methods +underperform a simple term spamming method. We attribute the observed lack of +generalizability to the interaction-focused architecture of NRMs, which +emphasizes fine-grained relevance matching. DR models follow a different +representation-focused architecture that prioritizes coarse-grained +representations. We propose to formalize attacks on DR models as a contrastive +learning problem in a multi-view representation space. The core idea is to +encourage the consistency between each view representation of the target +document and its corresponding viewer via view-wise supervision signals. +Experimental results demonstrate that the proposed method can significantly +outperform existing attack strategies in misleading the DR model with small +indiscernible text perturbations. + +
+
+ comment: Accept by CIKM2023, 10 pages +
+
+
+
+
+ + ☆ UniDoc: A Universal Large Multimodal Model for Simultaneous Text + Detection, Recognition, Spotting and Understanding + + +
+ In the era of Large Language Models (LLMs), tremendous strides have been made +in the field of multimodal understanding. However, existing advanced algorithms +are limited to effectively utilizing the immense representation capabilities +and rich world knowledge inherent to these large pre-trained models, and the +beneficial connections among tasks within the context of text-rich scenarios +have not been sufficiently explored. In this work, we introduce UniDoc, a novel +multimodal model equipped with text detection and recognition capabilities, +which are deficient in existing approaches. Moreover, UniDoc capitalizes on the +beneficial interactions among tasks to enhance the performance of each +individual task. To implement UniDoc, we perform unified multimodal instruct +tuning on the contributed large-scale instruction following datasets. +Quantitative and qualitative experimental results show that UniDoc sets +state-of-the-art scores across multiple challenging benchmarks. To the best of +our knowledge, this is the first large multimodal model capable of simultaneous +text detection, recognition, spotting, and understanding. + +
+
+
+
+
+ + ☆ Optimizing Multi-Class Text Classification: A Diverse Stacking Ensemble + Framework Utilizing Transformers + + +
+ Customer reviews play a crucial role in assessing customer satisfaction, +gathering feedback, and driving improvements for businesses. Analyzing these +reviews provides valuable insights into customer sentiments, including +compliments, comments, and suggestions. Text classification techniques enable +businesses to categorize customer reviews into distinct categories, +facilitating a better understanding of customer feedback. However, challenges +such as overfitting and bias limit the effectiveness of a single classifier in +ensuring optimal prediction. This study proposes a novel approach to address +these challenges by introducing a stacking ensemble-based multi-text +classification method that leverages transformer models. By combining multiple +single transformers, including BERT, ELECTRA, and DistilBERT, as base-level +classifiers, and a meta-level classifier based on RoBERTa, an optimal +predictive model is generated. The proposed stacking ensemble-based multi-text +classification method aims to enhance the accuracy and robustness of customer +review analysis. Experimental evaluations conducted on a real-world customer +review dataset demonstrate the effectiveness and superiority of the proposed +approach over traditional single classifier models. The stacking ensemble-based +multi-text classification method using transformers proves to be a promising +solution for businesses seeking to extract valuable insights from customer +reviews and make data-driven decisions to enhance customer satisfaction and +drive continuous improvement. + +
+
+
+
+
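+ <p>
+ A structural sketch of the stacking idea: class probabilities from several
+ base classifiers are concatenated into meta-features for a second-level
+ learner. The paper uses fine-tuned BERT, ELECTRA and DistilBERT bases with a
+ RoBERTa meta-classifier; here the bases are abstracted as callables and a
+ logistic regression stands in for the meta-level model.
+ </p>
+ <pre><code>
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def build_meta_features(texts, base_predict_probas):
+    # base_predict_probas: callables mapping texts to an (n_samples, n_classes)
+    # probability array (hypothetical wrappers around fine-tuned transformers).
+    return np.hstack([f(texts) for f in base_predict_probas])
+
+def fit_meta_classifier(train_texts, labels, base_predict_probas):
+    meta_features = build_meta_features(train_texts, base_predict_probas)
+    meta = LogisticRegression(max_iter=1000)
+    meta.fit(meta_features, labels)
+    return meta
+</code></pre>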
+ + ☆ Causal Intersectionality and Dual Form of Gradient Descent for + Multimodal Analysis: a Case Study on Hateful Memes + + +
+ In the wake of the explosive growth of machine learning (ML) usage, +particularly within the context of emerging Large Language Models (LLMs), +comprehending the semantic significance rooted in their internal workings is +crucial. While causal analyses focus on defining semantics and its +quantification, the gradient-based approach is central to explainable AI (XAI), +tackling the interpretation of the black box. By synergizing these approaches, +the exploration of how a model's internal mechanisms illuminate its causal +effect has become integral for evidence-based decision-making. A parallel line +of research has revealed that intersectionality - the combinatory impact of +multiple demographics of an individual - can be structured in the form of an +Averaged Treatment Effect (ATE). Initially, this study illustrates that the +hateful memes detection problem can be formulated as an ATE, assisted by the +principles of intersectionality, and that a modality-wise summarization of +gradient-based attention attribution scores can delineate the distinct +behaviors of three Transformerbased models concerning ATE. Subsequently, we +show that the latest LLM LLaMA2 has the ability to disentangle the +intersectional nature of memes detection in an in-context learning setting, +with their mechanistic properties elucidated via meta-gradient, a secondary +form of gradient. In conclusion, this research contributes to the ongoing +dialogue surrounding XAI and the multifaceted nature of ML models. + +
+
+
+
+
+ + ♻ ☆ Efficient Guided Generation for Large Language Models + + +
+ In this article we show how the problem of neural text generation can be
+constructively reformulated in terms of transitions between the states of a
+finite-state machine. This framework leads to an efficient approach to
+guiding text generation with regular expressions and context-free grammars
+by allowing the construction of an index over a language model's vocabulary.
+The approach is model agnostic, allows one to enforce domain-specific
+knowledge and constraints, and enables the construction of reliable
+interfaces by guaranteeing the structure of the generated text. It adds
+little overhead to the token sequence generation process and significantly
+outperforms existing solutions. An implementation is provided in the open
+source Python library Outlines.
+
+
+
+
+
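+ <p>
+ A naive sketch of regex-constrained decoding: at each step, vocabulary
+ tokens whose addition could no longer lead to a string matching the pattern
+ are masked out. This shows only the per-step filtering idea; the paper's
+ contribution is an index over the vocabulary (built from a finite-state
+ machine) that avoids rescanning the prefix at every step.
+ </p>
+ <pre><code>
+import regex  # the third-party 'regex' package, which supports partial matching
+
+def allowed_token_ids(prefix, vocab, pattern):
+    compiled = regex.compile(pattern)
+    allowed = []
+    for token_id, token_str in enumerate(vocab):
+        candidate = prefix + token_str
+        # A partial match means the candidate can still be extended into a
+        # string that fully matches the pattern.
+        if compiled.fullmatch(candidate, partial=True):
+            allowed.append(token_id)
+    return allowed
+
+# allowed_token_ids("4", ["2", ".", "a", "5"], r"\d+\.\d+") keeps "2", ".", "5"
+</code></pre>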
+ + ♻ ☆ Zero-Shot Composed Image Retrieval with Textual Inversion ICCV2023 + + +
+ Composed Image Retrieval (CIR) aims to retrieve a target image based on a +query composed of a reference image and a relative caption that describes the +difference between the two images. The high effort and cost required for +labeling datasets for CIR hamper the widespread usage of existing methods, as +they rely on supervised learning. In this work, we propose a new task, +Zero-Shot CIR (ZS-CIR), that aims to address CIR without requiring a labeled +training dataset. Our approach, named zero-Shot composEd imAge Retrieval with +textuaL invErsion (SEARLE), maps the visual features of the reference image +into a pseudo-word token in CLIP token embedding space and integrates it with +the relative caption. To support research on ZS-CIR, we introduce an +open-domain benchmarking dataset named Composed Image Retrieval on Common +Objects in context (CIRCO), which is the first dataset for CIR containing +multiple ground truths for each query. The experiments show that SEARLE +exhibits better performance than the baselines on the two main datasets for CIR +tasks, FashionIQ and CIRR, and on the proposed CIRCO. The dataset, the code and +the model are publicly available at https://github.com/miccunifi/SEARLE. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ Federated Few-Shot Learning for Mobile NLP + + +
+ Natural language processing (NLP) sees rich mobile applications. To support +various language understanding tasks, a foundation NLP model is often +fine-tuned in a federated, privacy-preserving setting (FL). This process +currently relies on at least hundreds of thousands of labeled training samples +from mobile clients; yet mobile users often lack willingness or knowledge to +label their data. Such an inadequacy of data labels is known as a few-shot +scenario; it becomes the key blocker for mobile NLP applications. + For the first time, this work investigates federated NLP in the few-shot +scenario (FedFSL). By retrofitting algorithmic advances of pseudo labeling and +prompt learning, we first establish a training pipeline that delivers +competitive accuracy when only 0.05% (fewer than 100) of the training data is +labeled and the remaining is unlabeled. To instantiate the workflow, we further +present a system FeS, addressing the high execution cost with novel designs. +(1) Curriculum pacing, which injects pseudo labels to the training workflow at +a rate commensurate to the learning progress; (2) Representational diversity, a +mechanism for selecting the most learnable data, only for which pseudo labels +will be generated; (3) Co-planning of a model's training depth and layer +capacity. Together, these designs reduce the training delay, client energy, and +network traffic by up to 46.0$\times$, 41.2$\times$ and 3000.0$\times$, +respectively. Through algorithm/system co-design, FFNLP demonstrates that FL +can apply to challenging settings where most training samples are unlabeled. + +
+
+ comment: MobiCom 2023 +
+
+
+
+
+ + ♻ ☆ Towards Practical Few-shot Federated NLP EuroSys23 + + +
+ Transformer-based pre-trained models have emerged as the predominant solution +for natural language processing (NLP). Fine-tuning such pre-trained models for +downstream tasks often requires a considerable amount of labeled private data. +In practice, private data is often distributed across heterogeneous mobile +devices and may be prohibited from being uploaded. Moreover, well-curated +labeled data is often scarce, presenting an additional challenge. To address +these challenges, we first introduce a data generator for federated few-shot +learning tasks, which encompasses the quantity and skewness of scarce labeled +data in a realistic setting. Subsequently, we propose AUG-FedPrompt, a +prompt-based federated learning system that exploits abundant unlabeled data +for data augmentation. Our experiments indicate that AUG-FedPrompt can perform +on par with full-set fine-tuning with a limited amount of labeled data. +However, such competitive performance comes at a significant system cost. + +
+
+ comment: EuroSys23 workshop +
+
+
+
+
+ + ♻ ☆ YATO: Yet Another deep learning based Text analysis Open toolkit + + +
+ We introduce YATO, an open-source, easy-to-use toolkit for text analysis with +deep learning. Different from existing heavily engineered toolkits and +platforms, YATO is lightweight and user-friendly for researchers from +cross-disciplinary areas. Designed in a hierarchical structure, YATO supports +free combinations of three types of widely used features including 1) +traditional neural networks (CNN, RNN, etc.); 2) pre-trained language models +(BERT, RoBERTa, ELECTRA, etc.); and 3) user-customized neural features via a +simple configurable file. Benefiting from the advantages of flexibility and +ease of use, YATO can facilitate fast reproduction and refinement of +state-of-the-art NLP models, and promote the cross-disciplinary applications of +NLP techniques. The code, examples, and documentation are publicly available at +https://github.com/jiesutd/YATO. A demo video is also available at +https://youtu.be/tSjjf5BzfQg. + +
+
+
+
+
+ + ♻ ☆ Single-Sentence Reader: A Novel Approach for Addressing Answer Position + Bias + + +
+ Machine Reading Comprehension (MRC) models tend to take advantage of spurious +correlations (also known as dataset bias or annotation artifacts in the +research community). Consequently, these models may perform the MRC task +without fully comprehending the given context and question, which is +undesirable since it may result in low robustness against distribution shift. +This paper delves into the concept of answer-position bias, where a significant +percentage of training questions have answers located solely in the first +sentence of the context. We propose a Single-Sentence Reader as a new approach +for addressing answer position bias in MRC. We implement this approach using +six different models and thoroughly analyze their performance. Remarkably, our +proposed Single-Sentence Readers achieve results that nearly match those of +models trained on conventional training sets, proving their effectiveness. Our +study also discusses several challenges our Single-Sentence Readers encounter +and proposes a potential solution. + +
+
+ comment: We need to revise our paper +
+
+
+
+
+ + ♻ ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT, +has revolutionized various research fields in artificial intelligence. +Transformers-based large language models (LLMs) have gradually replaced CNNs +and RNNs to unify fields of computer vision and natural language processing. +Compared with the data that exists relatively independently such as images, +videos or texts, graph is a type of data that contains rich structural and +relational information. Meanwhile, natural language, as one of the most +expressive mediums, excels in describing complex structures. However, existing +work on incorporating graph learning problems into the generative language +modeling framework remains very limited. As the importance of large language +models continues to grow, it becomes essential to explore whether LLMs can also +replace GNNs as the foundation model for graphs. In this paper, we propose +InstructGLM (Instruction-finetuned Graph Language Model), systematically design +highly scalable prompts based on natural language instructions, and use natural +language to describe the geometric structure and node features of the graph for +instruction tuning an LLM to perform learning and inference on graphs in a +generative manner. Our method exceeds all competitive GNN baselines on +ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of +our method and sheds light on generative large language models as the +foundation model for graph machine learning. + +
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
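+ <p>
+ An illustrative sketch of the "describe the graph in natural language" step:
+ a node's text and neighborhood are flattened into an instruction an LLM can
+ be tuned on. The wording and the networkx usage are assumptions, not the
+ paper's exact prompt templates.
+ </p>
+ <pre><code>
+import networkx as nx
+
+def node_to_instruction(graph, node, feature_key="title"):
+    neighbors = list(graph.neighbors(node))
+    neighbor_txt = "; ".join(
+        f"{n} ({graph.nodes[n].get(feature_key, 'no text')})" for n in neighbors
+    )
+    return (
+        f"Node {node} has text: {graph.nodes[node].get(feature_key, 'no text')}. "
+        f"It is connected to: {neighbor_txt}. "
+        "Which category does this node belong to?"
+    )
+
+# g = nx.karate_club_graph(); print(node_to_instruction(g, 0))
+</code></pre>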
+
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ Large Language Models as Zero-Shot Conversational Recommenders CIKM 2023 + + +
+ In this paper, we present empirical studies on conversational recommendation +tasks using representative large language models in a zero-shot setting with +three primary contributions. (1) Data: To gain insights into model behavior in +"in-the-wild" conversational recommendation scenarios, we construct a new +dataset of recommendation-related conversations by scraping a popular +discussion website. This is the largest public real-world conversational +recommendation dataset to date. (2) Evaluation: On the new dataset and two +existing conversational recommendation datasets, we observe that even without +fine-tuning, large language models can outperform existing fine-tuned +conversational recommendation models. (3) Analysis: We propose various probing +tasks to investigate the mechanisms behind the remarkable performance of large +language models in conversational recommendation. We analyze both the large +language models' behaviors and the characteristics of the datasets, providing a +holistic understanding of the models' effectiveness, limitations and suggesting +directions for the design of future conversational recommenders + +
+
+ comment: Accepted as CIKM 2023 long paper. Longer version is coming soon + (e.g., more details about dataset) +
+
+
+
+
+ + ☆ Voucher Abuse Detection with Prompt-based Fine-tuning on Graph Neural + Networks CIKM23 + + +
+ Voucher abuse detection is an important anomaly detection problem in +E-commerce. While many GNN-based solutions have emerged, the supervised +paradigm depends on a large quantity of labeled data. A popular alternative is +to adopt self-supervised pre-training using label-free data, and further +fine-tune on a downstream task with limited labels. Nevertheless, the +"pre-train, fine-tune" paradigm is often plagued by the objective gap between +pre-training and downstream tasks. Hence, we propose VPGNN, a prompt-based +fine-tuning framework on GNNs for voucher abuse detection. We design a novel +graph prompting function to reformulate the downstream task into a similar +template as the pretext task in pre-training, thereby narrowing the objective +gap. Extensive experiments on both proprietary and public datasets demonstrate +the strength of VPGNN in both few-shot and semi-supervised scenarios. Moreover, +an online deployment of VPGNN in a production environment shows a 23.4% +improvement over two existing deployed models. + +
+
+ comment: 7 pages, Accepted by CIKM23 Applied Research Track +
+
+
+
+
+ + ☆ Explicit Time Embedding Based Cascade Attention Network for Information + Popularity Prediction + + +
+ Predicting information cascade popularity is a fundamental problem in social +networks. Capturing temporal attributes and cascade role information (e.g., +cascade graphs and cascade sequences) is necessary for understanding the +information cascade. Current methods rarely focus on unifying this information +for popularity predictions, which prevents them from effectively modeling the +full properties of cascades to achieve satisfactory prediction performances. In +this paper, we propose an explicit Time embedding based Cascade Attention +Network (TCAN) as a novel popularity prediction architecture for large-scale +information networks. TCAN integrates temporal attributes (i.e., periodicity, +linearity, and non-linear scaling) into node features via a general time +embedding approach (TE), and then employs a cascade graph attention encoder +(CGAT) and a cascade sequence attention encoder (CSAT) to fully learn the +representation of cascade graphs and cascade sequences. We use two real-world +datasets (i.e., Weibo and APS) with tens of thousands of cascade samples to +validate our methods. Experimental results show that TCAN obtains mean +logarithm squared errors of 2.007 and 1.201 and running times of 1.76 hours and +0.15 hours on both datasets, respectively. Furthermore, TCAN outperforms other +representative baselines by 10.4%, 3.8%, and 10.4% in terms of MSLE, MAE, and +R-squared on average while maintaining good interpretability. + +
+
+
+
+
+ + ☆ Time-aligned Exposure-enhanced Model for Click-Through Rate Prediction + + +
+ Click-Through Rate (CTR) prediction, crucial in applications like recommender +systems and online advertising, involves ranking items based on the likelihood +of user clicks. User behavior sequence modeling has marked progress in CTR +prediction, which extracts users' latent interests from their historical +behavior sequences to facilitate accurate CTR prediction. Recent research +explores using implicit feedback sequences, like unclicked records, to extract +diverse user interests. However, these methods encounter key challenges: 1) +temporal misalignment due to disparate sequence time ranges and 2) the lack of +fine-grained interaction among feedback sequences. To address these challenges, +we propose a novel framework called TEM4CTR, which ensures temporal alignment +among sequences while leveraging auxiliary feedback information to enhance +click behavior at the item level through a representation projection mechanism. +Moreover, this projection-based information transfer module can effectively +alleviate the negative impact of irrelevant or even potentially detrimental +components of the auxiliary feedback information on the learning process of +click behavior. Comprehensive experiments on public and industrial datasets +confirm the superiority and effectiveness of TEM4CTR, showcasing the +significance of temporal alignment in multi-feedback modeling. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ printf: Preference Modeling Based on User Reviews with Item Images and + Textual Information via Graph Learning CIKM '23 + + +
+ Nowadays, modern recommender systems usually leverage textual and visual +contents as auxiliary information to predict user preference. For textual +information, review texts are one of the most popular contents to model user +behaviors. Nevertheless, reviews usually lose their shine when it comes to +top-N recommender systems because those that solely utilize textual reviews as +features struggle to adequately capture the interaction relationships between +users and items. For visual one, it is usually modeled with naive convolutional +networks and also hard to capture high-order relationships between users and +items. Moreover, previous works did not collaboratively use both texts and +images in a proper way. In this paper, we propose printf, preference modeling +based on user reviews with item images and textual information via graph +learning, to address the above challenges. Specifically, the dimension-based +attention mechanism directs relations between user reviews and interacted +items, allowing each dimension to contribute different importance weights to +derive user representations. Extensive experiments are conducted on three +publicly available datasets. The experimental results demonstrate that our +proposed printf consistently outperforms baseline methods with the relative +improvements for NDCG@5 of 26.80%, 48.65%, and 25.74% on Amazon-Grocery, +Amazon-Tools, and Amazon-Electronics datasets, respectively. The in-depth +analysis also indicates the dimensions of review representations definitely +have different topics and aspects, assisting the validity of our model design. + +
+
+ comment: In Proceedings of The 32nd ACM International Conference on + Information and Knowledge Management (CIKM '23), ACM, 2023 +
+
+
+
+
+ + ☆ RAH! RecSys-Assistant-Human: A Human-Central Recommendation Framework + with Large Language Models + + +
+ The recommendation ecosystem involves interactions between recommender
+systems (Computer) and users (Human). Orthogonal to the perspective of
+recommender systems, we attempt to utilize LLMs from the perspective of
+users and propose a more human-central recommendation framework named RAH,
+which consists of Recommender system, Assistant and Human. The assistant is
+an LLM-based personal proxy that acts on behalf of the human to achieve user
+satisfaction. The assistant plays a non-invasive role, and the RAH framework
+can adapt to different recommender systems and user groups. Subsequently, we
+implement and evaluate the RAH framework for learning user personalities and
+proxying human feedback. The experiments show that (1) using
+learn-action-critic and reflection mechanisms leads to more aligned
+personalities, and (2) our assistant can effectively proxy human feedback
+and help adjust recommender systems. Finally, we discuss further strategies
+in the RAH framework to address human-central concerns, including user
+control, privacy and fairness.
+
+
+
+
+
+ + ☆ Black-box Adversarial Attacks against Dense Retrieval Models: A + Multi-view Contrastive Learning Method CIKM2023 + + +
+ Neural ranking models (NRMs) and dense retrieval (DR) models have given rise +to substantial improvements in overall retrieval performance. In addition to +their effectiveness, and motivated by the proven lack of robustness of deep +learning-based approaches in other areas, there is growing interest in the +robustness of deep learning-based approaches to the core retrieval problem. +Adversarial attack methods that have so far been developed mainly focus on +attacking NRMs, with very little attention being paid to the robustness of DR +models. In this paper, we introduce the adversarial retrieval attack (AREA) +task. The AREA task is meant to trick DR models into retrieving a target +document that is outside the initial set of candidate documents retrieved by +the DR model in response to a query. We consider the decision-based black-box +adversarial setting, which is realistic in real-world search engines. To +address the AREA task, we first employ existing adversarial attack methods +designed for NRMs. We find that the promising results that have previously been +reported on attacking NRMs, do not generalize to DR models: these methods +underperform a simple term spamming method. We attribute the observed lack of +generalizability to the interaction-focused architecture of NRMs, which +emphasizes fine-grained relevance matching. DR models follow a different +representation-focused architecture that prioritizes coarse-grained +representations. We propose to formalize attacks on DR models as a contrastive +learning problem in a multi-view representation space. The core idea is to +encourage the consistency between each view representation of the target +document and its corresponding viewer via view-wise supervision signals. +Experimental results demonstrate that the proposed method can significantly +outperform existing attack strategies in misleading the DR model with small +indiscernible text perturbations. + +
+
+ comment: Accept by CIKM2023, 10 pages +
+
+
+
+
+ + ♻ ☆ CompMix: A Benchmark for Heterogeneous Question Answering + + +
+ Fact-centric question answering (QA) often requires access to multiple, +heterogeneous, information sources. By jointly considering several sources like +a knowledge base (KB), a text collection, and tables from the web, QA systems +can enhance their answer coverage and confidence. However, existing QA +benchmarks are mostly constructed with a single source of knowledge in mind. +This limits capabilities of these benchmarks to fairly evaluate QA systems that +can tap into more than one information repository. To bridge this gap, we +release CompMix, a crowdsourced QA benchmark which naturally demands the +integration of a mixture of input sources. CompMix has a total of 9,410 +questions, and features several complex intents like joins and temporal +conditions. Evaluation of a range of QA systems on CompMix highlights the need +for further research on leveraging information from heterogeneous sources. + +
+
+
+
+
+ + ♻ ☆ Zero-Shot Composed Image Retrieval with Textual Inversion ICCV2023 + + +
+ Composed Image Retrieval (CIR) aims to retrieve a target image based on a +query composed of a reference image and a relative caption that describes the +difference between the two images. The high effort and cost required for +labeling datasets for CIR hamper the widespread usage of existing methods, as +they rely on supervised learning. In this work, we propose a new task, +Zero-Shot CIR (ZS-CIR), that aims to address CIR without requiring a labeled +training dataset. Our approach, named zero-Shot composEd imAge Retrieval with +textuaL invErsion (SEARLE), maps the visual features of the reference image +into a pseudo-word token in CLIP token embedding space and integrates it with +the relative caption. To support research on ZS-CIR, we introduce an +open-domain benchmarking dataset named Composed Image Retrieval on Common +Objects in context (CIRCO), which is the first dataset for CIR containing +multiple ground truths for each query. The experiments show that SEARLE +exhibits better performance than the baselines on the two main datasets for CIR +tasks, FashionIQ and CIRR, and on the proposed CIRCO. The dataset, the code and +the model are publicly available at https://github.com/miccunifi/SEARLE. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ Scalable Neural Contextual Bandit for Recommender Systems + + +
+ High-quality recommender systems ought to deliver both innovative and +relevant content through effective and exploratory interactions with users. +Yet, supervised learning-based neural networks, which form the backbone of many +existing recommender systems, only leverage recognized user interests, falling +short when it comes to efficiently uncovering unknown user preferences. While +there has been some progress with neural contextual bandit algorithms towards +enabling online exploration through neural networks, their onerous +computational demands hinder widespread adoption in real-world recommender +systems. In this work, we propose a scalable sample-efficient neural contextual +bandit algorithm for recommender systems. To do this, we design an epistemic +neural network architecture, Epistemic Neural Recommendation (ENR), that +enables Thompson sampling at a large scale. In two distinct large-scale +experiments with real-world tasks, ENR significantly boosts click-through rates +and user ratings by at least 9% and 6% respectively compared to +state-of-the-art neural contextual bandit algorithms. Furthermore, it achieves +equivalent performance with at least 29% fewer user interactions compared to +the best-performing baseline algorithm. Remarkably, while accomplishing these +improvements, ENR demands orders of magnitude fewer computational resources +than neural contextual bandit baseline algorithms. + +
+
+
+
+
+ + ♻ ☆ FARA: Future-aware Ranking Algorithm for Fairness Optimization CIKM2023 + + +
+ Ranking systems are the key components of modern Information Retrieval (IR) +applications, such as search engines and recommender systems. Besides the +ranking relevance to users, the exposure fairness to item providers has also +been considered an important factor in ranking optimization. Many fair ranking +algorithms have been proposed to jointly optimize both ranking relevance and +fairness. However, we find that most existing fair ranking methods adopt greedy +algorithms that only optimize rankings for the next immediate session or +request. As shown in this paper, such a myopic paradigm could limit the upper +bound of ranking optimization and lead to suboptimal performance in the long +term. + To this end, we propose \textbf{FARA}, a novel \textbf{F}uture-\textbf{A}ware +\textbf{R}anking \textbf{A}lgorithm for ranking relevance and fairness +optimization. Instead of greedily optimizing rankings for the next immediate +session, FARA plans ahead by jointly optimizing multiple ranklists together and +saving them for future sessions. Specifically, FARA first uses the Taylor +expansion to investigate how future ranklists will influence the overall +fairness of the system. Then, based on the analysis of the Taylor expansion, +FARA adopts a two-phase optimization algorithm where we first solve an optimal +future exposure planning problem and then construct the optimal ranklists +according to the optimal future exposure planning. Theoretically, we show that +FARA is optimal for ranking relevance and fairness joint optimization. +Empirically, our extensive experiments on three semi-synthesized datasets show +that FARA is efficient, effective, and can deliver significantly better ranking +performance compared to state-of-the-art fair ranking methods. We make our +implementation public at +\href{https://github.com/Taosheng-ty/QP_fairness/}{https://github.com/Taosheng-ty/QP\_fairness/}. + +
+
+ comment: 11 pages, four figures, four tables. CIKM2023 +
+
+
+
+
+ + ♻ ☆ Natural Language is All a Graph Needs + + +
+ The emergence of large-scale pre-trained language models, such as ChatGPT, +has revolutionized various research fields in artificial intelligence. +Transformers-based large language models (LLMs) have gradually replaced CNNs +and RNNs to unify fields of computer vision and natural language processing. +Compared with the data that exists relatively independently such as images, +videos or texts, graph is a type of data that contains rich structural and +relational information. Meanwhile, natural language, as one of the most +expressive mediums, excels in describing complex structures. However, existing +work on incorporating graph learning problems into the generative language +modeling framework remains very limited. As the importance of large language +models continues to grow, it becomes essential to explore whether LLMs can also +replace GNNs as the foundation model for graphs. In this paper, we propose +InstructGLM (Instruction-finetuned Graph Language Model), systematically design +highly scalable prompts based on natural language instructions, and use natural +language to describe the geometric structure and node features of the graph for +instruction tuning an LLM to perform learning and inference on graphs in a +generative manner. Our method exceeds all competitive GNN baselines on +ogbn-arxiv, Cora and PubMed datasets, which demonstrates the effectiveness of +our method and sheds light on generative large language models as the +foundation model for graph machine learning. + +
+
+ comment: 21 pages, 2 figures, 5 tables +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Dronevision: An Experimental 3D Testbed for Flying Light Specks + + +
+ Today's robotic laboratories for drones are housed in a large room. At times, +they are the size of a warehouse. These spaces are typically equipped with +permanent devices to localize the drones, e.g., Vicon Infrared cameras. +Significant time is invested to fine-tune the localization apparatus to compute +and control the position of the drones. One may use these laboratories to +develop a 3D multimedia system with miniature sized drones configured with +light sources. As an alternative, this brave new idea paper envisions shrinking +these room-sized laboratories to the size of a cube or cuboid that sits on a +desk and costs less than 10K dollars. The resulting Dronevision (DV) will be +the size of a 1990s Television. In addition to light sources, its Flying Light +Specks (FLSs) will be network-enabled drones with storage and processing +capability to implement decentralized algorithms. The DV will include a +localization technique to expedite development of 3D displays. It will act as a +haptic interface for a user to interact with and manipulate the 3D virtual +illuminations. It will empower an experimenter to design, implement, test, +debug, and maintain software and hardware that realize novel algorithms in the +comfort of their office without having to reserve a laboratory. In addition to +enhancing productivity, it will improve safety of the experimenter by +minimizing the likelihood of accidents. This paper introduces the concept of a +DV, the research agenda one may pursue using this device, and our plans to +realize one. + +
+
+
+
+
+ + ☆ An Evaluation of Three Distance Measurement Technologies for Flying + Light Specks + + +
+ This study evaluates the accuracy of three different types of time-of-flight +sensors to measure distance. We envision the possible use of these sensors to +localize swarms of flying light specks (FLSs) to illuminate objects and avatars +of a metaverse. An FLS is a miniature-sized drone configured with RGB light +sources. It is unable to illuminate a point cloud by itself. However, the +inter-FLS relationship effect of an organizational framework will compensate +for the simplicity of each individual FLS, enabling a swarm of cooperating FLSs +to illuminate complex shapes and render haptic interactions. Distance between +FLSs is an important criterion of the inter-FLS relationship. We consider +sensors that use radio frequency (UWB), infrared light (IR), and sound +(ultrasonic) to quantify this metric. Obtained results show only one sensor is +able to measure distances as small as 1 cm with a high accuracy. A sensor may +require a calibration process that impacts its accuracy in measuring distance. + +
+
+ comment: In International Conference on Intelligent Metaverse Technologies and + Applications (iMETA2023), Tartu, Estonia, September 18-20, 2023 +
+
+
+
+
+ + ☆ ILCAS: Imitation Learning-Based Configuration-Adaptive Streaming for + Live Video Analytics with Cross-Camera Collaboration + + +
+ The high-accuracy and resource-intensive deep neural networks (DNNs) have +been widely adopted by live video analytics (VA), where camera videos are +streamed over the network to resource-rich edge/cloud servers for DNN +inference. Common video encoding configurations (e.g., resolution and frame +rate) have been identified with significant impacts on striking the balance +between bandwidth consumption and inference accuracy and therefore their +adaption scheme has been a focus of optimization. However, previous +profiling-based solutions suffer from high profiling cost, while existing deep +reinforcement learning (DRL) based solutions may achieve poor performance due +to the usage of fixed reward function for training the agent, which fails to +craft the application goals in various scenarios. In this paper, we propose +ILCAS, the first imitation learning (IL) based configuration-adaptive VA +streaming system. Unlike DRL-based solutions, ILCAS trains the agent with +demonstrations collected from the expert which is designed as an offline +optimal policy that solves the configuration adaption problem through dynamic +programming. To tackle the challenge of video content dynamics, ILCAS derives +motion feature maps based on motion vectors which allow ILCAS to visually +``perceive'' video content changes. Moreover, ILCAS incorporates a cross-camera +collaboration scheme to exploit the spatio-temporal correlations of cameras for +more proper configuration selection. Extensive experiments confirm the +superiority of ILCAS compared with state-of-the-art solutions, with 2-20.9% +improvement of mean accuracy and 19.9-85.3% reduction of chunk upload lag. + +
+
+ comment: This work has been submitted to the IEEE Transactions on Mobile + Computing for possible publication. Copyright may be transferred without + notice, after which this version may no longer be accessible +
+
+
+
+
+ + ☆ Bamboo: Boosting Training Efficiency for Real-Time Video Streaming via + Online Grouped Federated Transfer Learning + + +
+ Most of the learning-based algorithms for bitrate adaptation are limited to +offline learning, which inevitably suffers from the simulation-to-reality gap. +Online learning can better adapt to dynamic real-time communication scenes but +still face the challenge of lengthy training convergence time. In this paper, +we propose a novel online grouped federated transfer learning framework named +Bamboo to accelerate training efficiency. The preliminary experiments validate +that our method remarkably improves online training efficiency by up to 302% +compared to other reinforcement learning algorithms in various network +conditions while ensuring the quality of experience (QoE) of real-time video +communication. + +
+
+ comment: This paper will be presented at Apnet 2023 +
+
+
+
+
+ + ☆ Noisy-Correspondence Learning for Text-to-Image Person Re-identification + + +
+ Text-to-image person re-identification (TIReID) is a compelling topic in the +cross-modal community, which aims to retrieve the target person based on a +textual query. Although numerous TIReID methods have been proposed and achieved +promising performance, they implicitly assume the training image-text pairs are +correctly aligned, which is not always the case in real-world scenarios. In +practice, the image-text pairs inevitably exist under-correlated or even +false-correlated, a.k.a noisy correspondence (NC), due to the low quality of +the images and annotation errors. To address this problem, we propose a novel +Robust Dual Embedding method (RDE) that can learn robust visual-semantic +associations even with NC. Specifically, RDE consists of two main components: +1) A Confident Consensus Division (CCD) module that leverages the dual-grained +decisions of dual embedding modules to obtain a consensus set of clean training +data, which enables the model to learn correct and reliable visual-semantic +associations. 2) A Triplet-Alignment Loss (TAL) relaxes the conventional +triplet-ranking loss with hardest negatives, which tends to rapidly overfit NC, +to a log-exponential upper bound over all negatives, thus preventing the model +from overemphasizing false image-text pairs. We conduct extensive experiments +on three public benchmarks, namely CUHK-PEDES, ICFG-PEDES, and RSTPReID, to +evaluate the performance and robustness of our RDE. Our method achieves +state-of-the-art results both with and without synthetic noisy correspondences +on all three datasets. + +
+
+
+
+
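+ <p>
+ One possible reading of the loss relaxation described above, sketched in
+ PyTorch: the hard max over negatives (which tends to overfit noisy pairs) is
+ replaced by a smooth log-exponential upper bound over all negatives. This is
+ an interpretation for illustration, not the authors' released code.
+ </p>
+ <pre><code>
+import torch
+import torch.nn.functional as F
+
+def hardest_negative_triplet(pos_sim, neg_sims, margin=0.2):
+    # pos_sim: (batch,), neg_sims: (batch, n_neg)
+    hardest = neg_sims.max(dim=1).values
+    return F.relu(margin - pos_sim + hardest).mean()
+
+def log_exponential_bound(pos_sim, neg_sims, margin=0.2):
+    # softplus(logsumexp(.)) upper-bounds relu(max(.)) and spreads the gradient
+    # over every negative instead of only the hardest one.
+    lse = torch.logsumexp(neg_sims, dim=1)
+    return F.softplus(margin - pos_sim + lse).mean()
+</code></pre>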
+ + ♻ ☆ H4VDM: H.264 Video Device Matching + + +
+ Methods that can determine if two given video sequences are captured by the +same device (e.g., mobile telephone or digital camera) can be used in many +forensics tasks. In this paper we refer to this as "video device matching". In +open-set video forensics scenarios it is easier to determine if two video +sequences were captured with the same device than identifying the specific +device. In this paper, we propose a technique for open-set video device +matching. Given two H.264 compressed video sequences, our method can determine +if they are captured by the same device, even if our method has never +encountered the device in training. We denote our proposed technique as H.264 +Video Device Matching (H4VDM). H4VDM uses H.264 compression information +extracted from video sequences to make decisions. It is more robust against +artifacts that alter camera sensor fingerprints, and it can be used to analyze +relatively small fragments of the H.264 sequence. We trained and tested our +method on a publicly available video forensics dataset consisting of 35 +devices, where our proposed method demonstrated good performance. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 41 + +
+
+
+ + ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting
+capabilities in large language models (LLMs) beyond those offered by
+paradigms such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea
+and primary advantage of GoT is the ability to model the information
+generated by an LLM as an arbitrary graph, where units of information ("LLM
+thoughts") are vertices, and edges correspond to dependencies between these
+vertices. This approach enables combining arbitrary LLM thoughts into
+synergistic outcomes, distilling the essence of whole networks of thoughts,
+or enhancing thoughts using feedback loops. We illustrate that GoT offers
+advantages over the state of the art on different tasks, for example
+increasing the quality of sorting by 62% over ToT, while simultaneously
+reducing costs by >31%. We ensure that GoT is extensible with new thought
+transformations and thus can be used to spearhead new prompting schemes.
+This work brings LLM reasoning closer to human thinking or brain mechanisms
+such as recurrence, both of which form complex networks.
+
+
+
+
+
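+ <p>
+ A minimal sketch of the thought-graph abstraction described above: thoughts
+ are vertices, dependencies are edges, and a new thought may aggregate
+ several parents, which a chain or tree of thoughts cannot express. The
+ <code>llm</code> callable and prompt wording are placeholders, not the GoT
+ framework's API.
+ </p>
+ <pre><code>
+class ThoughtGraph:
+    def __init__(self):
+        self.thoughts = {}   # thought id to text
+        self.parents = {}    # thought id to list of parent ids
+
+    def add(self, tid, text, parents=()):
+        self.thoughts[tid] = text
+        self.parents[tid] = list(parents)
+        return tid
+
+    def aggregate(self, tid, parent_ids, llm):
+        # Merge several existing thoughts into one new vertex.
+        merged_input = "\n".join(self.thoughts[p] for p in parent_ids)
+        text = llm("Combine these partial results into one:\n" + merged_input)
+        return self.add(tid, text, parent_ids)
+</code></pre>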
+ + ☆ OCR Language Models with Custom Vocabularies + + +
+ Language models are useful adjuncts to optical models for producing accurate +optical character recognition (OCR) results. One factor which limits the power +of language models in this context is the existence of many specialized domains +with language statistics very different from those implied by a general +language model - think of checks, medical prescriptions, and many other +specialized document classes. This paper introduces an algorithm for +efficiently generating and attaching a domain specific word based language +model at run time to a general language model in an OCR system. In order to +best use this model the paper also introduces a modified CTC beam search +decoder which effectively allows hypotheses to remain in contention based on +possible future completion of vocabulary words. The result is a substantial +reduction in word error rate in recognizing material from specialized domains. + +
+
+
+
+
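+ <p>
+ A toy illustration of the "possible future completion" idea behind the
+ modified beam search: a hypothesis is kept in contention as long as its
+ trailing characters form a prefix of some domain-vocabulary word. Only the
+ prefix test is shown here, not the CTC decoder or its scoring.
+ </p>
+ <pre><code>
+class PrefixTrie:
+    def __init__(self, words):
+        self.root = {}
+        for w in words:
+            node = self.root
+            for ch in w:
+                node = node.setdefault(ch, {})
+            node["$"] = True  # marks the end of a complete word
+
+    def is_prefix(self, s):
+        node = self.root
+        for ch in s:
+            if ch not in node:
+                return False
+            node = node[ch]
+        return True
+
+# trie = PrefixTrie(["amoxicillin", "ibuprofen"])
+# trie.is_prefix("amoxi") is True, so the hypothesis stays in contention
+</code></pre>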
+ + ☆ Red-Teaming Large Language Models using Chain of Utterances for + Safety-Alignment + + +
+ Large language models (LLMs) have taken the world by storm with their
+massive multi-tasking capabilities simply by optimizing over a next-word
+prediction objective. With the emergence of their properties and encoded
+knowledge, the risk of LLMs producing harmful outputs increases, making them
+unfit for scalable deployment for the public. In this work, we propose a new
+safety evaluation benchmark RED-EVAL that carries out red-teaming. We show
+that even widely deployed models are susceptible to Chain of Utterances-based
+(CoU) prompting, jailbreaking closed-source LLM-based systems such as GPT-4
+and ChatGPT to unethically respond to more than 65% and 73% of harmful
+queries, respectively. We also demonstrate the consistency of RED-EVAL across
+8 open-source LLMs in generating harmful responses in more than 86% of the
+red-teaming attempts. Next, we propose RED-INSTRUCT, an approach for the
+safety alignment of LLMs. It constitutes two phases: 1) HARMFULQA data
+collection: leveraging CoU prompting, we collect a dataset that consists of
+1.9K harmful questions covering a wide range of topics, 9.5K safe and 7.3K
+harmful conversations from ChatGPT; 2) SAFE-ALIGN: we demonstrate how the
+conversational dataset can be used for the safety alignment of LLMs by
+minimizing the negative log-likelihood over helpful responses and penalizing
+harmful responses via gradient ascent over the sample loss. Our model
+STARLING, a fine-tuned Vicuna-7B, is observed to be more safely aligned when
+evaluated on RED-EVAL and HHH benchmarks while preserving the utility of the
+baseline models (TruthfulQA, MMLU, and BBH).
+
+
+
+
+
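+ <p>
+ One reading of the SAFE-ALIGN objective, sketched in PyTorch: the negative
+ log-likelihood of helpful responses is minimised while the loss on harmful
+ responses is ascended, expressed here as subtracting a scaled harmful-response
+ term. The weighting and clamping are illustrative choices, not the paper's
+ exact formulation.
+ </p>
+ <pre><code>
+import torch
+
+def safe_align_loss(nll_helpful, nll_harmful, alpha=0.1, cap=5.0):
+    # nll_*: per-sample negative log-likelihoods, shape (batch,)
+    penalty = torch.clamp(nll_harmful, max=cap)  # keep the ascent term bounded
+    return nll_helpful.mean() - alpha * penalty.mean()
+</code></pre>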
+ + ☆ Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop + Visual Reasoning + + +
+ A promising trend has emerged of using large language models (LLMs) to
+generate code-like plans for complex inference tasks such as visual
+reasoning. This paradigm, known as LLM-based planning, provides flexibility
+in problem solving and endows better interpretability. However, current
+research is mostly limited to basic scenarios of simple questions that can
+be answered straightforwardly in a few inference steps. Planning for the
+more challenging multi-hop visual reasoning tasks remains under-explored.
+Specifically, under multi-hop reasoning situations, the trade-off between
+accuracy and the complexity of plan-searching becomes prominent. The
+prevailing algorithms either address the efficiency issue by employing fast
+one-stop generation or adopt a complex iterative generation method to
+improve accuracy. Both fail to balance the need for efficiency and
+performance. Drawing inspiration from the dual system of cognition in the
+human brain, the fast and the slow thinking processes, we propose a
+hierarchical plan-searching algorithm that integrates one-stop reasoning
+(fast) and Tree-of-thought (slow). Our approach achieves strong performance
+while significantly reducing the number of inference steps. Moreover, we
+repurpose the PTR and the CLEVER datasets, developing a systematic framework
+for evaluating the performance and efficiency of LLM-based plan-search
+algorithms on reasoning tasks at different levels of difficulty. Extensive
+experiments demonstrate the superiority of our proposed algorithm in terms
+of performance and efficiency. The dataset and code will be released soon.
+
+
+ comment: 16 pages,1 figures, under review +
+
+
+
+
+ + ☆ ChatHaruhi: Reviving Anime Character in Reality via Large Language Model + + +
+ Role-playing chatbots built on large language models have drawn interest, but +better techniques are needed to enable mimicking specific fictional characters. +We propose an algorithm that controls language models via an improved prompt +and memories of the character extracted from scripts. We construct ChatHaruhi, +a dataset covering 32 Chinese / English TV / anime characters with over 54k +simulated dialogues. Both automatic and human evaluations show our approach +improves role-playing ability over baselines. Code and data are available at +https://github.com/LC1332/Chat-Haruhi-Suzumiya . + +
+
+ comment: v1 - First version of techique report +
+
+
+
+
+ + ☆ WizardMath: Empowering Mathematical Reasoning for Large Language Models + via Reinforced Evol-Instruct + + +
+ Large language models (LLMs), such as GPT-4, have shown remarkable +performance in natural language processing (NLP) tasks, including challenging +mathematical reasoning. However, most existing open-source models are only +pre-trained on large-scale internet data and without math-related optimization. +In this paper, we present WizardMath, which enhances the mathematical reasoning +abilities of Llama-2, by applying our proposed Reinforcement Learning from +Evol-Instruct Feedback (RLEIF) method to the domain of math. Through extensive +experiments on two mathematical reasoning benchmarks, namely GSM8k and MATH, we +reveal the extraordinary capabilities of our model. WizardMath surpasses all +other open-source LLMs by a substantial margin. Furthermore, our model even +outperforms ChatGPT-3.5, Claude Instant-1, PaLM-2 and Minerva on GSM8k, +simultaneously surpasses Text-davinci-002, PaLM-1 and GPT-3 on MATH. More +details and model weights are public at https://github.com/nlpxucan/WizardLM +and https://huggingface.co/WizardLM. + +
+
+ comment: LLM, Mathematical Reasoning +
+
+
+
+
+ + ☆ PUMGPT: A Large Vision-Language Model for Product Understanding + + +
+ Recent developments in multi-modal large language models have demonstrated
+their strong ability to solve vision-language tasks. In this paper, we focus
+on the product understanding task, which plays an essential role in enhancing
+the online shopping experience. The product understanding task includes a
+variety of sub-tasks, which require models to respond to diverse queries based
+on multi-modal product information. Traditional methods design distinct model
+architectures for each sub-task. In contrast, we present PUMGPT, a large
+vision-language model that aims to unify all product understanding tasks under
+a single model structure. To bridge the gap between vision and text
+representations, we propose Layer-wise Adapters (LA), an approach that
+provides enhanced alignment with fewer visual tokens and enables
+parameter-efficient fine-tuning. Moreover, this inherent parameter-efficient
+fine-tuning ability allows PUMGPT to be readily adapted to new product
+understanding tasks and emerging products. We design instruction templates to
+generate diverse product instruction datasets. Simultaneously, we utilize
+open-domain datasets during training to improve the performance of PUMGPT and
+its generalization ability. Through extensive evaluations, PUMGPT demonstrates
+superior performance across multiple product understanding tasks, including
+product captioning, category question-answering, attribute extraction,
+attribute question-answering, and even free-form question-answering about
+products.
+
+
+
+
+
+ + ☆ Semantic relatedness in DBpedia: A comparative and experimental + assessment + + +
+ Evaluating the semantic relatedness of Web resources is still an open
+challenge. This paper focuses on knowledge-based methods, which represent an
+alternative to corpus-based approaches and generally rely on the availability
+of knowledge graphs. In particular, we have selected 10 methods from the
+existing literature, which have been organized into adjacent resources, triple
+patterns, and triple weights-based methods. They have been implemented and
+evaluated using DBpedia as the reference RDF knowledge graph. Since DBpedia is
+continuously evolving, the experimental results reported for these methods in
+the literature are not comparable. For this reason, in this work, such methods
+have been re-assessed by running them all at once on the same DBpedia release
+and against 14 well-known golden datasets. On the basis of the correlation
+with human judgment obtained in the experimental results, weighting the RDF
+triples in combination with evaluating all the directed paths linking the
+compared resources is the best strategy for computing semantic relatedness in
+DBpedia.
+
+
+ comment: 37 pages, 16 figures +
+
+
+
+
+ + ☆ Predictive Authoring for Brazilian Portuguese Augmentative and + Alternative Communication + + +
+ Individuals with complex communication needs (CCN) often rely on augmentative
+and alternative communication (AAC) systems to have conversations and
+communicate their wants. Such systems allow message authoring by arranging
+pictograms in sequence. However, the difficulty of finding the desired item to
+complete a sentence can increase as the user's vocabulary grows. This paper
+proposes using BERTimbau, a Brazilian Portuguese version of BERT, for
+pictogram prediction in AAC systems. To fine-tune BERTimbau, we constructed an
+AAC corpus for Brazilian Portuguese to use as a training corpus. We tested
+different approaches to representing a pictogram for prediction: as a word
+(using pictogram captions), as a concept (using a dictionary definition), and
+as a set of synonyms (using related terms). We also evaluated the use of
+images for pictogram prediction. The results demonstrate that using embeddings
+computed from the pictograms' captions, synonyms, or definitions yields
+similar performance. Using synonyms leads to lower perplexity, but using
+captions leads to the highest accuracies. This paper provides insight into how
+to represent a pictogram for prediction using a BERT-like model and into the
+potential of using images for pictogram prediction.
+
+
+
+
+
+ + ☆ Artificial-Spiking Hierarchical Networks for Vision-Language + Representation Learning + + +
+ With the success of self-supervised learning, multimodal foundation models
+have rapidly been adapted to a wide range of downstream tasks driven by
+vision-and-language (VL) pre-training. State-of-the-art methods achieve
+impressive performance by pre-training on large-scale datasets. However,
+bridging the semantic gap between the two modalities remains a non-negligible
+challenge for VL tasks. In this work, we propose an efficient computation
+framework for multimodal alignment by introducing a novel visual semantic
+module to further improve the performance of VL tasks. Specifically, we
+propose a flexible model, namely Artificial-Spiking Hierarchical Networks
+(ASH-Nets), which combines the complementary advantages of Artificial neural
+networks (ANNs) and Spiking neural networks (SNNs) to enrich visual semantic
+representations. In particular, a visual concrete encoder and a semantic
+abstract encoder are constructed to learn continuous and discrete latent
+variables to enhance the flexibility of semantic encoding. Considering the
+spatio-temporal properties of SNN modeling, we introduce a contrastive
+learning method to optimize the inputs of similar samples. This can improve
+the computational efficiency of the hierarchical network, while the
+augmentation of hard samples is beneficial to the learning of visual
+representations. Furthermore, the Spiking to Text Uni-Alignment Learning
+(STUA) pre-training method is proposed, which relies only on text features to
+enhance the encoding ability of abstract semantics. We validate the
+performance on multiple well-established downstream VL tasks. Experiments show
+that the proposed ASH-Nets achieve competitive results.
+
+
+
+
+
+ + ☆ Exploring Sampling Techniques for Generating Melodies with a Transformer + Language Model + + +
+ Research in natural language processing has demonstrated that the quality of +generations from trained autoregressive language models is significantly +influenced by the used sampling strategy. In this study, we investigate the +impact of different sampling techniques on musical qualities such as diversity +and structure. To accomplish this, we train a high-capacity transformer model +on a vast collection of highly-structured Irish folk melodies and analyze the +musical qualities of the samples generated using distribution truncation +sampling techniques. Specifically, we use nucleus sampling, the recently +proposed "typical sampling", and conventional ancestral sampling. We evaluate +the effect of these sampling strategies in two scenarios: optimal circumstances +with a well-calibrated model and suboptimal circumstances where we +systematically degrade the model's performance. We assess the generated samples +using objective and subjective evaluations. We discover that probability +truncation techniques may restrict diversity and structural patterns in optimal +circumstances, but may also produce more musical samples in suboptimal +circumstances. + +
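Of the truncation strategies named above, nucleus (top-p) sampling is the most widely used. The snippet below is a generic NumPy implementation of that truncation step, included only to illustrate what "distribution truncation" means in practice; it is not the authors' code.

```python
import numpy as np

def nucleus_sample(logits, p=0.9, rng=None):
    """Sample a token id from the smallest set of tokens whose cumulative
    probability exceeds p (nucleus / top-p sampling)."""
    rng = rng or np.random.default_rng()
    probs = np.exp(logits - np.max(logits))
    probs /= probs.sum()
    order = np.argsort(probs)[::-1]              # most probable tokens first
    cumulative = np.cumsum(probs[order])
    nucleus_size = int(np.searchsorted(cumulative, p)) + 1
    nucleus = order[:nucleus_size]
    renormalized = probs[nucleus] / probs[nucleus].sum()
    return int(rng.choice(nucleus, p=renormalized))
```

Typical sampling and ancestral sampling differ only in how (or whether) the candidate set is truncated before the final draw.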
+
+ comment: 7 pages, 5 figures, 1 table, accepted at the 24th Int. Society for + Music Information Retrieval Conf., Milan, Italy, 2023 +
+
+
+
+
+ + ☆ Scope is all you need: Transforming LLMs for HPC Code + + +
+ With easier access to powerful compute resources, there is a growing trend in +the field of AI for software development to develop larger and larger language +models (LLMs) to address a variety of programming tasks. Even LLMs applied to +tasks from the high-performance computing (HPC) domain are huge in size (e.g., +billions of parameters) and demand expensive compute resources for training. We +found this design choice confusing - why do we need large LLMs trained on +natural languages and programming languages unrelated to HPC for HPC-specific +tasks? In this line of work, we aim to question design choices made by existing +LLMs by developing smaller LLMs for specific domains - we call them +domain-specific LLMs. Specifically, we start off with HPC as a domain and +propose a novel tokenizer named Tokompiler, designed specifically for +preprocessing code in HPC and compilation-centric tasks. Tokompiler leverages +knowledge of language primitives to generate language-oriented tokens, +providing a context-aware understanding of code structure while avoiding human +semantics attributed to code structures completely. We applied Tokompiler to +pre-train two state-of-the-art models, SPT-Code and Polycoder, for a Fortran +code corpus mined from GitHub. We evaluate the performance of these models +against the conventional LLMs. Results demonstrate that Tokompiler +significantly enhances code completion accuracy and semantic understanding +compared to traditional tokenizers in normalized-perplexity tests, down to ~1 +perplexity score. This research opens avenues for further advancements in +domain-specific LLMs, catering to the unique demands of HPC and compilation +tasks. + +
+
+
+
+
+ + ☆ A Methodology for Generative Spelling Correction via Natural Spelling + Errors Emulation across Multiple Domains and Languages EACL 2023 + + +
+ Modern large language models demonstrate impressive capabilities in text
+generation and generalization. However, they often struggle with text editing
+tasks, particularly when it comes to correcting spelling errors and
+mistypings. In this paper, we present a methodology for generative spelling
+correction (SC), which was tested on English and Russian and can potentially
+be extended to any language with minor changes. Our research mainly focuses on
+exploring natural spelling errors and mistypings in texts and studying how
+those errors can be emulated in correct sentences to effectively enrich the
+pre-training procedure of generative models. We investigate the impact of such
+emulations and the models' abilities across different text domains. In this
+work, we investigate two spelling corruption techniques: 1) the first mimics
+human behavior when making a mistake by leveraging error statistics from a
+particular dataset, and 2) the second adds the most common spelling errors,
+keyboard miss-clicks, and some heuristics to the texts. We conducted
+experiments employing various corruption strategies, model architectures and
+sizes at the pre-training and fine-tuning stages, and evaluated the models
+using single-domain and multi-domain test sets. As a practical outcome of our
+work, we introduce SAGE (Spell checking via Augmentation and Generative
+distribution Emulation), a library for automatic generative SC that includes a
+family of pre-trained generative models and built-in augmentation algorithms.
+
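The second corruption strategy (common errors, keyboard miss-clicks, simple heuristics) can be illustrated with a toy function like the one below. It is not the SAGE implementation, and the neighbor table is a deliberately small, made-up subset of a QWERTY layout.

```python
import random

# Toy illustration of heuristic spelling corruption (keyboard miss-clicks,
# dropped and doubled characters). Not the SAGE library's implementation.
QWERTY_NEIGHBORS = {
    "a": "qwsz", "s": "awedxz", "d": "serfcx", "e": "wsdr",
    "i": "ujko", "o": "iklp", "n": "bhjm", "t": "rfgy",
}

def corrupt(text, error_rate=0.05, seed=0):
    rng = random.Random(seed)
    output = []
    for ch in text:
        if ch.lower() in QWERTY_NEIGHBORS and rng.random() < error_rate:
            operation = rng.choice(["misclick", "drop", "double"])
            if operation == "misclick":
                output.append(rng.choice(QWERTY_NEIGHBORS[ch.lower()]))
            elif operation == "double":
                output.append(ch * 2)
            # "drop" appends nothing, deleting the character
        else:
            output.append(ch)
    return "".join(output)
```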
+
+ comment: to appear in EACL 2023 +
+
+
+
+
+ + ☆ Leveraging Large Language Models for DRL-Based Anti-Jamming Strategies + in Zero Touch Networks + + +
+ As the dawn of sixth-generation (6G) networking approaches, it promises +unprecedented advancements in communication and automation. Among the leading +innovations of 6G is the concept of Zero Touch Networks (ZTNs), aiming to +achieve fully automated, self-optimizing networks with minimal human +intervention. Despite the advantages ZTNs offer in terms of efficiency and +scalability, challenges surrounding transparency, adaptability, and human trust +remain prevalent. Concurrently, the advent of Large Language Models (LLMs) +presents an opportunity to elevate the ZTN framework by bridging the gap +between automated processes and human-centric interfaces. This paper explores +the integration of LLMs into ZTNs, highlighting their potential to enhance +network transparency and improve user interactions. Through a comprehensive +case study on deep reinforcement learning (DRL)-based anti-jamming technique, +we demonstrate how LLMs can distill intricate network operations into +intuitive, human-readable reports. Additionally, we address the technical and +ethical intricacies of melding LLMs with ZTNs, with an emphasis on data +privacy, transparency, and bias reduction. Looking ahead, we identify emerging +research avenues at the nexus of LLMs and ZTNs, advocating for sustained +innovation and interdisciplinary synergy in the domain of automated networks. + +
+
+
+
+
+ + ☆ TrOMR:Transformer-Based Polyphonic Optical Music Recognition + + +
+ Optical Music Recognition (OMR) is an important technology in music and has
+been researched for a long time. Previous approaches for OMR are usually based
+on CNNs for image understanding and RNNs for music symbol classification. In
+this paper, we propose a transformer-based approach with excellent global
+perceptual capability for end-to-end polyphonic OMR, called TrOMR. We also
+introduce a novel consistency loss function and a reasonable approach for data
+annotation to improve recognition accuracy for complex music scores. Extensive
+experiments demonstrate that TrOMR outperforms current OMR methods, especially
+in real-world scenarios. We also develop a TrOMR system and build a
+camera-scene dataset of full-page music scores captured in real-world
+conditions. The code and datasets will be made available for reproducibility.
+
+
+
+
+
+ + ☆ A tailored Handwritten-Text-Recognition System for Medieval Latin + + +
+ The Bavarian Academy of Sciences and Humanities aims to digitize its Medieval
+Latin Dictionary. This dictionary comprises record cards referring to lemmas
+in medieval Latin, a low-resource language. A crucial step of the digitization
+process is the Handwritten Text Recognition (HTR) of the handwritten lemmas
+found on these record cards. In our work, we introduce an end-to-end pipeline,
+tailored to the medieval Latin dictionary, for locating, extracting, and
+transcribing the lemmas. We employ two state-of-the-art (SOTA) image
+segmentation models to prepare the initial dataset for the HTR task.
+Furthermore, we experiment with different transformer-based models and conduct
+a set of experiments to explore the capabilities of different combinations of
+vision encoders with a GPT-2 decoder. Additionally, we apply extensive data
+augmentation, resulting in a highly competitive model. The best-performing
+setup achieved a Character Error Rate (CER) of 0.015, which is even superior
+to the commercial Google Cloud Vision model, and shows more stable
+performance.
+
+
+ comment: This paper has been accepted at the First Workshop on Ancient + Language Processing, co-located with RANLP 2023. This is the author's version + of the work. The definite version of record will be published in the + proceedings +
+
+
+
+
+ + ☆ Accelerated materials language processing enabled by GPT + + +
+ Materials language processing (MLP) is one of the key facilitators of
+materials science research, as it enables the extraction of structured
+information from the massive materials science literature. Prior works
+suggested high-performance MLP models for text classification, named entity
+recognition (NER), and extractive question answering (QA), which require
+complex model architectures, exhaustive fine-tuning, and a large number of
+human-labelled datasets. In this study, we develop generative pretrained
+transformer (GPT)-enabled pipelines where the complex architectures of prior
+MLP models are replaced with strategic designs of prompt engineering. First,
+we develop a GPT-enabled document classification method for screening relevant
+documents, achieving accuracy and reliability comparable to prior models with
+only a small dataset. Secondly, for the NER task, we design entity-centric
+prompts, and few-shot learning with them improves performance on most entity
+types in three open datasets. Finally, we develop a GPT-enabled extractive QA
+model, which provides improved performance and shows the possibility of
+automatically correcting annotations. While our findings confirm the potential
+of GPT-enabled MLP models as well as their value in terms of reliability and
+practicability, our scientific methods and systematic approach are applicable
+to any materials science domain, accelerating the information extraction of
+scientific literature.
+
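A hedged example of what an entity-centric, few-shot NER prompt of the kind described above might look like follows; the entity type, example sentences, and template wording here are invented for illustration and are not the paper's actual prompts.

```python
# Illustrative few-shot, entity-centric NER prompt. The entity type,
# example sentences, and wording are invented; they are not the prompts
# used in the study.
FEW_SHOT_EXAMPLES = [
    ("The cathode was made of LiFePO4.", "LiFePO4"),
    ("Thin films of TiO2 were deposited by sputtering.", "TiO2"),
]

def build_ner_prompt(sentence, entity_type="MATERIAL"):
    lines = [f"Extract every {entity_type} entity from the sentence."]
    for example_sentence, entities in FEW_SHOT_EXAMPLES:
        lines.append(f"Sentence: {example_sentence}")
        lines.append(f"{entity_type}: {entities}")
    lines.append(f"Sentence: {sentence}")
    lines.append(f"{entity_type}:")
    return "\n".join(lines)

# Example: build_ner_prompt("ZnO nanowires were grown on sapphire.")
```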
+
+
+
+
+ + ☆ Document Automation Architectures: Updated Survey in Light of Large + Language Models + + +
+ This paper surveys the current state of the art in document automation (DA). +The objective of DA is to reduce the manual effort during the generation of +documents by automatically creating and integrating input from different +sources and assembling documents conforming to defined templates. There have +been reviews of commercial solutions of DA, particularly in the legal domain, +but to date there has been no comprehensive review of the academic research on +DA architectures and technologies. The current survey of DA reviews the +academic literature and provides a clearer definition and characterization of +DA and its features, identifies state-of-the-art DA architectures and +technologies in academic research, and provides ideas that can lead to new +research opportunities within the DA field in light of recent advances in +generative AI and large language models. + +
+
+ comment: The current paper is the updated version of an earlier survey on
+  document automation [Ahmadi Achachlouei et al. 2021]. Updates in the current
+  paper are as follows: We shortened almost all sections to reduce the size of
+  the main paper (without references) from 28 pages to 10 pages, added a review
+  of selected papers on large language models, and removed certain sections and
+  most of the diagrams. arXiv admin note: substantial text overlap with
+  arXiv:2109.11603
+
+
+
+
+ + ☆ KESDT: knowledge enhanced shallow and deep Transformer for detecting + adverse drug reactions + + +
+ Adverse drug reaction (ADR) detection is an essential task in the medical
+field, as ADRs have a gravely detrimental impact on patients' health and the
+healthcare system. Because a large number of people share information on
+social media platforms, an increasing number of efforts focus on social media
+data to carry out effective ADR detection. Despite having achieved impressive
+performance, existing methods of ADR detection still suffer from three main
+challenges. Firstly, researchers have consistently ignored the interaction
+between domain keywords and other words in the sentence. Secondly, social
+media datasets suffer from a scarcity of annotated data. Thirdly, the issue of
+sample imbalance is commonly observed in social media datasets. To address
+these challenges, we propose the Knowledge Enhanced Shallow and Deep
+Transformer (KESDT) model for ADR detection. Specifically, to cope with the
+first issue, we incorporate the domain keywords into the Transformer model in
+a shallow fusion manner, which enables the model to fully exploit the
+interactive relationships between domain keywords and other words in the
+sentence. To overcome the scarcity of annotated data, we integrate synonym
+sets into the Transformer model in a deep fusion manner, which expands the
+sample size. To mitigate the impact of sample imbalance, we replace the
+standard cross-entropy loss function with the focal loss function for
+effective model training. We conduct extensive experiments on three public
+datasets, including TwiMed, Twitter, and CADEC. The proposed KESDT outperforms
+state-of-the-art baselines on F1 values, with relative improvements of 4.87%,
+47.83%, and 5.73% respectively, which demonstrates the effectiveness of our
+proposed KESDT.
+
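The focal loss mentioned above is a standard drop-in replacement for cross-entropy under class imbalance. A minimal PyTorch version is sketched below; this is the generic formulation, and the hyperparameters are common defaults rather than the paper's settings.

```python
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, gamma=2.0, alpha=0.25):
    """Focal loss (Lin et al., 2017): down-weights easy examples so training
    focuses on hard, minority-class samples. Generic formulation; gamma and
    alpha are common defaults, not necessarily the paper's settings."""
    log_probs = F.log_softmax(logits, dim=-1)
    ce = F.nll_loss(log_probs, targets, reduction="none")  # per-sample CE
    pt = torch.exp(-ce)                                     # prob. of true class
    return (alpha * (1.0 - pt) ** gamma * ce).mean()
```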
+
+
+
+
+ + ☆ Lip Reading for Low-resource Languages by Learning and Combining General + Speech Knowledge and Language-specific Knowledge ICCV 2023 + + +
+ This paper proposes a novel lip reading framework, especially for +low-resource languages, which has not been well addressed in the previous +literature. Since low-resource languages do not have enough video-text paired +data to train the model to have sufficient power to model lip movements and +language, it is regarded as challenging to develop lip reading models for +low-resource languages. In order to mitigate the challenge, we try to learn +general speech knowledge, the ability to model lip movements, from a +high-resource language through the prediction of speech units. It is known that +different languages partially share common phonemes, thus general speech +knowledge learned from one language can be extended to other languages. Then, +we try to learn language-specific knowledge, the ability to model language, by +proposing Language-specific Memory-augmented Decoder (LMDecoder). LMDecoder +saves language-specific audio features into memory banks and can be trained on +audio-text paired data which is more easily accessible than video-text paired +data. Therefore, with LMDecoder, we can transform the input speech units into +language-specific audio features and translate them into texts by utilizing the +learned rich language knowledge. Finally, by combining general speech knowledge +and language-specific knowledge, we can efficiently develop lip reading models +even for low-resource languages. Through extensive experiments using five +languages, English, Spanish, French, Italian, and Portuguese, the effectiveness +of the proposed method is evaluated. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Differentiable Retrieval Augmentation via Generative Language Modeling + for E-commerce Query Intent Classification CIKM2023 + + +
+ Retrieval augmentation, which enhances downstream models with a knowledge
+retriever and an external corpus instead of merely increasing the number of
+model parameters, has been successfully applied to many natural language
+processing (NLP) tasks, such as text classification and question answering.
+However, existing methods that train the retriever and the downstream model
+separately or asynchronously, mainly due to the non-differentiability between
+the two parts, usually lead to degraded performance compared to end-to-end
+joint training.
+
+
+ comment: 5 pages, 2 figures; accepted by CIKM2023 +
+
+
+
+
+ + ☆ Conversational Ontology Alignment with ChatGPT + + +
+ This study evaluates the applicability and efficiency of ChatGPT for ontology +alignment using a naive approach. ChatGPT's output is compared to the results +of the Ontology Alignment Evaluation Initiative 2022 campaign using conference +track ontologies. This comparison is intended to provide insights into the +capabilities of a conversational large language model when used in a naive way +for ontology matching, and to investigate the potential advantages and +disadvantages of this approach. + +
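A "naive" use of a conversational model for ontology matching, as described above, can be as simple as asking about one candidate class pair at a time. The prompt below is only a guess at what such a query might look like, not the study's actual protocol.

```python
# Illustrative pairwise matching prompt in the spirit of a naive approach;
# the wording and answer format are assumptions, not the study's protocol.
def ontology_match_prompt(label_a, label_b):
    return (
        "Ontology matching task.\n"
        f"Class 1: {label_a}\n"
        f"Class 2: {label_b}\n"
        "Do these two classes describe the same concept? Answer yes or no."
    )

# Example: ontology_match_prompt("ConferenceMember", "Conference_Participant")
```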
+
+
+
+
+ + ☆ How susceptible are LLMs to Logical Fallacies? + + +
+ This paper investigates the rational thinking capability of Large Language +Models (LLMs) in multi-round argumentative debates by exploring the impact of +fallacious arguments on their logical reasoning performance. More specifically, +we present Logic Competence Measurement Benchmark (LOGICOM), a diagnostic +benchmark to assess the robustness of LLMs against logical fallacies. LOGICOM +involves two agents: a persuader and a debater engaging in a multi-round debate +on a controversial topic, where the persuader tries to convince the debater of +the correctness of its claim. First, LOGICOM assesses the potential of LLMs to +change their opinions through reasoning. Then, it evaluates the debater's +performance in logical reasoning by contrasting the scenario where the +persuader employs logical fallacies against one where logical reasoning is +used. We use this benchmark to evaluate the performance of GPT-3.5 and GPT-4 +using a dataset containing controversial topics, claims, and reasons supporting +them. Our findings indicate that both GPT-3.5 and GPT-4 can adjust their +opinion through reasoning. However, when presented with logical fallacies, +GPT-3.5 and GPT-4 are erroneously convinced 41% and 69% more often, +respectively, compared to when logical reasoning is used. Finally, we introduce +a new dataset containing over 5k pairs of logical vs. fallacious arguments. The +source code and dataset of this work are made publicly available. + +
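Schematically, the persuader/debater setup described above can be reduced to a simple multi-round loop. In the sketch below, `chat(system_prompt, history)` is a placeholder for an LLM chat call, and the agreement check is an invented stopping heuristic, not the benchmark's evaluation procedure.

```python
# Schematic persuader/debater loop in the spirit of LOGICOM.
# chat(system_prompt, history) is a placeholder for an LLM chat call.

def debate(claim, chat, rounds=4, allow_fallacies=False):
    persuader_system = (
        "Convince the debater that the claim is correct. "
        + ("You may use logical fallacies." if allow_fallacies
           else "Use only sound logical reasoning.")
    )
    debater_system = "Assess the arguments critically and state whether you agree."
    history = [f"Claim: {claim}"]
    for _ in range(rounds):
        argument = chat(persuader_system, history)
        history.append(f"Persuader: {argument}")
        reply = chat(debater_system, history)
        history.append(f"Debater: {reply}")
        if "i agree" in reply.lower():
            return True, history   # the debater changed its opinion
    return False, history
```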
+
+
+
+
+ + ☆ An Image is Worth a Thousand Toxic Words: A Metamorphic Testing + Framework for Content Moderation Software + + +
+ The exponential growth of social media platforms has brought about a +revolution in communication and content dissemination in human society. +Nevertheless, these platforms are being increasingly misused to spread toxic +content, including hate speech, malicious advertising, and pornography, leading +to severe negative consequences such as harm to teenagers' mental health. +Despite tremendous efforts in developing and deploying textual and image +content moderation methods, malicious users can evade moderation by embedding +texts into images, such as screenshots of the text, usually with some +interference. We find that modern content moderation software's performance +against such malicious inputs remains underexplored. In this work, we propose +OASIS, a metamorphic testing framework for content moderation software. OASIS +employs 21 transform rules summarized from our pilot study on 5,000 real-world +toxic contents collected from 4 popular social media applications, including +Twitter, Instagram, Sina Weibo, and Baidu Tieba. Given toxic textual contents, +OASIS can generate image test cases, which preserve the toxicity yet are likely +to bypass moderation. In the evaluation, we employ OASIS to test five +commercial textual content moderation software from famous companies (i.e., +Google Cloud, Microsoft Azure, Baidu Cloud, Alibaba Cloud and Tencent Cloud), +as well as a state-of-the-art moderation research model. The results show that +OASIS achieves up to 100% error finding rates. Moreover, through retraining the +models with the test cases generated by OASIS, the robustness of the moderation +model can be improved without performance degradation. + +
+
+ comment: Accepted by ASE 2023. arXiv admin note: substantial text overlap with + arXiv:2302.05706 +
+
+
+
+
+ + ☆ VL-PET: Vision-and-Language Parameter-Efficient Tuning via Granularity + Control ICCV 2023 + + +
+ As the model size of pre-trained language models (PLMs) grows rapidly, full +fine-tuning becomes prohibitively expensive for model training and storage. In +vision-and-language (VL), parameter-efficient tuning (PET) techniques are +proposed to integrate modular modifications (e.g., Adapter and LoRA) into +encoder-decoder PLMs. By tuning a small set of trainable parameters, these +techniques perform on par with full fine-tuning. However, excessive modular +modifications and neglecting the functionality gap between the encoders and +decoders can lead to performance degradation, while existing PET techniques +(e.g., VL-Adapter) overlook these critical issues. In this paper, we propose a +Vision-and-Language Parameter-Efficient Tuning (VL-PET) framework to impose +effective control over modular modifications via a novel granularity-controlled +mechanism. Considering different granularity-controlled matrices generated by +this mechanism, a variety of model-agnostic VL-PET modules can be instantiated +from our framework for better efficiency and effectiveness trade-offs. We +further propose lightweight PET module designs to enhance VL alignment and +modeling for the encoders and maintain text generation for the decoders. +Extensive experiments conducted on four image-text tasks and four video-text +tasks demonstrate the efficiency, effectiveness and transferability of our +VL-PET framework. In particular, our VL-PET-large with lightweight PET module +designs significantly outperforms VL-Adapter by 2.92% (3.41%) and LoRA by 3.37% +(7.03%) with BART-base (T5-base) on image-text tasks. Furthermore, we validate +the enhanced effect of employing our VL-PET designs on existing PET techniques, +enabling them to achieve significant performance improvements. Our code is +available at https://github.com/HenryHZY/VL-PET. + +
+
+ comment: ICCV 2023 (17 pages, 6 figures, 22 tables) +
+
+
+
+
+ + ☆ Towards Grounded Visual Spatial Reasoning in Multi-Modal Vision Language + Models + + +
+ With the advances in large-scale vision-and-language models (VLMs), it is of
+interest to assess their performance on various visual reasoning tasks such as
+counting, referring expressions, and general visual question answering. The
+focus of this work is to study the ability of these models to understand
+spatial relations. Previously, this has been tackled using image-text matching
+(Liu, Emerson, and Collier 2022) or the visual question answering task, both
+showing poor performance and a large gap compared to human performance. To
+better understand the gap, we present fine-grained compositional grounding of
+spatial relationships and propose a bottom-up approach for ranking spatial
+clauses and evaluating performance on the spatial relationship reasoning task.
+We propose to combine the evidence from grounding noun phrases corresponding
+to objects and their locations to compute the final rank of the spatial
+clause. We demonstrate the approach on representative vision-language models
+(Tan and Bansal 2019; Gupta et al. 2022; Kamath et al. 2021) and compare and
+highlight their abilities to reason about spatial relationships.
+
+
+
+
+
+ + ☆ YORC: Yoruba Reading Comprehension dataset + + +
+ In this paper, we create YORC: a new multi-choice Yoruba Reading +Comprehension dataset that is based on Yoruba high-school reading comprehension +examination. We provide baseline results by performing cross-lingual transfer +using existing English RACE dataset based on a pre-trained encoder-only model. +Additionally, we provide results by prompting large language models (LLMs) like +GPT-4. + +
+
+
+
+
+ + ☆ Taken by Surprise: Contrast effect for Similarity Scores + + +
+ Accurately evaluating the similarity of object vector embeddings is of
+critical importance for natural language processing, information retrieval,
+and classification tasks. Popular similarity scores (e.g., cosine similarity)
+are based on pairs of embedding vectors and disregard the distribution of the
+ensemble from which objects are drawn. Human perception of object similarity
+significantly depends on the context in which the objects appear. In this work
+we propose the \emph{surprise score}, an ensemble-normalized similarity metric
+that encapsulates the contrast effect of human perception and significantly
+improves classification performance on zero- and few-shot document
+classification tasks. This score quantifies the surprise of finding a given
+similarity between two elements relative to the pairwise ensemble
+similarities. We evaluate this metric on zero-/few-shot classification and
+clustering tasks and typically find 10-15\% better performance compared to raw
+cosine similarity. Our code is available at
+https://github.com/MeetElise/surprise-similarity.
+
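One plausible reading of the ensemble-normalized idea above is to rank a pair's cosine similarity against the empirical distribution of pairwise similarities within the ensemble. The sketch below follows that reading and may differ from the exact definition in the paper (see the linked repository for the real implementation).

```python
import numpy as np

def cosine(x, y):
    return float(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

def surprise_score(a, b, ensemble):
    """Rank the similarity of (a, b) against all pairwise similarities in the
    ensemble. This follows one plausible reading of the abstract; the paper's
    exact definition may differ."""
    pair_similarity = cosine(a, b)
    reference = np.array([
        cosine(x, y)
        for i, x in enumerate(ensemble)
        for y in ensemble[i + 1:]
    ])
    # Fraction of ensemble pairs that are less similar than (a, b):
    # values near 1 mean the observed similarity is "surprisingly" high.
    return float((reference < pair_similarity).mean())
```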
+
+ comment: 9 pages, 2 figures and 4 tables +
+
+
+
+
+ + ♻ ☆ Human-Like Intuitive Behavior and Reasoning Biases Emerged in Language + Models -- and Disappeared in GPT-4 + + +
+ Large language models (LLMs) are currently at the forefront of intertwining +AI systems with human communication and everyday life. Therefore, it is of +great importance to evaluate their emerging abilities. In this study, we show +that LLMs, most notably GPT-3, exhibit behavior that strikingly resembles +human-like intuition -- and the cognitive errors that come with it. However, +LLMs with higher cognitive capabilities, in particular ChatGPT and GPT-4, +learned to avoid succumbing to these errors and perform in a hyperrational +manner. For our experiments, we probe LLMs with the Cognitive Reflection Test +(CRT) as well as semantic illusions that were originally designed to +investigate intuitive decision-making in humans. Moreover, we probe how sturdy +the inclination for intuitive-like decision-making is. Our study demonstrates +that investigating LLMs with methods from psychology has the potential to +reveal otherwise unknown emergent traits. + +
+
+ comment: Overlap with arXiv:2212.05206 +
+
+
+
+
+ + ♻ ☆ A Part-of-Speech Tagger for Yiddish + + +
+ We describe the construction and evaluation of a part-of-speech tagger for
+Yiddish. This is the first step in a larger project of automatically assigning
+part-of-speech tags and syntactic structure to Yiddish text for purposes of
+linguistic research. We combine two resources for the current work - an
+80K-word subset of the Penn Parsed Corpus of Historical Yiddish (PPCHY) and
+650 million words of OCR'd Yiddish text from the Yiddish Book Center (YBC).
+Yiddish orthography in the YBC corpus has many spelling inconsistencies, and
+we present some evidence that even simple non-contextualized embeddings
+trained on YBC are able to capture the relationships among spelling variants
+without the need to first "standardize" the corpus. We also use YBC for
+continued pretraining of contextualized embeddings, which are then integrated
+into a tagger model trained and evaluated on the PPCHY. We evaluate the tagger
+performance on a 10-fold cross-validation split, showing that the use of the
+YBC text for the contextualized embeddings improves tagger performance. We
+conclude by discussing some next steps, including the need for additional
+annotated training and test data.
+
+
+
+
+
+ + ♻ ☆ A Comprehensive Overview of Large Language Models + + +
+ Large Language Models (LLMs) have recently demonstrated remarkable
+capabilities in natural language processing tasks and beyond. This success of
+LLMs has led to a large influx of research contributions in this direction.
+These works encompass diverse topics such as architectural innovations of the
+underlying neural networks, context length improvements, model alignment,
+training datasets, benchmarking, efficiency, and more. With the rapid
+development of techniques and regular breakthroughs in LLM research, it has
+become considerably challenging to perceive the bigger picture of the advances
+in this direction. Considering the rapidly emerging plethora of literature on
+LLMs, it is imperative that the research community is able to benefit from a
+concise yet comprehensive overview of the recent developments in this field.
+This article provides that overview to the research community. It not only
+focuses on a systematic treatment of the existing literature on a broad range
+of LLM-related concepts, but also pays special attention to providing
+comprehensive summaries with extensive details about the individual existing
+models, datasets and major insights. We also pay heed to aligning our overview
+with the emerging outlook of this research direction by accounting for the
+other recently materializing reviews of the broader research direction of
+LLMs. Our self-contained comprehensive overview of LLMs discusses relevant
+background concepts along with covering the advanced topics at the frontier of
+this research direction. This review article is intended to not only provide a
+systematic survey, but also a quick comprehensive reference for the
+researchers and practitioners to draw insights from extensive informative
+summaries of the existing works to advance the LLM research direction.
+
+
+
+
+
+ + ♻ ☆ WIKITIDE: A Wikipedia-Based Timestamped Definition Pairs Dataset + + +
+ A fundamental challenge in the current NLP context, dominated by language
+models, comes from the inflexibility of current architectures to 'learn' new
+information. While model-centric solutions like continual learning or
+parameter-efficient fine-tuning are available, the question still remains of
+how to reliably identify changes in language or in the world. In this paper,
+we propose WikiTiDe, a dataset derived from pairs of timestamped definitions
+extracted from Wikipedia. We argue that such a resource can be helpful for
+accelerating diachronic NLP, specifically, for training models able to scan
+knowledge resources for core updates concerning a concept, an event, or a
+named entity. Our proposed end-to-end method is fully automatic, and leverages
+a bootstrapping algorithm for gradually creating a high-quality dataset. Our
+results suggest that bootstrapping the seed version of WikiTiDe leads to
+better fine-tuned models. We also leverage fine-tuned models in a number of
+downstream tasks, showing promising results with respect to competitive
+baselines.
+
+
+ comment: Accepted by RANLP 2023 main conference +
+
+
+
+
+ + ♻ ☆ SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with + Large Language Models ACM MM 2023 + + +
+ Diffusion models, which have emerged to become popular text-to-image +generation models, can produce high-quality and content-rich images guided by +textual prompts. However, there are limitations to semantic understanding and +commonsense reasoning in existing models when the input prompts are concise +narrative, resulting in low-quality image generation. To improve the capacities +for narrative prompts, we propose a simple-yet-effective parameter-efficient +fine-tuning approach called the Semantic Understanding and Reasoning adapter +(SUR-adapter) for pre-trained diffusion models. To reach this goal, we first +collect and annotate a new dataset SURD which consists of more than 57,000 +semantically corrected multi-modal samples. Each sample contains a simple +narrative prompt, a complex keyword-based prompt, and a high-quality image. +Then, we align the semantic representation of narrative prompts to the complex +prompts and transfer knowledge of large language models (LLMs) to our +SUR-adapter via knowledge distillation so that it can acquire the powerful +semantic understanding and reasoning capabilities to build a high-quality +textual semantic representation for text-to-image generation. We conduct +experiments by integrating multiple LLMs and popular pre-trained diffusion +models to show the effectiveness of our approach in enabling diffusion models +to understand and reason concise natural language without image quality +degradation. Our approach can make text-to-image diffusion models easier to use +with better user experience, which demonstrates our approach has the potential +for further advancing the development of user-friendly text-to-image generation +models by bridging the semantic gap between simple narrative prompts and +complex keyword-based prompts. The code is released at +https://github.com/Qrange-group/SUR-adapter. + +
+
+ comment: accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Latent Jailbreak: A Test Suite for Evaluating Both Text Safety and + Output Robustness of Large Language Models + + +
+ Considerable research efforts have been devoted to ensuring that large
+language models (LLMs) align with human values and generate safe text.
+However, an excessive focus on sensitivity to certain topics can compromise
+the model's robustness in following instructions, thereby impacting its
+overall performance in completing tasks. Previous benchmarks for jailbreaking
+LLMs have primarily focused on evaluating the safety of the models without
+considering their robustness. In this paper, we propose a benchmark that
+assesses both the safety and robustness of LLMs, emphasizing the need for a
+balanced approach. To comprehensively study text safety and output robustness,
+we introduce a latent jailbreak prompt dataset, in which each instance embeds
+a malicious instruction. Specifically, we instruct the model to complete a
+regular task, such as translation, with the text to be translated containing
+malicious instructions. To further analyze safety and robustness, we design a
+hierarchical annotation framework. We present a systematic analysis of the
+safety and robustness of LLMs regarding the position of explicit normal
+instructions, word replacements (verbs in explicit normal instructions, target
+groups in malicious instructions, cue words for explicit normal instructions),
+and instruction replacements (different explicit normal instructions). Our
+results demonstrate that current LLMs not only prioritize certain instruction
+verbs but also exhibit varying jailbreak rates for different instruction verbs
+in explicit normal instructions. Code and data are available at
+https://github.com/qiuhuachuan/latent-jailbreak.
+
+
+ comment: Code and data are available at + https://github.com/qiuhuachuan/latent-jailbreak +
+
+
+
+
+ + ♻ ☆ Generative Multimodal Entity Linking + + +
+ Multimodal Entity Linking (MEL) is the task of mapping mentions with +multimodal contexts to the referent entities from a knowledge base (e.g. +Wikipedia). Existing MEL methods mainly focus on designing complex multimodal +interaction mechanisms and require fine-tuning all model parameters, which can +be prohibitively costly and difficult to scale in the era of Large Language +Models (LLMs). In this work, we propose GEMEL, a simple yet effective +Generative Multimodal Entity Linking framework based on LLMs, which directly +generates target entity names. We keep the vision and language model frozen and +only train a feature mapper to enable cross-modality interactions. To adapt +LLMs to the MEL task, we take advantage of the emergent in-context learning +capability of LLMs by retrieving multimodal instances as demonstrations. +Extensive experiments show that, with only ~0.3% of the model parameters +fine-tuned, GEMEL achieves state-of-the-art results on two well-established MEL +datasets (7.7% accuracy gains on WikiDiverse and 8.8% accuracy gains on +WikiMEL). The performance gain stems from mitigating the popularity bias of LLM +predictions and disambiguating less common entities effectively. Further +analysis verifies the generality and scalability of GEMEL. Our approach is +compatible with any off-the-shelf language model, paving the way towards an +efficient and general solution for utilizing LLMs in the MEL task. + +
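Since only a feature mapper is trained in the setup above, the trainable part can be as small as a two-layer projection from frozen visual features into the LLM's embedding space. The dimensions and prefix-token count below are assumptions for illustration, not GEMEL's actual configuration.

```python
import torch
import torch.nn as nn

class FeatureMapper(nn.Module):
    """Sketch of the only trainable component in a GEMEL-style pipeline:
    a small MLP mapping frozen visual features to a few soft tokens in the
    LLM embedding space. Dimensions and token count are assumptions."""

    def __init__(self, vision_dim=768, llm_dim=4096, num_prefix_tokens=4):
        super().__init__()
        self.num_prefix_tokens = num_prefix_tokens
        self.proj = nn.Sequential(
            nn.Linear(vision_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim * num_prefix_tokens),
        )

    def forward(self, visual_features):              # (batch, vision_dim)
        mapped = self.proj(visual_features)           # (batch, llm_dim * k)
        return mapped.view(visual_features.size(0), self.num_prefix_tokens, -1)
```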
+
+
+
+
+ + ♻ ☆ Gradient-Based Word Substitution for Obstinate Adversarial Examples + Generation in Language Models + + +
+ In this paper, we study the problem of generating obstinate (over-stability) +adversarial examples by word substitution in NLP, where input text is +meaningfully changed but the model's prediction does not, even though it +should. Previous word substitution approaches have predominantly focused on +manually designed antonym-based strategies for generating obstinate adversarial +examples, which hinders its application as these strategies can only find a +subset of obstinate adversarial examples and require human efforts. To address +this issue, in this paper, we introduce a novel word substitution method named +GradObstinate, a gradient-based approach that automatically generates obstinate +adversarial examples without any constraints on the search space or the need +for manual design principles. To empirically evaluate the efficacy of +GradObstinate, we conduct comprehensive experiments on five representative +models (Electra, ALBERT, Roberta, DistillBERT, and CLIP) finetuned on four NLP +benchmarks (SST-2, MRPC, SNLI, and SQuAD) and a language-grounding benchmark +(MSCOCO). Extensive experiments show that our proposed GradObstinate generates +more powerful obstinate adversarial examples, exhibiting a higher attack +success rate compared to antonym-based methods. Furthermore, to show the +transferability of obstinate word substitutions found by GradObstinate, we +replace the words in four representative NLP benchmarks with their obstinate +substitutions. Notably, obstinate substitutions exhibit a high success rate +when transferred to other models in black-box settings, including even GPT-3 +and ChatGPT. Examples of obstinate adversarial examples found by GradObstinate +are available at https://huggingface.co/spaces/anonauthors/SecretLanguage. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ I Can't Believe There's No Images! Learning Visual Tasks Using only + Language Supervision + + +
+ Many high-level skills that are required for computer vision tasks, such as +parsing questions, comparing and contrasting semantics, and writing +descriptions, are also required in other domains such as natural language +processing. In this paper, we ask whether it is possible to learn those skills +from text data and then transfer them to vision tasks without ever training on +visual training data. Key to our approach is exploiting the joint embedding +space of contrastively trained vision and language encoders. In practice, there +can be systematic differences between embedding spaces for different modalities +in contrastive models, and we analyze how these differences affect our approach +and study strategies to mitigate this concern. We produce models using only +text training data on four representative tasks: image captioning, visual +entailment, visual question answering and visual news captioning, and evaluate +them on standard benchmarks using images. We find these models perform close to +models trained on images, while surpassing prior work for captioning and visual +entailment in this text-only setting by over 9 points, and outperforming all +prior work on visual news by over 30 points. We also showcase a variety of +stylistic image captioning models that are trained using no image data and no +human-curated language data, but instead using readily-available text data from +books, the web, or language models. + +
+
+ comment: website (https://prior.allenai.org/projects/close), code + (https://github.com/allenai/close) +
+
+
+
+
+ + ♻ ☆ RLCD: Reinforcement Learning from Contrast Distillation for Language + Model Alignment + + +
+ We propose Reinforcement Learning from Contrast Distillation (RLCD), a method +for aligning language models to follow natural language principles without +using human feedback. RLCD trains a preference model using simulated preference +pairs that contain both a high-quality and low-quality example, generated using +contrasting positive and negative prompts. The preference model is then used to +improve a base unaligned language model via reinforcement learning. +Empirically, RLCD outperforms RLAIF (Bai et al., 2022b) and context +distillation (Huang et al., 2022) baselines across three diverse alignment +tasks--harmlessness, helpfulness, and story outline generation--and on both 7B +and 30B model scales for preference data simulation. + +
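The core data-generation step of the method above can be pictured as prompting the same base model under two contrasting system prompts and labeling the positively prompted output as preferred. The prompts and the `generate` callable below are placeholders, not the paper's exact wording.

```python
# Schematic RLCD-style preference-pair simulation. `generate` stands in for
# sampling from the base (unaligned) model; the two prompts are illustrative.

POSITIVE_PROMPT = "Respond as helpfully and harmlessly as possible."
NEGATIVE_PROMPT = "Respond unhelpfully and carelessly."

def make_preference_pair(query, generate):
    chosen = generate(f"{POSITIVE_PROMPT}\n\nUser: {query}\nAssistant:")
    rejected = generate(f"{NEGATIVE_PROMPT}\n\nUser: {query}\nAssistant:")
    # The resulting pair is then used to train a preference (reward) model.
    return {"prompt": query, "chosen": chosen, "rejected": rejected}
```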
+
+
+
+
+ + ♻ ☆ Toward Transparent AI: A Survey on Interpreting the Inner Structures of + Deep Neural Networks + + +
+ The last decade of machine learning has seen drastic increases in scale and +capabilities. Deep neural networks (DNNs) are increasingly being deployed in +the real world. However, they are difficult to analyze, raising concerns about +using them without a rigorous understanding of how they function. Effective +tools for interpreting them will be important for building more trustworthy AI +by helping to identify problems, fix bugs, and improve basic understanding. In +particular, "inner" interpretability techniques, which focus on explaining the +internal components of DNNs, are well-suited for developing a mechanistic +understanding, guiding manual modifications, and reverse engineering solutions. + Much recent work has focused on DNN interpretability, and rapid progress has +thus far made a thorough systematization of methods difficult. In this survey, +we review over 300 works with a focus on inner interpretability tools. We +introduce a taxonomy that classifies methods by what part of the network they +help to explain (weights, neurons, subnetworks, or latent representations) and +whether they are implemented during (intrinsic) or after (post hoc) training. +To our knowledge, we are also the first to survey a number of connections +between interpretability research and work in adversarial robustness, continual +learning, modularity, network compression, and studying the human visual +system. We discuss key challenges and argue that the status quo in +interpretability research is largely unproductive. Finally, we highlight the +importance of future work that emphasizes diagnostics, debugging, adversaries, +and benchmarking in order to make interpretability tools more useful to +engineers in practical applications. + +
+
+
+
+
+ + ♻ ☆ Supporting Human-AI Collaboration in Auditing LLMs with LLMs + + +
+ Large language models are becoming increasingly pervasive and ubiquitous in
+society via deployment in sociotechnical systems. Yet these language models,
+whether for classification or generation, have been shown to be biased and to
+behave irresponsibly, causing harm to people at scale. It is crucial to audit
+these language models rigorously. Existing auditing tools leverage either or
+both humans and AI to find failures. In this work, we draw upon literature in
+human-AI collaboration and sensemaking, and conduct interviews with research
+experts in safe and fair AI, to build upon the auditing tool AdaTest (Ribeiro
+and Lundberg, 2022), which is powered by a generative large language model
+(LLM). Through the design process we highlight the importance of sensemaking
+and human-AI communication to leverage complementary strengths of humans and
+generative models in collaborative auditing. To evaluate the effectiveness of
+the augmented tool, AdaTest++, we conduct user studies with participants
+auditing two commercial language models: OpenAI's GPT-3 and Azure's sentiment
+analysis model. Qualitative analysis shows that AdaTest++ effectively
+leverages human strengths such as schematization, hypothesis formation and
+testing. Further, with our tool, participants identified a variety of failure
+modes, covering 26 different topics over 2 tasks, including failure modes
+shown before in formal audits as well as ones previously under-reported.
+
+
+ comment: 21 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Scaling Clinical Trial Matching Using Large Language Models: A Case + Study in Oncology + + +
+ Clinical trial matching is a key process in health delivery and discovery. In
+practice, it is plagued by overwhelming unstructured data and unscalable
+manual processing. In this paper, we conduct a systematic study on scaling
+clinical trial matching using large language models (LLMs), with oncology as
+the focus area. Our study is grounded in a clinical trial matching system
+currently in test deployment at a large U.S. health network. Initial findings
+are promising: out of the box, cutting-edge LLMs, such as GPT-4, can already
+structure elaborate eligibility criteria of clinical trials and extract
+complex matching logic (e.g., nested AND/OR/NOT). While still far from
+perfect, LLMs substantially outperform prior strong baselines and may serve as
+a preliminary solution to help triage patient-trial candidates with humans in
+the loop. Our study also reveals a few significant growth areas for applying
+LLMs to end-to-end clinical trial matching, such as context limitation and
+accuracy, especially in structuring patient information from longitudinal
+medical records.
+
+
+ comment: 24 pages, 5 figures, accepted at Machine Learning for Healthcare + (MLHC) 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 149 + +
+
+
+ + ☆ Towards Large-scale 3D Representation Learning with Multi-dataset Point + Prompt Training + + +
+ The rapid advancement of deep learning models is often attributed to their
+ability to leverage massive training data. In contrast, such privilege has not
+yet fully benefited 3D deep learning, mainly due to the limited availability
+of large-scale 3D datasets. Merging multiple available data sources and
+letting them collaboratively train a single model is a potential solution.
+However, due to the large domain gap between 3D point cloud datasets, such
+mixed supervision could adversely affect the model and lead to degraded
+performance (i.e., negative transfer) compared to single-dataset training. In
+view of this challenge, we introduce Point Prompt Training (PPT), a novel
+framework for multi-dataset synergistic learning in the context of 3D
+representation learning that supports multiple pre-training paradigms. Based
+on this framework, we propose Prompt-driven Normalization, which adapts the
+model to different datasets with domain-specific prompts, and Language-guided
+Categorical Alignment, which unifies the multiple-dataset label spaces by
+leveraging the relationship between label text. Extensive experiments verify
+that PPT can overcome the negative transfer associated with synergistic
+learning and produce generalizable representations. Notably, it achieves
+state-of-the-art performance on each dataset using a single weight-shared
+model with supervised multi-dataset training. Moreover, when serving as a
+pre-training framework, it outperforms other pre-training approaches regarding
+representation quality and attains remarkable state-of-the-art performance
+across over ten diverse downstream tasks spanning both indoor and outdoor 3D
+scenarios.
+
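One way to picture the Prompt-driven Normalization idea above is a shared normalization layer whose affine parameters are selected by learned, per-dataset embeddings. The PyTorch sketch below is an assumed reading of the mechanism, not the released implementation (see the linked Pointcept repository for the actual code).

```python
import torch
import torch.nn as nn

class PromptDrivenNorm(nn.Module):
    """Assumed sketch of domain-conditioned normalization: a shared LayerNorm
    whose scale/shift come from per-dataset learned embeddings. The released
    Pointcept implementation may differ in detail."""

    def __init__(self, dim, num_datasets):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=False)
        self.scale = nn.Embedding(num_datasets, dim)
        self.shift = nn.Embedding(num_datasets, dim)
        nn.init.ones_(self.scale.weight)
        nn.init.zeros_(self.shift.weight)

    def forward(self, x, dataset_id):
        # x: (batch, num_points, dim); dataset_id: (batch,) long tensor
        gamma = self.scale(dataset_id).unsqueeze(1)
        beta = self.shift(dataset_id).unsqueeze(1)
        return self.norm(x) * gamma + beta
```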
+
+ comment: Code available at Pointcept (https://github.com/Pointcept/Pointcept) +
+
+
+
+
+ + ☆ Smoothness Similarity Regularization for Few-Shot GAN Adaptation ICCV + + +
+ The task of few-shot GAN adaptation aims to adapt a pre-trained GAN model to +a small dataset with very few training images. While existing methods perform +well when the dataset for pre-training is structurally similar to the target +dataset, the approaches suffer from training instabilities or memorization +issues when the objects in the two domains have a very different structure. To +mitigate this limitation, we propose a new smoothness similarity regularization +that transfers the inherently learned smoothness of the pre-trained GAN to the +few-shot target domain even if the two domains are very different. We evaluate +our approach by adapting an unconditional and a class-conditional GAN to +diverse few-shot target domains. Our proposed method significantly outperforms +prior few-shot GAN adaptation methods in the challenging case of structurally +dissimilar source-target domains, while performing on par with the state of the +art for similar source-target domains. + +
+
+ comment: International Conference on Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ Diff2Lip: Audio Conditioned Diffusion Models for Lip-Synchronization + + +
+ The task of lip synchronization (lip-sync) seeks to match the lips of human +faces with different audio. It has various applications in the film industry as +well as for creating virtual avatars and for video conferencing. This is a +challenging problem as one needs to simultaneously introduce detailed, +realistic lip movements while preserving the identity, pose, emotions, and +image quality. Many of the previous methods trying to solve this problem suffer +from image quality degradation due to a lack of complete contextual +information. In this paper, we present Diff2Lip, an audio-conditioned +diffusion-based model which is able to do lip synchronization in-the-wild while +preserving these qualities. We train our model on Voxceleb2, a video dataset +containing in-the-wild talking face videos. Extensive studies show that our +method outperforms popular methods like Wav2Lip and PC-AVS in Fr\'echet +inception distance (FID) metric and Mean Opinion Scores (MOS) of the users. We +show results on both reconstruction (same audio-video inputs) as well as cross +(different audio-video inputs) settings on Voxceleb2 and LRW datasets. Video +results and code can be accessed from our project page ( +https://soumik-kanad.github.io/diff2lip ). + +
+
+ comment: Website: see https://soumik-kanad.github.io/diff2lip . Submission + under review +
+
+
+
+
+ + ☆ Dynamic 3D Gaussians: Tracking by Persistent Dynamic View Synthesis + + +
+ We present a method that simultaneously addresses the tasks of dynamic scene +novel-view synthesis and six degree-of-freedom (6-DOF) tracking of all dense +scene elements. We follow an analysis-by-synthesis framework, inspired by +recent work that models scenes as a collection of 3D Gaussians which are +optimized to reconstruct input images via differentiable rendering. To model +dynamic scenes, we allow Gaussians to move and rotate over time while enforcing +that they have persistent color, opacity, and size. By regularizing Gaussians' +motion and rotation with local-rigidity constraints, we show that our Dynamic +3D Gaussians correctly model the same area of physical space over time, +including the rotation of that space. Dense 6-DOF tracking and dynamic +reconstruction emerges naturally from persistent dynamic view synthesis, +without requiring any correspondence or flow as input. We demonstrate a large +number of downstream applications enabled by our representation, including +first-person view synthesis, dynamic compositional scene synthesis, and 4D +video editing. + +
+
+
+
+
+ + ☆ HumanLiff: Layer-wise 3D Human Generation with Diffusion Model + + +
+ 3D human generation from 2D images has achieved remarkable progress through the synergistic utilization of neural rendering and generative models. Existing 3D human generative models mainly generate a clothed 3D human as a single, monolithic 3D model in one pass, while rarely considering the layer-wise nature of a clothed human body, which often consists of the human body and various clothes such as underwear, outerwear, trousers, shoes, etc. In this work, we propose HumanLiff, the first layer-wise 3D human generative model with a unified diffusion process. Specifically, HumanLiff first generates minimally clothed humans, represented by tri-plane features, in a canonical space, and then progressively generates clothes in a layer-wise manner. In this way, 3D human generation is formulated as a sequence of diffusion-based 3D conditional generation steps. To reconstruct more fine-grained 3D humans with the tri-plane representation, we propose a tri-plane shift operation that splits each tri-plane into three sub-planes and shifts these sub-planes to enable feature grid subdivision. To further enhance the controllability of 3D generation with 3D layered conditions, HumanLiff hierarchically fuses tri-plane features and 3D layered conditions to facilitate the 3D diffusion model learning. Extensive experiments on two layer-wise 3D human datasets, SynBody (synthetic) and TightCap (real-world), validate that HumanLiff significantly outperforms state-of-the-art methods in layer-wise 3D human generation. Our code will be available at https://skhu101.github.io/HumanLiff.
+
+ comment: Project page: https://skhu101.github.io/HumanLiff/ +
+
+
+
+
+ + ☆ Robust Monocular Depth Estimation under Challenging Conditions ICCV 2023 + + +
+ While state-of-the-art monocular depth estimation approaches achieve +impressive results in ideal settings, they are highly unreliable under +challenging illumination and weather conditions, such as at nighttime or in the +presence of rain. In this paper, we uncover these safety-critical issues and +tackle them with md4all: a simple and effective solution that works reliably +under both adverse and ideal conditions, as well as for different types of +learning supervision. We achieve this by exploiting the efficacy of existing +methods under perfect settings. Therefore, we provide valid training signals +independently of what is in the input. First, we generate a set of complex +samples corresponding to the normal training ones. Then, we train the model by +guiding its self- or full-supervision by feeding the generated samples and +computing the standard losses on the corresponding original images. Doing so +enables a single model to recover information across diverse conditions without +modifications at inference time. Extensive experiments on two challenging +public datasets, namely nuScenes and Oxford RobotCar, demonstrate the +effectiveness of our techniques, outperforming prior works by a large margin in +both standard and challenging conditions. Source code and data are available +at: https://md4all.github.io. + +
+
+ comment: ICCV 2023. Source code and data: https://md4all.github.io +
+
+
+
+
+ + ☆ SimDA: Simple Diffusion Adapter for Efficient Video Generation + + +
+ The recent wave of AI-generated content has witnessed the great development and success of Text-to-Image (T2I) technologies. By contrast, Text-to-Video (T2V) still falls short of expectations despite attracting increasing interest. Existing works either train from scratch or adapt a large T2I model to videos, both of which are computationally expensive and resource-intensive. In this work, we propose a Simple Diffusion Adapter (SimDA) that fine-tunes only 24M out of the 1.1B parameters of a strong T2I model, adapting it to video generation in a parameter-efficient way. In particular, we turn the T2I model into a T2V model by designing lightweight spatial and temporal adapters for transfer learning. In addition, we replace the original spatial attention with the proposed Latent-Shift Attention (LSA) for temporal consistency. With a similar model architecture, we further train a video super-resolution model to generate high-definition (1024x1024) videos. In addition to T2V generation in the wild, SimDA can also be used for one-shot video editing with only 2 minutes of tuning. In doing so, our method minimizes the training effort, requiring extremely few tunable parameters for model adaptation.
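As a rough illustration of how temporal information can be injected into per-frame latents, here is a minimal PyTorch sketch of a temporal channel shift; the SimDA design is only described at a high level above, so the function name, tensor layout, and shift fraction are assumptions rather than the paper's implementation.

import torch

def temporal_latent_shift(x, shift_frac=0.25):
    # x: (B, T, C, H, W) latent feature maps for T frames (layout assumed).
    # A slice of channels is shifted forward in time and another backward, so
    # spatial attention at frame t also sees content from frames t-1 and t+1.
    B, T, C, H, W = x.shape
    n = max(int(C * shift_frac) // 2, 1)
    out = x.clone()
    out[:, 1:, :n] = x[:, :-1, :n]            # forward shift
    out[:, :-1, n:2 * n] = x[:, 1:, n:2 * n]  # backward shift
    return out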
+
+
+
+
+ + ☆ Training with Product Digital Twins for AutoRetail Checkout + + +
+ Automating the checkout process is important in smart retail, where users effortlessly pass products by hand through a camera, triggering automatic product detection, tracking, and counting. In this emerging area, due to the lack of annotated training data, we introduce a dataset comprised of product 3D models, which allows for fast, flexible, and large-scale training data generation through graphic engine rendering. Within this context, we discern an intriguing facet: because of the user "hands-on" approach, bias in user behavior leads to distinct patterns in the real checkout process. The existence of such patterns would compromise training effectiveness if the training data fail to reflect them. To address this user bias problem, we propose a training data optimization framework, i.e., training with digital twins (DtTrain). Specifically, we leverage the product 3D models and optimize their rendering viewpoint and illumination to generate "digital twins" that visually resemble representative user images. These digital twins inherit product labels and, when augmented, form the Digital Twin training set (DT set). Because the digital twins individually mimic user bias, the resulting DT training set better reflects the characteristics of the target scenario and allows us to train more effective product detection and tracking models. In our experiments, we show that the DT set outperforms training sets created by existing dataset synthesis methods in terms of counting accuracy. Moreover, by combining the DT set with pseudo-labeled real checkout data, further improvement is observed. The code is available at https://github.com/yorkeyao/Automated-Retail-Checkout.
+
+
+
+
+ + ☆ Guide3D: Create 3D Avatars from Text and Image Guidance + + +
+ Recently, text-to-image generation has exhibited remarkable advancements, +with the ability to produce visually impressive results. In contrast, +text-to-3D generation has not yet reached a comparable level of quality. +Existing methods primarily rely on text-guided score distillation sampling +(SDS), and they encounter difficulties in transferring 2D attributes of the +generated images to 3D content. In this work, we aim to develop an effective 3D +generative model capable of synthesizing high-resolution textured meshes by +leveraging both textual and image information. To this end, we introduce +Guide3D, a zero-shot text-and-image-guided generative model for 3D avatar +generation based on diffusion models. Our model involves (1) generating +sparse-view images of a text-consistent character using diffusion models, and +(2) jointly optimizing multi-resolution differentiable marching tetrahedral +grids with pixel-aligned image features. We further propose a similarity-aware +feature fusion strategy for efficiently integrating features from different +views. Moreover, we introduce two novel training objectives as an alternative +to calculating SDS, significantly enhancing the optimization process. We +thoroughly evaluate the performance and components of our framework, which +outperforms the current state-of-the-art in producing topologically and +structurally correct geometry and high-resolution textures. Guide3D enables the +direct transfer of 2D-generated images to the 3D space. Our code will be made +publicly available. + +
+
+ comment: 25 pages, 22 figures +
+
+
+
+
+ + ☆ Invariant Training 2D-3D Joint Hard Samples for Few-Shot Point Cloud + Recognition + + +
+ We tackle the data scarcity challenge in few-shot point cloud recognition of 3D objects by using a joint prediction from a conventional 3D model and a well-trained 2D model. Surprisingly, such an ensemble, though seemingly trivial, has hardly been shown to be effective in recent 2D-3D models. We find that the crux is the less effective training on the "joint hard samples", which have high-confidence predictions on different wrong labels, implying that the 2D and 3D models do not collaborate well. To this end, our proposed invariant training strategy, called InvJoint, not only places more training emphasis on the hard samples, but also seeks the invariance between the conflicting and ambiguous 2D and 3D predictions. InvJoint can learn more collaborative 2D and 3D representations for a better ensemble. Extensive experiments on 3D shape classification with the widely adopted ModelNet10/40, ScanObjectNN and Toys4K, and shape retrieval with ShapeNet-Core validate the superiority of our InvJoint.
+
+
+
+
+ + ☆ A Lightweight Transformer for Faster and Robust EBSD Data Collection + + +
+ Three-dimensional electron back-scattered diffraction (EBSD) microscopy is a critical tool in many applications in materials science, yet its data quality can fluctuate greatly during the arduous collection process, particularly via serial-sectioning. Fortunately, 3D EBSD data is inherently sequential, opening up the opportunity to use transformers, state-of-the-art deep learning architectures that have made breakthroughs in a plethora of domains, for data processing and recovery. To be more robust to errors and to accelerate 3D EBSD data collection, we introduce a two-step method that recovers missing slices in a 3D EBSD volume, using an efficient transformer model and a projection algorithm to process the transformer's outputs. Overcoming the computational and practical hurdles of deep learning with scarce high-dimensional data, we train this model using only synthetic 3D EBSD data with self-supervision and obtain superior recovery accuracy on real 3D EBSD data, compared to existing methods.
+
+
+
+
+ + ☆ Audiovisual Moments in Time: A Large-Scale Annotated Dataset of + Audiovisual Actions + + +
+ We present Audiovisual Moments in Time (AVMIT), a large-scale dataset of audiovisual action events. In an extensive annotation task, 11 participants labelled a subset of 3-second audiovisual videos from the Moments in Time dataset (MIT). For each trial, participants assessed whether the labelled audiovisual action event was present and whether it was the most prominent feature of the video. The dataset includes the annotation of 57,177 audiovisual videos, each independently evaluated by 3 of 11 trained participants. From this initial collection, we created a curated test set of 16 distinct action classes, with 60 videos each (960 videos). We also offer 2 sets of pre-computed audiovisual feature embeddings, using VGGish/YamNet for audio data and VGG16/EfficientNetB0 for visual data, thereby lowering the barrier to entry for audiovisual DNN research. We explored the advantages of AVMIT annotations and feature embeddings for improving performance on audiovisual event recognition. A series of 6 Recurrent Neural Networks (RNNs) were trained on either AVMIT-filtered audiovisual events or modality-agnostic events from MIT, and then tested on our audiovisual test set. In all RNNs, top-1 accuracy was increased by 2.71-5.94% by training exclusively on audiovisual events, even outweighing a three-fold increase in training data. We anticipate that the newly annotated AVMIT dataset will serve as a valuable resource for research and comparative experiments involving computational models and human participants, specifically when addressing research questions where audiovisual correspondence is of critical importance.
+
+
+
+
+ + ☆ PoSynDA: Multi-Hypothesis Pose Synthesis Domain Adaptation for Robust 3D + Human Pose Estimation + + +
+ The current 3D human pose estimators face challenges in adapting to new datasets due to the scarcity of 2D-3D pose pairs in target domain training sets. We present the Multi-Hypothesis Pose Synthesis Domain Adaptation (PoSynDA) framework to overcome this issue without extensive target domain annotation. Utilizing a diffusion-centric structure, PoSynDA simulates the 3D pose distribution in the target domain, filling the data diversity gap. By incorporating a multi-hypothesis network, it creates diverse pose hypotheses and aligns them with the target domain. Target-specific source augmentation obtains the target domain distribution data from the source domain by decoupling the scale and position parameters. The teacher-student paradigm and low-rank adaptation further refine the process. PoSynDA demonstrates competitive performance on benchmarks such as Human3.6M, MPI-INF-3DHP, and 3DPW, even comparable with the target-trained MixSTE model (Zhang et al., 2022). This work paves the way for the practical application of 3D human pose estimation. The code is available at https://github.com/hbing-l/PoSynDA.
+
+ comment: Accepted to ACM Multimedia 2023; 10 pages, 4 figures, 8 tables; the + code is at https://github.com/hbing-l/PoSynDA +
+
+
+
+
+ + ☆ Tree-of-Mixed-Thought: Combining Fast and Slow Thinking for Multi-hop + Visual Reasoning + + +
+ There is a promising emerging trend of using large language models (LLMs) to generate code-like plans for complex inference tasks such as visual reasoning. This paradigm, known as LLM-based planning, provides flexibility in problem solving and offers better interpretability. However, current research is mostly limited to basic scenarios with simple questions that can be answered straightforwardly in a few inference steps. Planning for the more challenging multi-hop visual reasoning tasks remains under-explored. Specifically, under multi-hop reasoning situations, the trade-off between accuracy and the complexity of plan-searching becomes prominent. The prevailing algorithms either address the efficiency issue by employing fast one-stop generation or adopt a complex iterative generation method to improve accuracy. Both fail to balance the need for efficiency and performance. Drawing inspiration from the dual system of cognition in the human brain, the fast and the slow thinking processes, we propose a hierarchical plan-searching algorithm that integrates one-stop reasoning (fast) and Tree-of-thought (slow). Our approach achieves strong performance while significantly reducing the number of inference steps. Moreover, we repurpose the PTR and the CLEVER datasets, developing a systematic framework for evaluating the performance and efficiency of LLM-based plan-search algorithms on reasoning tasks at different levels of difficulty. Extensive experiments demonstrate the superiority of our proposed algorithm in terms of performance and efficiency. The dataset and code will be released soon.
+
+ comment: 16 pages, 1 figure, under review
+
+
+
+
+ + ☆ Revisiting Skin Tone Fairness in Dermatological Lesion Classification MICCAI + + +
+ Addressing fairness in lesion classification from dermatological images is crucial due to variations in how skin diseases manifest across skin tones. However, the absence of skin tone labels in public datasets hinders building a fair classifier. To date, such skin tone labels have been estimated prior to fairness analysis in independent studies using the Individual Typology Angle (ITA). Briefly, ITA calculates an angle based on pixels extracted from skin images, taking into account the lightness and yellow-blue tints. These angles are then categorised into skin tones that are subsequently used to analyse fairness in skin cancer classification. In this work, we review and compare four ITA-based approaches to skin tone classification on the ISIC18 dataset, a common benchmark for assessing skin cancer classification fairness in the literature. Our analyses reveal a high disagreement among previously published studies, demonstrating the risks of ITA-based skin tone estimation methods. Moreover, we investigate the causes of such a large discrepancy among these approaches and find that the lack of diversity in the ISIC18 dataset limits its use as a testbed for fairness analysis. Finally, we recommend further research on robust ITA estimation and diverse dataset acquisition with skin tone annotation to facilitate conclusive fairness assessments of artificial intelligence tools in dermatology. Our code is available at https://github.com/tkalbl/RevisitingSkinToneFairness.
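For reference, the ITA computation that these approaches build on is a single formula on CIELAB values, ITA = arctan((L* - 50) / b*) * 180 / pi. A minimal sketch is below; the patch extraction, the per-pixel versus mean aggregation, and the tone thresholds are exactly the design choices on which the compared approaches disagree, so everything beyond the formula itself is an assumption.

import numpy as np
from skimage import color  # any RGB -> CIELAB conversion works; skimage is an assumption

def individual_typology_angle(rgb_patch):
    # rgb_patch: (H, W, 3) float array in [0, 1] containing only skin pixels.
    lab = color.rgb2lab(rgb_patch)
    L, b = lab[..., 0], lab[..., 2]          # lightness and yellow-blue component
    # ITA in degrees; aggregation over pixels (mean here) varies between studies
    return float(np.degrees(np.arctan2(L.mean() - 50.0, b.mean())))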
+
+ comment: Accepted at 2023 MICCAI FAIMI Workshop +
+
+
+
+
+ + ☆ VALERIE22 -- A photorealistic, richly metadata annotated dataset of + urban environments + + +
+ The VALERIE tool pipeline is a synthetic data generator developed with the goal of contributing to the understanding of domain-specific factors that influence the perception performance of DNNs (deep neural networks). This work was carried out under the German research project KI Absicherung in order to develop a methodology for the validation of DNNs in the context of pedestrian detection in urban environments for automated driving. The VALERIE22 dataset was generated with the VALERIE procedural tools pipeline, providing a photorealistic sensor simulation rendered from automatically synthesized scenes. The dataset provides a uniquely rich set of metadata, allowing extraction of specific scene and semantic features (like pixel-accurate occlusion rates, positions in the scene, and distance and angle to the camera). This enables a multitude of possible tests on the data, and we hope to stimulate research on understanding the performance of DNNs. Based on a performance metric, a comparison with several other publicly available datasets is provided, demonstrating that VALERIE22 is one of the best-performing synthetic datasets currently available in the open domain.
+
+
+
+
+ + ☆ GeoDTR+: Toward generic cross-view geolocalization via geometric + disentanglement + + +
+ Cross-View Geo-Localization (CVGL) estimates the location of a ground image by matching it to a geo-tagged aerial image in a database. Recent works achieve outstanding progress on CVGL benchmarks. However, existing methods still suffer from poor performance in cross-area evaluation, in which the training and testing data are captured from completely distinct areas. We attribute this deficiency to the lack of ability to extract the geometric layout of visual features and models' overfitting to low-level details. Our preliminary work introduced a Geometric Layout Extractor (GLE) to capture the geometric layout from input features. However, the previous GLE does not fully exploit information in the input feature. In this work, we propose GeoDTR+ with an enhanced GLE module that better models the correlations among visual features. To fully explore the LS techniques from our preliminary work, we further propose Contrastive Hard Samples Generation (CHSG) to facilitate model training. Extensive experiments show that GeoDTR+ achieves state-of-the-art (SOTA) results in cross-area evaluation on CVUSA, CVACT, and VIGOR by a large margin (16.44%, 22.71%, and 17.02% without polar transformation) while keeping the same-area performance comparable to existing SOTA. Moreover, we provide detailed analyses of GeoDTR+.
+
+ comment: arXiv admin note: text overlap with arXiv:2212.04074 +
+
+
+
+
+ + ☆ Is context all you need? Scaling Neural Sign Language Translation to + Large Domains of Discourse + + +
+ Sign Language Translation (SLT) is a challenging task that aims to generate +spoken language sentences from sign language videos, both of which have +different grammar and word/gloss order. From a Neural Machine Translation (NMT) +perspective, the straightforward way of training translation models is to use +sign language phrase-spoken language sentence pairs. However, human +interpreters heavily rely on the context to understand the conveyed +information, especially for sign language interpretation, where the vocabulary +size may be significantly smaller than their spoken language equivalent. + Taking direct inspiration from how humans translate, we propose a novel +multi-modal transformer architecture that tackles the translation task in a +context-aware manner, as a human would. We use the context from previous +sequences and confident predictions to disambiguate weaker visual cues. To +achieve this we use complementary transformer encoders, namely: (1) A Video +Encoder, that captures the low-level video features at the frame-level, (2) A +Spotting Encoder, that models the recognized sign glosses in the video, and (3) +A Context Encoder, which captures the context of the preceding sign sequences. +We combine the information coming from these encoders in a final transformer +decoder to generate spoken language translations. + We evaluate our approach on the recently published large-scale BOBSL dataset, +which contains ~1.2M sequences, and on the SRF dataset, which was part of the +WMT-SLT 2022 challenge. We report significant improvements on state-of-the-art +translation performance using contextual information, nearly doubling the +reported BLEU-4 scores of baseline approaches. + +
+
+
+
+
+ + ☆ LaRS: A Diverse Panoptic Maritime Obstacle Detection Dataset and + Benchmark ICCV 2023 + + +
+ The progress in maritime obstacle detection is hindered by the lack of a +diverse dataset that adequately captures the complexity of general maritime +environments. We present the first maritime panoptic obstacle detection +benchmark LaRS, featuring scenes from Lakes, Rivers and Seas. Our major +contribution is the new dataset, which boasts the largest diversity in +recording locations, scene types, obstacle classes, and acquisition conditions +among the related datasets. LaRS is composed of over 4000 per-pixel labeled key +frames with nine preceding frames to allow utilization of the temporal texture, +amounting to over 40k frames. Each key frame is annotated with 8 thing, 3 stuff +classes and 19 global scene attributes. We report the results of 27 semantic +and panoptic segmentation methods, along with several performance insights and +future research directions. To enable objective evaluation, we have implemented +an online evaluation server. The LaRS dataset, evaluation toolkit and benchmark +are publicly available at: https://lojzezust.github.io/lars-dataset + +
+
+ comment: ICCV 2023, 9 pages, 8 figures +
+
+
+
+
+ + ☆ Far3D: Expanding the Horizon for Surround-view 3D Object Detection + + +
+ Recently 3D object detection from surround-view images has made notable +advancements with its low deployment cost. However, most works have primarily +focused on close perception range while leaving long-range detection less +explored. Expanding existing methods directly to cover long distances poses +challenges such as heavy computation costs and unstable convergence. To address +these limitations, this paper proposes a novel sparse query-based framework, +dubbed Far3D. By utilizing high-quality 2D object priors, we generate 3D +adaptive queries that complement the 3D global queries. To efficiently capture +discriminative features across different views and scales for long-range +objects, we introduce a perspective-aware aggregation module. Additionally, we +propose a range-modulated 3D denoising approach to address query error +propagation and mitigate convergence issues in long-range tasks. Significantly, +Far3D demonstrates SoTA performance on the challenging Argoverse 2 dataset, +covering a wide range of 150 meters, surpassing several LiDAR-based approaches. +Meanwhile, Far3D exhibits superior performance compared to previous methods on +the nuScenes dataset. The code will be available soon. + +
+
+
+
+
+ + ☆ Language-guided Human Motion Synthesis with Atomic Actions ACM MM 2023 + + +
+ Language-guided human motion synthesis has been a challenging task due to the +inherent complexity and diversity of human behaviors. Previous methods face +limitations in generalization to novel actions, often resulting in unrealistic +or incoherent motion sequences. In this paper, we propose ATOM (ATomic mOtion +Modeling) to mitigate this problem, by decomposing actions into atomic actions, +and employing a curriculum learning strategy to learn atomic action +composition. First, we disentangle complex human motions into a set of atomic +actions during learning, and then assemble novel actions using the learned +atomic actions, which offers better adaptability to new actions. Moreover, we +introduce a curriculum learning training strategy that leverages masked motion +modeling with a gradual increase in the mask ratio, and thus facilitates atomic +action assembly. This approach mitigates the overfitting problem commonly +encountered in previous methods while enforcing the model to learn better +motion representations. We demonstrate the effectiveness of ATOM through +extensive experiments, including text-to-motion and action-to-motion synthesis +tasks. We further illustrate its superiority in synthesizing plausible and +coherent text-guided human motion sequences. + +
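The curriculum component described above is easy to picture: the masked-motion-modeling mask ratio grows as training progresses. Here is a minimal PyTorch-style sketch; the linear schedule, the maximum ratio, and masking with zeros (instead of a learned mask token) are assumptions for illustration, not the ATOM implementation.

import torch

def curriculum_masked_motion(motion, step, total_steps, max_ratio=0.6):
    # motion: (B, T, D) pose sequence; the fraction of masked frames grows
    # linearly with training progress, making atomic-action assembly
    # progressively harder.
    ratio = max_ratio * min(step / total_steps, 1.0)
    mask = torch.rand(motion.shape[:2], device=motion.device) < ratio  # (B, T), True = masked
    masked = motion.clone()
    masked[mask] = 0.0  # stand-in for a learned [MASK] token
    return masked, mask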
+
+ comment: Accepted to ACM MM 2023, code: https://github.com/yhZhai/ATOM +
+
+
+
+
+ + ☆ On the Effectiveness of LayerNorm Tuning for Continual Learning in + Vision Transformers ICCV + + +
+ State-of-the-art rehearsal-free continual learning methods exploit the peculiarities of Vision Transformers to learn task-specific prompts, drastically reducing catastrophic forgetting. However, there is a tradeoff between the number of learned parameters and the performance, making such models computationally expensive. In this work, we aim to reduce this cost while maintaining competitive performance. We achieve this by revisiting and extending a simple transfer learning idea: learning task-specific normalization layers. Specifically, we tune the scale and bias parameters of LayerNorm for each continual learning task, selecting them at inference time based on the similarity between task-specific keys and the output of the pre-trained model. To make the classifier robust to incorrect selection of parameters during inference, we introduce a two-stage training procedure, where we first optimize the task-specific parameters and then train the classifier using the same selection procedure as at inference time. Experiments on ImageNet-R and CIFAR-100 show that our method achieves results that are either superior to or on par with the state of the art while being computationally cheaper.
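To make the selection mechanism concrete, below is a minimal PyTorch sketch of per-task LayerNorm scale/bias parameters chosen by key similarity at inference; the class name, the use of cosine similarity, and the key design are assumptions for illustration rather than the authors' implementation.

import torch
import torch.nn.functional as F

class TaskConditionalLayerNorm(torch.nn.Module):
    def __init__(self, dim, num_tasks):
        super().__init__()
        self.scales = torch.nn.Parameter(torch.ones(num_tasks, dim))
        self.biases = torch.nn.Parameter(torch.zeros(num_tasks, dim))
        self.keys = torch.nn.Parameter(torch.randn(num_tasks, dim))  # one key per task

    def forward(self, x, pretrained_feat, task_id=None):
        # x: (B, dim) features to normalize; pretrained_feat: (B, dim) output of
        # the frozen pre-trained model, used to pick the task at inference time.
        if task_id is None:
            sims = F.cosine_similarity(pretrained_feat.mean(0, keepdim=True), self.keys)
            task_id = int(sims.argmax())
        x = F.layer_norm(x, (x.shape[-1],))
        return x * self.scales[task_id] + self.biases[task_id]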
+
+ comment: In The First Workshop on Visual Continual Learning (ICCVW 2023); Oral +
+
+
+
+
+ + ☆ Language-Guided Diffusion Model for Visual Grounding + + +
+ Visual grounding (VG) tasks involve explicit cross-modal alignment, as semantically corresponding image regions are to be located for the language phrases provided. Existing approaches complete such visual-text reasoning in a single-step manner. Their performance relies heavily on large-scale anchors and over-designed multi-modal fusion modules based on human priors, leading to complicated frameworks that may be difficult to train and prone to overfitting to specific scenarios. Even worse, such once-for-all reasoning mechanisms are incapable of refining boxes continuously to enhance query-region matching. In contrast, in this paper, we formulate an iterative reasoning process by denoising diffusion modeling. Specifically, we propose a language-guided diffusion framework for visual grounding, LG-DVG, which trains the model to progressively reason about queried object boxes by denoising a set of noisy boxes under language guidance. To achieve this, LG-DVG gradually perturbs query-aligned ground truth boxes to noisy ones and reverses this process step by step, conditioned on query semantics. Extensive experiments for our proposed framework on five widely used datasets validate the superior performance of solving visual grounding, a cross-modal alignment task, in a generative way. The source code is available at https://github.com/iQua/vgbase/tree/DiffusionVG.
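The forward (noising) half of such a box-denoising formulation is straightforward to sketch. The following is a generic DDPM-style corruption of normalized boxes under an assumed linear beta schedule, given only for intuition; LG-DVG's exact parameterization and schedule may differ.

import torch

def noise_boxes(gt_boxes, t, num_steps=1000):
    # gt_boxes: (N, 4) ground-truth boxes in normalized (cx, cy, w, h) form;
    # t: integer diffusion step. The model is trained to reverse this process,
    # conditioned on the query semantics.
    betas = torch.linspace(1e-4, 0.02, num_steps)
    alpha_bar = torch.cumprod(1.0 - betas, dim=0)[t]
    eps = torch.randn_like(gt_boxes)
    noisy = alpha_bar.sqrt() * gt_boxes + (1.0 - alpha_bar).sqrt() * eps
    return noisy, eps  # eps is the regression target in epsilon-prediction setups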
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ☆ Investigation of Architectures and Receptive Fields for Appearance-based + Gaze Estimation + + +
+ With the rapid development of deep learning technology in the past decade, appearance-based gaze estimation has attracted great attention from both the computer vision and human-computer interaction research communities. A variety of methods have been proposed, with mechanisms including soft attention, hard attention, two-eye asymmetry, feature disentanglement, rotation consistency, and contrastive learning. Most of these methods take a single face or multiple regions as input, yet the basic architecture of gaze estimation has not been fully explored. In this paper, we reveal the fact that tuning a few simple parameters of a ResNet architecture can outperform most of the existing state-of-the-art methods for the gaze estimation task on three popular datasets. With our extensive experiments, we conclude that the stride number, input image resolution, and multi-region architecture are critical for gaze estimation performance, while their effectiveness depends on the quality of the input face image. We obtain state-of-the-art performance on three datasets, with gaze estimation errors of 3.64 degrees on ETH-XGaze, 4.50 degrees on MPIIFaceGaze, and 9.13 degrees on Gaze360, using ResNet-50 as the backbone.
+
+
+
+
+ + ☆ StableVideo: Text-driven Consistency-aware Diffusion Video Editing ICCV 2023 + + +
+ Diffusion-based methods can generate realistic images and videos, but they struggle to edit existing objects in a video while preserving their appearance over time. This prevents diffusion models from being applied to natural video editing in practical scenarios. In this paper, we tackle this problem by introducing temporal dependency to existing text-driven diffusion models, which allows them to generate consistent appearance for the edited objects. Specifically, we develop a novel inter-frame propagation mechanism for diffusion video editing, which leverages the concept of layered representations to propagate the appearance information from one frame to the next. We then build up a text-driven video editing framework based on this mechanism, namely StableVideo, which can achieve consistency-aware video editing. Extensive experiments demonstrate the strong editing capability of our approach. Compared with state-of-the-art video editing methods, our approach shows superior qualitative and quantitative results. Our code is available at https://github.com/rese1f/StableVideo.
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ O^2-Recon: Completing 3D Reconstruction of Occluded Objects in the Scene + with a Pre-trained 2D Diffusion Model + + +
+ Occlusion is a common issue in 3D reconstruction from RGB-D videos, often +blocking the complete reconstruction of objects and presenting an ongoing +problem. In this paper, we propose a novel framework, empowered by a 2D +diffusion-based in-painting model, to reconstruct complete surfaces for the +hidden parts of objects. Specifically, we utilize a pre-trained diffusion model +to fill in the hidden areas of 2D images. Then we use these in-painted images +to optimize a neural implicit surface representation for each instance for 3D +reconstruction. Since creating the in-painting masks needed for this process is +tricky, we adopt a human-in-the-loop strategy that involves very little human +engagement to generate high-quality masks. Moreover, some parts of objects can +be totally hidden because the videos are usually shot from limited +perspectives. To ensure recovering these invisible areas, we develop a cascaded +network architecture for predicting signed distance field, making use of +different frequency bands of positional encoding and maintaining overall +smoothness. Besides the commonly used rendering loss, Eikonal loss, and +silhouette loss, we adopt a CLIP-based semantic consistency loss to guide the +surface from unseen camera angles. Experiments on ScanNet scenes show that our +proposed framework achieves state-of-the-art accuracy and completeness in +object-level reconstruction from scene-level RGB-D videos. + +
+
+
+
+
+ + ☆ PUMGPT: A Large Vision-Language Model for Product Understanding + + +
+ Recent developments in multi-modal large language models have demonstrated their strong ability to solve vision-language tasks. In this paper, we focus on the product understanding task, which plays an essential role in enhancing the online shopping experience. The product understanding task includes a variety of sub-tasks, which require models to respond to diverse queries based on multi-modal product information. Traditional methods design distinct model architectures for each sub-task. In contrast, we present PUMGPT, a large vision-language model that aims to unify all product understanding tasks under a single model structure. To bridge the gap between vision and text representations, we propose Layer-wise Adapters (LA), an approach that provides enhanced alignment with fewer visual tokens and enables parameter-efficient fine-tuning. Moreover, the inherent parameter-efficient fine-tuning ability allows PUMGPT to be readily adapted to new product understanding tasks and emerging products. We design instruction templates to generate diverse product instruction datasets. Simultaneously, we utilize open-domain datasets during training to improve the performance of PUMGPT and its generalization ability. Through extensive evaluations, PUMGPT demonstrates its superior performance across multiple product understanding tasks, including product captioning, category question-answering, attribute extraction, attribute question-answering, and even free-form question-answering about products.
+
+
+
+
+ + ☆ Deep Equilibrium Object Detection + + +
+ Query-based object detectors directly decode image features into object instances with a set of learnable queries. These query vectors are progressively refined to stable, meaningful representations through a sequence of decoder layers, and then used to directly predict object locations and categories with simple FFN heads. In this paper, we present a new query-based object detector (DEQDet) by designing a deep equilibrium decoder. Our DEQ decoder models the query vector refinement as the fixed-point solving of an implicit layer and is equivalent to applying infinite steps of refinement. To be more specific to object decoding, we use a two-step unrolled equilibrium equation to explicitly capture the query vector refinement. Accordingly, we are able to incorporate refinement awareness into the DEQ training with the inexact gradient back-propagation (RAG). In addition, to stabilize the training of our DEQDet and improve its generalization ability, we devise a deep supervision scheme on the optimization path of DEQ with refinement-aware perturbation (RAP). Our experiments demonstrate that DEQDet converges faster, consumes less memory, and achieves better results than its baseline counterpart (AdaMixer). In particular, our DEQDet with a ResNet50 backbone and 300 queries achieves 49.5 mAP and 33.0 AP_s on the MS COCO benchmark under the 2x training scheme (24 epochs).
+
+
+
+
+ + ☆ Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free + Continual Learning ICCV 2023 + + +
+ In this work, we investigate exemplar-free class incremental learning (CIL) +with knowledge distillation (KD) as a regularization strategy, aiming to +prevent forgetting. KD-based methods are successfully used in CIL, but they +often struggle to regularize the model without access to exemplars of the +training data from previous tasks. Our analysis reveals that this issue +originates from substantial representation shifts in the teacher network when +dealing with out-of-distribution data. This causes large errors in the KD loss +component, leading to performance degradation in CIL. Inspired by recent +test-time adaptation methods, we introduce Teacher Adaptation (TA), a method +that concurrently updates the teacher and the main model during incremental +training. Our method seamlessly integrates with KD-based CIL approaches and +allows for consistent enhancement of their performance across multiple +exemplar-free CIL benchmarks. + +
+
+ comment: VCL workshop at ICCV 2023 +
+
+
+
+
+ + ☆ Decoupled conditional contrastive learning with variable metadata for + prostate lesion detection MICCAI + + +
+ Early diagnosis of prostate cancer is crucial for efficient treatment. Multi-parametric Magnetic Resonance Images (mp-MRI) are widely used for lesion detection. The Prostate Imaging Reporting and Data System (PI-RADS) has standardized the interpretation of prostate MRI by defining a score for lesion malignancy. PI-RADS data is readily available from radiology reports but is subject to high inter-report variability. We propose a new contrastive loss function that leverages weak metadata with multiple annotators per sample and takes advantage of inter-report variability by defining a metadata confidence. By combining metadata of varying confidence with unannotated data into a single conditional contrastive loss function, we report a 3% AUC increase on lesion detection on the public PI-CAI challenge dataset. Code is available at: https://github.com/camilleruppli/decoupled_ccl
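As an illustration of how a metadata confidence can enter a contrastive objective, here is a minimal sketch of a supervised-contrastive loss whose positive pairs are weighted by per-sample confidence; this is a simplification for intuition, not the paper's exact decoupled conditional formulation.

import torch
import torch.nn.functional as F

def confidence_weighted_contrastive(z, labels, confidence, tau=0.1):
    # z: (N, D) embeddings, labels: (N,) report-derived labels,
    # confidence: (N,) in [0, 1], e.g. derived from inter-report agreement.
    z = F.normalize(z, dim=1)
    sim = z @ z.t() / tau
    sim.fill_diagonal_(-1e9)                                # exclude self-similarity
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
    pos = (labels[:, None] == labels[None, :]).float()
    pos.fill_diagonal_(0.0)
    weights = pos * confidence[None, :]                     # down-weight low-confidence positives
    denom = weights.sum(dim=1).clamp(min=1e-8)
    return -(weights * log_prob).sum(dim=1).div(denom).mean()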
+
+ comment: Accepted at MILLanD workshop (MICCAI) +
+
+
+
+
+ + ☆ Meta-ZSDETR: Zero-shot DETR with Meta-learning ICCV 2023 + + +
+ Zero-shot object detection aims to localize and recognize objects of unseen classes. Most existing works face two problems: the low recall of the RPN on unseen classes and the confusion of unseen classes with the background. In this paper, we present the first method that combines DETR and meta-learning to perform zero-shot object detection, named Meta-ZSDETR, where model training is formalized as an individual episode-based meta-learning task. Different from Faster R-CNN based methods, which first generate class-agnostic proposals and then classify them with a visual-semantic alignment module, Meta-ZSDETR directly predicts class-specific boxes with class-specific queries and further filters them with the predicted accuracy from the classification head. The model is optimized with meta-contrastive learning, which contains a regression head to generate the coordinates of class-specific boxes, a classification head to predict the accuracy of generated boxes, and a contrastive head that utilizes the proposed contrastive-reconstruction loss to further separate different classes in the visual space. We conduct extensive experiments on two benchmark datasets, MS COCO and PASCAL VOC. Experimental results show that our method outperforms the existing ZSD methods by a large margin.
+
+ comment: Accepted in ICCV 2023 +
+
+
+
+
+ + ☆ Uncertainty-based quality assurance of carotid artery wall segmentation + in black-blood MRI + + +
+ The application of deep learning models to large-scale data sets requires +means for automatic quality assurance. We have previously developed a fully +automatic algorithm for carotid artery wall segmentation in black-blood MRI +that we aim to apply to large-scale data sets. This method identifies nested +artery walls in 3D patches centered on the carotid artery. In this study, we +investigate to what extent the uncertainty in the model predictions for the +contour location can serve as a surrogate for error detection and, +consequently, automatic quality assurance. We express the quality of automatic +segmentations using the Dice similarity coefficient. The uncertainty in the +model's prediction is estimated using either Monte Carlo dropout or test-time +data augmentation. We found that (1) including uncertainty measurements did not +degrade the quality of the segmentations, (2) uncertainty metrics provide a +good proxy of the quality of our contours if the center found during the first +step is enclosed in the lumen of the carotid artery and (3) they could be used +to detect low-quality segmentations at the participant level. This automatic +quality assurance tool might enable the application of our model in large-scale +data sets. + +
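Of the two uncertainty estimators mentioned, Monte Carlo dropout is the simpler to sketch: keep dropout active at test time, run several stochastic forward passes, and take the per-voxel spread as the uncertainty. A minimal PyTorch sketch follows; the number of samples and the use of the standard deviation as the uncertainty measure are assumptions, not necessarily the study's exact configuration.

import torch

def mc_dropout_uncertainty(model, volume, n_samples=20):
    # Enable dropout at test time by putting only the Dropout modules in train mode.
    model.eval()
    for m in model.modules():
        if isinstance(m, (torch.nn.Dropout, torch.nn.Dropout2d, torch.nn.Dropout3d)):
            m.train()
    with torch.no_grad():
        preds = torch.stack([model(volume) for _ in range(n_samples)], dim=0)
    return preds.mean(dim=0), preds.std(dim=0)  # mean prediction and per-voxel uncertainty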
+
+
+
+
+ + ☆ Small Object Detection via Coarse-to-fine Proposal Generation and + Imitation Learning ICCV2023 + + +
+ The past few years have witnessed the immense success of object detection, while current excellent detectors struggle to tackle size-limited instances. Concretely, the well-known challenge of low overlaps between the priors and object regions leads to a constrained sample pool for optimization, and the paucity of discriminative information further aggravates recognition. To alleviate the aforementioned issues, we propose CFINet, a two-stage framework tailored for small object detection based on a Coarse-to-fine pipeline and Feature Imitation learning. Firstly, we introduce a Coarse-to-fine RPN (CRPN) to ensure sufficient and high-quality proposals for small objects through a dynamic anchor selection strategy and cascade regression. Then, we equip the conventional detection head with a Feature Imitation (FI) branch to facilitate, in an imitation manner, the region representations of size-limited instances that perplex the model. Moreover, an auxiliary imitation loss following the supervised contrastive learning paradigm is devised to optimize this branch. When integrated with Faster RCNN, CFINet achieves state-of-the-art performance on the large-scale small object detection benchmarks SODA-D and SODA-A, underscoring its superiority over the baseline detector and other mainstream detection approaches.
+
+ comment: Camera-ready version for ICCV2023. Our code will be available at + https://github.com/shaunyuan22/CFINet +
+
+
+
+
+ + ☆ Improving 3D Pose Estimation for Sign Language + + +
+ This work addresses 3D human pose reconstruction in single images. We present a method that combines Forward Kinematics (FK) with neural networks to ensure a fast and valid prediction of 3D pose. Pose is represented as a hierarchical tree/graph with nodes corresponding to human joints that model their physical limits. Given a 2D detection of keypoints in the image, we lift the skeleton to 3D using neural networks to predict both the joint rotations and bone lengths. These predictions are then combined with skeletal constraints using an FK layer implemented as a network layer in PyTorch. The result is a fast and accurate approach to the estimation of 3D skeletal pose. Through quantitative and qualitative evaluation, we demonstrate that the method is significantly more accurate than MediaPipe in terms of both per-joint positional error and visual appearance. Furthermore, we demonstrate generalization over different datasets. The implementation in PyTorch runs in 100-200 milliseconds per image (including CNN detection) using the CPU only.
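A forward-kinematics layer of the kind described is essentially a differentiable traversal of the joint tree. Below is a minimal PyTorch sketch for a single skeleton; the joint ordering, input shapes, and rest-pose bone directions are assumptions, and batching is omitted for brevity, so this should be read as an illustration rather than the paper's layer.

import torch

def forward_kinematics(rotations, bone_lengths, bone_dirs, parents):
    # rotations: (J, 3, 3) predicted local joint rotations,
    # bone_lengths: (J,) predicted bone lengths,
    # bone_dirs: (J, 3) unit rest-pose bone directions,
    # parents: list of parent indices (parents listed before children, -1 for the root).
    J = rotations.shape[0]
    global_rot = [None] * J
    positions = [None] * J
    for j in range(J):
        p = parents[j]
        if p < 0:
            global_rot[j] = rotations[j]
            positions[j] = torch.zeros(3, dtype=rotations.dtype)
            continue
        global_rot[j] = global_rot[p] @ rotations[j]
        positions[j] = positions[p] + global_rot[p] @ (bone_dirs[j] * bone_lengths[j])
    return torch.stack(positions)  # (J, 3) joint locations, differentiable w.r.t. the inputs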
+
+
+
+
+ + ☆ Denoising Diffusion for 3D Hand Pose Estimation from Images + + +
+ Hand pose estimation from a single image has many applications. However, +approaches to full 3D body pose estimation are typically trained on day-to-day +activities or actions. As such, detailed hand-to-hand interactions are poorly +represented, especially during motion. We see this in the failure cases of +techniques such as OpenPose or MediaPipe. However, accurate hand pose +estimation is crucial for many applications where the global body motion is +less important than accurate hand pose estimation. + This paper addresses the problem of 3D hand pose estimation from monocular +images or sequences. We present a novel end-to-end framework for 3D hand +regression that employs diffusion models that have shown excellent ability to +capture the distribution of data for generative purposes. Moreover, we enforce +kinematic constraints to ensure realistic poses are generated by incorporating +an explicit forward kinematic layer as part of the network. The proposed model +provides state-of-the-art performance when lifting a 2D single-hand image to +3D. However, when sequence data is available, we add a Transformer module over +a temporal window of consecutive frames to refine the results, overcoming +jittering and further increasing accuracy. + The method is quantitatively and qualitatively evaluated showing +state-of-the-art robustness, generalization, and accuracy on several different +datasets. + +
+
+
+
+
+ + ☆ Leveraging Intrinsic Properties for Non-Rigid Garment Alignment ICCV 2023 + + +
+ We address the problem of aligning real-world 3D data of garments, which +benefits many applications such as texture learning, physical parameter +estimation, generative modeling of garments, etc. Existing extrinsic methods +typically perform non-rigid iterative closest point and struggle to align +details due to incorrect closest matches and rigidity constraints. While +intrinsic methods based on functional maps can produce high-quality +correspondences, they work under isometric assumptions and become unreliable +for garment deformations which are highly non-isometric. To achieve +wrinkle-level as well as texture-level alignment, we present a novel +coarse-to-fine two-stage method that leverages intrinsic manifold properties +with two neural deformation fields, in the 3D space and the intrinsic space, +respectively. The coarse stage performs a 3D fitting, where we leverage +intrinsic manifold properties to define a manifold deformation field. The +coarse fitting then induces a functional map that produces an alignment of +intrinsic embeddings. We further refine the intrinsic alignment with a second +neural deformation field for higher accuracy. We evaluate our method with our +captured garment dataset, GarmCap. The method achieves accurate wrinkle-level +and texture-level alignment and works for difficult garment types such as long +coats. Our project page is +https://jsnln.github.io/iccv2023_intrinsic/index.html. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Learnt Contrastive Concept Embeddings for Sign Recognition + + +
+ In natural language processing (NLP) of spoken languages, word embeddings +have been shown to be a useful method to encode the meaning of words. Sign +languages are visual languages, which require sign embeddings to capture the +visual and linguistic semantics of sign. Unlike many common approaches to Sign +Recognition, we focus on explicitly creating sign embeddings that bridge the +gap between sign language and spoken language. We propose a learning framework +to derive LCC (Learnt Contrastive Concept) embeddings for sign language, a +weakly supervised contrastive approach to learning sign embeddings. We train a +vocabulary of embeddings that are based on the linguistic labels for sign +video. Additionally, we develop a conceptual similarity loss which is able to +utilise word embeddings from NLP methods to create sign embeddings that have +better sign language to spoken language correspondence. These learnt +representations allow the model to automatically localise the sign in time. Our +approach achieves state-of-the-art keypoint-based sign recognition performance +on the WLASL and BOBSL datasets. + +
+
+
+
+
+ + ☆ ResQ: Residual Quantization for Video Perception ICCV 2023 + + +
+ This paper accelerates video perception, such as semantic segmentation and human pose estimation, by leveraging cross-frame redundancies. Unlike the existing approaches, which avoid redundant computations by warping the past features using optical flow or by performing sparse convolutions on frame differences, we approach the problem from a new perspective: low-bit quantization. We observe that residuals, as the difference in network activations between two neighboring frames, exhibit properties that make them highly quantizable. Based on this observation, we propose a novel quantization scheme for video networks coined Residual Quantization. ResQ extends the standard, frame-by-frame quantization scheme by incorporating temporal dependencies that lead to better performance in terms of accuracy vs. bit-width. Furthermore, we extend our model to dynamically adjust the bit-width proportionally to the amount of change in the video. We demonstrate the superiority of our model against standard quantization and existing efficient video perception models, using various architectures on semantic segmentation and human pose estimation benchmarks.
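The core observation translates into very little code: quantize the frame-to-frame difference of activations rather than the activations themselves. A minimal sketch with a plain uniform symmetric quantizer follows; the quantizer and bit-width are placeholders, and ResQ's learned scales and dynamic bit allocation are not modeled here.

import torch

def uniform_quantize(x, bits):
    # Simple symmetric uniform quantizer (assumption, not the paper's quantizer).
    qmax = 2 ** (bits - 1) - 1
    scale = x.abs().amax().clamp(min=1e-8) / qmax
    return torch.round(x / scale).clamp(-qmax, qmax) * scale

def residual_quantized_activation(prev_act, curr_act, bits=4):
    # prev_act: activations kept from the previous frame (full precision or dequantized),
    # curr_act: activations of the current frame. The small residual is what gets
    # represented at low bit-width.
    residual = curr_act - prev_act
    return prev_act + uniform_quantize(residual, bits)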
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Video-Instrument Synergistic Network for Referring Video Instrument + Segmentation in Robotic Surgery + + +
+ Robot-assisted surgery has made significant progress, with instrument +segmentation being a critical factor in surgical intervention quality. It +serves as the building block to facilitate surgical robot navigation and +surgical education for the next generation of operating intelligence. Although +existing methods have achieved accurate instrument segmentation results, they +simultaneously generate segmentation masks for all instruments, without the +capability to specify a target object and allow an interactive experience. This +work explores a new task of Referring Surgical Video Instrument Segmentation +(RSVIS), which aims to automatically identify and segment the corresponding +surgical instruments based on the given language expression. To achieve this, +we devise a novel Video-Instrument Synergistic Network (VIS-Net) to learn both +video-level and instrument-level knowledge to boost performance, while previous +work only used video-level information. Meanwhile, we design a Graph-based +Relation-aware Module (GRM) to model the correlation between multi-modal +information (i.e., textual description and video frame) to facilitate the +extraction of instrument-level information. We are also the first to produce +two RSVIS datasets to promote related research. Our method is verified on these +datasets, and experimental results exhibit that the VIS-Net can significantly +outperform existing state-of-the-art referring segmentation methods. Our code +and our datasets will be released upon the publication of this work. + +
+
+
+
+
+ + ☆ Vision Relation Transformer for Unbiased Scene Graph Generation ICCV 2023 + + +
+ Recent years have seen a growing interest in Scene Graph Generation (SGG), a comprehensive visual scene understanding task that aims to predict entity relationships using a relation encoder-decoder pipeline stacked on top of an object encoder-decoder backbone. Unfortunately, current SGG methods suffer from an information loss regarding the entities' local-level cues during the relation encoding process. To mitigate this, we introduce the Vision rElation TransfOrmer (VETO), consisting of a novel local-level entity relation encoder. We further observe that many existing SGG methods claim to be unbiased, but are still biased towards either head or tail classes. To overcome this bias, we introduce a Mutually Exclusive ExperT (MEET) learning strategy that captures important relation features without bias towards head or tail classes. Experimental results on the VG and GQA datasets demonstrate that VETO + MEET boosts the predictive performance by up to 47 percent over the state of the art while being 10 times smaller.
+
+ comment: Accepted for publication in ICCV 2023 +
+
+
+
+
+ + ☆ Quantitative Susceptibility Mapping through Model-based Deep Image Prior + (MoDIP) + + +
+ The data-driven approach of supervised learning methods has limited applicability in solving dipole inversion in Quantitative Susceptibility Mapping (QSM) with varying scan parameters across different objects. To address this generalization issue in supervised QSM methods, we propose a novel training-free, model-based unsupervised method called MoDIP (Model-based Deep Image Prior). MoDIP comprises a small, untrained network and a Data Fidelity Optimization (DFO) module. The network converges to an interim state, acting as an implicit prior for image regularization, while the optimization process enforces the physical model of QSM dipole inversion. Experimental results demonstrate MoDIP's excellent generalizability in solving QSM dipole inversion across different scan parameters. It exhibits robustness against pathological brain QSM, achieving over 32% accuracy improvement over supervised deep learning and traditional iterative methods. It is also 33% more computationally efficient and runs 4 times faster than conventional DIP-based approaches, enabling 3D high-resolution image reconstruction in under 4.5 minutes.
+
+
+
+
+ + ☆ Data augmentation and explainability for bias discovery and mitigation + in deep learning + + +
+ This dissertation explores the impact of bias in deep neural networks and +presents methods for reducing its influence on model performance. The first +part begins by categorizing and describing potential sources of bias and errors +in data and models, with a particular focus on bias in machine learning +pipelines. The next chapter outlines a taxonomy and methods of Explainable AI +as a way to justify predictions and control and improve the model. Then, as an +example of a laborious manual data inspection and bias discovery process, a +skin lesion dataset is manually examined. A Global Explanation for the Bias +Identification method is proposed as an alternative semi-automatic approach to +manual data exploration for discovering potential biases in data. Relevant +numerical methods and metrics are discussed for assessing the effects of the +identified biases on the model. Whereas identifying errors and bias is +critical, improving the model and reducing the number of flaws in the future is +an absolute priority. Hence, the second part of the thesis focuses on +mitigating the influence of bias on ML models. Three approaches are proposed +and discussed: Style Transfer Data Augmentation, Targeted Data Augmentations, +and Attribution Feedback. Style Transfer Data Augmentation aims to address +shape and texture bias by merging a style of a malignant lesion with a +conflicting shape of a benign one. Targeted Data Augmentations randomly insert +possible biases into all images in the dataset during the training, as a way to +make the process random and, thus, destroy spurious correlations. Lastly, +Attribution Feedback is used to fine-tune the model to improve its accuracy by +eliminating obvious mistakes and teaching it to ignore insignificant input +parts via an attribution loss. The goal of these approaches is to reduce the +influence of bias on machine learning models, rather than eliminate it +entirely. + +
+
+ comment: A PhD Thesis +
+
+
+
+
+ + ☆ Accelerated Bayesian imaging by relaxed proximal-point Langevin sampling + + +
+ This paper presents a new accelerated proximal Markov chain Monte Carlo +methodology to perform Bayesian inference in imaging inverse problems with an +underlying convex geometry. The proposed strategy takes the form of a +stochastic relaxed proximal-point iteration that admits two complementary +interpretations. For models that are smooth or regularised by Moreau-Yosida +smoothing, the algorithm is equivalent to an implicit midpoint discretisation +of an overdamped Langevin diffusion targeting the posterior distribution of +interest. This discretisation is asymptotically unbiased for Gaussian targets +and shown to converge in an accelerated manner for any target that is +$\kappa$-strongly log-concave (i.e., requiring in the order of $\sqrt{\kappa}$ +iterations to converge, similarly to accelerated optimisation schemes), +comparing favorably to [M. Pereyra, L. Vargas Mieles, K.C. Zygalakis, SIAM J. +Imaging Sciences, 13, 2 (2020), pp. 905-935] which is only provably accelerated +for Gaussian targets and has bias. For models that are not smooth, the +algorithm is equivalent to a Leimkuhler-Matthews discretisation of a Langevin +diffusion targeting a Moreau-Yosida approximation of the posterior distribution +of interest, and hence achieves a significantly lower bias than conventional +unadjusted Langevin strategies based on the Euler-Maruyama discretisation. For +targets that are $\kappa$-strongly log-concave, the provided non-asymptotic +convergence analysis also identifies the optimal time step which maximizes the +convergence speed. The proposed methodology is demonstrated through a range of +experiments related to image deconvolution with Gaussian and Poisson noise, +with assumption-driven and data-driven convex priors. + +
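For orientation, the overdamped Langevin diffusion and the implicit midpoint discretisation referred to above take the following standard form, written here with a fixed step size $\delta > 0$ (the paper's relaxed proximal-point formulation and step-size conditions are more general):

\mathrm{d}X_t = -\nabla U(X_t)\,\mathrm{d}t + \sqrt{2}\,\mathrm{d}W_t,
\qquad
X_{k+1} = X_k - \delta\, \nabla U\!\left(\tfrac{X_k + X_{k+1}}{2}\right) + \sqrt{2\delta}\, Z_{k+1},
\quad Z_{k+1} \sim \mathcal{N}(0, I),

where $U$ denotes the negative log-posterior; for non-smooth models, the Moreau-Yosida approximation mentioned in the abstract plays the role of $U$, which is what gives the update its (relaxed) proximal-point form.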
+
+ comment: 34 pages, 13 figures +
+
+
+
+
+ + ☆ Artificial-Spiking Hierarchical Networks for Vision-Language + Representation Learning + + +
+ With the success of self-supervised learning, multimodal foundation models +have rapidly been adapted to a wide range of downstream tasks driven by vision and +language (VL) pretraining. State-of-the-art methods achieve impressive +performance by pre-training on large-scale datasets. However, bridging the +semantic gap between the two modalities remains a non-negligible challenge for +VL tasks. In this work, we propose an efficient computation framework for +multimodal alignment by introducing a novel visual semantic module to further +improve the performance of VL tasks. Specifically, we propose a flexible +model, namely Artificial-Spiking Hierarchical Networks (ASH-Nets), which +combines the complementary advantages of Artificial neural networks (ANNs) and +Spiking neural networks (SNNs) to enrich visual semantic representations. In +particular, a visual concrete encoder and a semantic abstract encoder are +constructed to learn continuous and discrete latent variables to enhance the +flexibility of semantic encoding. Considering the spatio-temporal properties of +SNN modeling, we introduce a contrastive learning method to optimize the +inputs of similar samples. This improves the computational efficiency of the +hierarchical network, while the augmentation of hard samples is beneficial to +the learning of visual representations. Furthermore, the Spiking to Text +Uni-Alignment Learning (STUA) pre-training method is proposed, which only +relies on text features to enhance the encoding ability of abstract semantics. +We validate the performance on multiple well-established downstream VL tasks. +Experiments show that the proposed ASH-Nets achieve competitive results. + +
+
+
+
+
+ + ☆ From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the + Right Reasons in Latent Space + + +
+ Deep Neural Networks are prone to learning spurious correlations embedded in +the training data, leading to potentially biased predictions. This poses risks +when deploying these models for high-stake decision-making, such as in medical +applications. Current methods for post-hoc model correction either require +input-level annotations, which are only possible for spatially localized +biases, or augment the latent feature space, thereby hoping to enforce the +right reasons. We present a novel method ensuring the right reasons on the +concept level by reducing the model's sensitivity towards biases through the +gradient. When modeling biases via Concept Activation Vectors, we highlight the +importance of choosing robust directions, as traditional regression-based +approaches such as Support Vector Machines tend to result in diverging +directions. We effectively mitigate biases in controlled and real-world +settings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet +and EfficientNet architectures. + +
+
+
+
+
+ + ☆ Transformer-based Detection of Microorganisms on High-Resolution Petri + Dish Images ICCV + + +
+ Many medical or pharmaceutical processes have strict guidelines regarding +continuous hygiene monitoring. This often involves the labor-intensive task of +manually counting microorganisms in Petri dishes by trained personnel. +Automation attempts often struggle due to major challenges: significant scaling +differences, low separation, low contrast, etc. To address these challenges, we +introduce AttnPAFPN, a high-resolution detection pipeline that leverages a +novel transformer variation, the efficient-global self-attention mechanism. Our +streamlined approach can be easily integrated in almost any multi-scale object +detection pipeline. In a comprehensive evaluation on the publicly available +AGAR dataset, we demonstrate the superior accuracy of our network over the +current state-of-the-art. In order to demonstrate the task-independent +performance of our approach, we perform further experiments on COCO and +LIVECell datasets. + +
+
+ comment: This paper has been accepted at IEEE International Conference on + Computer Vision Workshops (ICCV workshop), 2023 +
+
+
+
+
+ + ☆ Can ultrasound confidence maps predict sonographers' labeling + variability? + + +
+ Measuring cross-sectional areas in ultrasound images is a standard tool to +evaluate disease progress or treatment response. Often addressed today with +supervised deep-learning segmentation approaches, existing solutions highly +depend upon the quality of experts' annotations. However, the annotation +quality in ultrasound is anisotropic and position-variant due to the inherent +physical imaging principles, including attenuation, shadows, and missing +boundaries, commonly exacerbated with depth. This work proposes a novel +approach that guides ultrasound segmentation networks to account for +sonographers' uncertainties and generate predictions with variability similar +to the experts. We claim that realistic variability can reduce overconfident +predictions and improve physicians' acceptance of deep-learning cross-sectional +segmentation solutions. Our method provides CM's certainty for each pixel for +minimal computational overhead as it can be precalculated directly from the +image. We show that there is a correlation between low values in the confidence +maps and expert's label uncertainty. Therefore, we propose to give the +confidence maps as additional information to the networks. We study the effect +of the proposed use of ultrasound CMs in combination with four state-of-the-art +neural networks and in two configurations: as a second input channel and as +part of the loss. We evaluate our method on 3D ultrasound datasets of the +thyroid and lower limb muscles. Our results show ultrasound CMs increase the +Dice score, improve the Hausdorff and Average Surface Distances, and decrease +the number of isolated pixel predictions. Furthermore, our findings suggest +that ultrasound CMs improve the penalization of uncertain areas in the ground +truth data, thereby improving problematic interpolations. Our code and example +data will be made public at +https://github.com/IFL-CAMP/Confidence-segmentation. + +
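A minimal sketch of the "second input channel" configuration described above, assuming the confidence map (CM) is precomputed from the image and that the segmentation backbone is a generic network factory; this is illustrative wiring, not the authors' code.

```python
import torch
import torch.nn as nn

class CMAsInputChannel(nn.Module):
    def __init__(self, seg_net_factory):
        super().__init__()
        # seg_net_factory builds any segmentation network that accepts 2 input
        # channels, e.g. a U-Net variant.
        self.net = seg_net_factory(in_channels=2)

    def forward(self, image, confidence_map):
        # image, confidence_map: (B, 1, H, W); the CM is precomputed per image.
        x = torch.cat([image, confidence_map], dim=1)
        return self.net(x)
```

The alternative configuration mentioned in the abstract, using the CM inside the loss, would instead weight the per-pixel loss by the confidence values.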
+
+
+
+
+ + ☆ Self-Supervised Single-Image Deconvolution with Siamese Neural Networks MICCAI 2023 + + +
+ Inverse problems in image reconstruction are fundamentally complicated by +unknown noise properties. Classical iterative deconvolution approaches amplify +noise and require careful parameter selection for an optimal trade-off between +sharpness and grain. Deep learning methods allow for flexible parametrization +of the noise and learning its properties directly from the data. Recently, +self-supervised blind-spot neural networks were successfully adopted for image +deconvolution by including a known point-spread function in the end-to-end +training. However, their practical application has been limited to 2D images in +the biomedical domain because it implies large kernels that are poorly +optimized. We tackle this problem with Fast Fourier Transform convolutions that +provide training speed-up in 3D microscopy deconvolution tasks. Further, we +propose to adopt a Siamese invariance loss for deconvolution and empirically +identify its optimal position in the neural network between blind-spot and full +image branches. The experimental results show that our improved framework +outperforms the previous state-of-the-art deconvolution methods with a known +point spread function. + +
+
+ comment: Accepted for DALI @ MICCAI 2023 +
+
+
+
+
+ + ☆ MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection ICCV 2023 + + +
+ In the field of monocular 3D detection, it is common practice to utilize +scene geometric clues to enhance the detector's performance. However, many +existing works adopt these clues explicitly such as estimating a depth map and +back-projecting it into 3D space. This explicit methodology induces sparsity in +3D representations due to the increased dimensionality from 2D to 3D, and leads +to substantial information loss, especially for distant and occluded objects. +To alleviate this issue, we propose MonoNeRD, a novel detection framework that +can infer dense 3D geometry and occupancy. Specifically, we model scenes with +Signed Distance Functions (SDF), facilitating the production of dense 3D +representations. We treat these representations as Neural Radiance Fields +(NeRF) and then employ volume rendering to recover RGB images and depth maps. +To the best of our knowledge, this work is the first to introduce volume +rendering for M3D, and demonstrates the potential of implicit reconstruction +for image-based 3D perception. Extensive experiments conducted on the KITTI-3D +benchmark and Waymo Open Dataset demonstrate the effectiveness of MonoNeRD. +Codes are available at https://github.com/cskkxjk/MonoNeRD. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Metadata Improves Segmentation Through Multitasking Elicitation MICCAI 2023 + + +
+ Metainformation is a common companion to biomedical images. However, this +potentially powerful additional source of signal from image acquisition has had +limited use in deep learning methods, for semantic segmentation in particular. +Here, we incorporate metadata by employing a channel modulation mechanism in +convolutional networks and study its effect on semantic segmentation tasks. We +demonstrate that metadata as additional input to a convolutional network can +improve segmentation results while being inexpensive in implementation as a +nimble add-on to popular models. We hypothesize that this benefit of metadata +can be attributed to facilitating multitask switching. This aspect of +metadata-driven systems is explored and discussed in detail. + +
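A sketch of metadata-driven channel modulation in a FiLM-like style, assumed here to be representative of the channel modulation mechanism mentioned above (the exact conditioning network and metadata encoding are assumptions).

```python
import torch
import torch.nn as nn

class MetadataModulation(nn.Module):
    def __init__(self, meta_dim, num_channels):
        super().__init__()
        # Small MLP mapping metadata to per-channel scale and shift parameters.
        self.mlp = nn.Sequential(
            nn.Linear(meta_dim, 64), nn.ReLU(),
            nn.Linear(64, 2 * num_channels),
        )

    def forward(self, feats, metadata):
        # feats: (B, C, H, W); metadata: (B, meta_dim), e.g. scanner or site id.
        gamma, beta = self.mlp(metadata).chunk(2, dim=1)
        return feats * (1 + gamma[..., None, None]) + beta[..., None, None]
```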
+
+ comment: Accepted for DART @ MICCAI 2023 +
+
+
+
+
+ + ☆ Generalizable Decision Boundaries: Dualistic Meta-Learning for Open Set + Domain Generalization ICCV2023 + + +
+ Domain generalization (DG) is proposed to deal with the issue of domain +shift, which occurs when statistical differences exist between source and +target domains. However, most current methods do not account for a common +realistic scenario where the source and target domains have different classes. +To overcome this deficiency, open set domain generalization (OSDG) then emerges +as a more practical setting to recognize unseen classes in unseen domains. An +intuitive approach is to use multiple one-vs-all classifiers to define decision +boundaries for each class and reject the outliers as unknown. However, the +significant class imbalance between positive and negative samples often causes +the boundaries biased towards positive ones, resulting in misclassification for +known samples in the unseen target domain. In this paper, we propose a novel +meta-learning-based framework called dualistic MEta-learning with joint +DomaIn-Class matching (MEDIC), which considers gradient matching towards +inter-domain and inter-class splits simultaneously to find a generalizable +boundary balanced for all tasks. Experimental results demonstrate that MEDIC +not only outperforms previous methods in open set scenarios, but also maintains +competitive close set generalization ability at the same time. Our code is +available at https://github.com/zzwdx/MEDIC. + +
+
+ comment: 10 pages, 5 figures, accepted by ICCV2023 +
+
+
+
+
+ + ☆ Diffusion Models for Image Restoration and Enhancement -- A + Comprehensive Survey + + +
+ Image restoration (IR) has been an indispensable and challenging task in the +low-level vision field, which strives to improve the subjective quality of +images distorted by various forms of degradation. Recently, the diffusion model +has achieved significant advancements in the visual generation of AIGC, thereby +raising an intuitive question, "whether diffusion model can boost image +restoration". To answer this, some pioneering studies attempt to integrate +diffusion models into the image restoration task, resulting in superior +performances than previous GAN-based methods. Despite that, a comprehensive and +enlightening survey on diffusion model-based image restoration remains scarce. +In this paper, we are the first to present a comprehensive review of recent +diffusion model-based methods on image restoration, encompassing the learning +paradigm, conditional strategy, framework design, modeling strategy, and +evaluation. Concretely, we first introduce the background of the diffusion +model briefly and then present two prevalent workflows that exploit diffusion +models in image restoration. Subsequently, we classify and emphasize the +innovative designs using diffusion models for both IR and blind/real-world IR, +intending to inspire future development. To evaluate existing methods +thoroughly, we summarize the commonly-used dataset, implementation details, and +evaluation metrics. Additionally, we present the objective comparison for +open-sourced methods across three tasks, including image super-resolution, +deblurring, and inpainting. Ultimately, informed by the limitations in existing +works, we propose five potential and challenging directions for the future +research of diffusion model-based IR, including sampling efficiency, model +compression, distortion simulation and estimation, distortion invariant +learning, and framework design. + +
+
+ comment: 34 pages +
+
+
+
+
+ + ☆ DReg-NeRF: Deep Registration for Neural Radiance Fields ICCV 2023 + + +
+ Although Neural Radiance Fields (NeRF) is popular in the computer vision +community recently, registering multiple NeRFs has yet to gain much attention. +Unlike the existing work, NeRF2NeRF, which is based on traditional optimization +methods and needs human annotated keypoints, we propose DReg-NeRF to solve the +NeRF registration problem on object-centric scenes without human intervention. +After training NeRF models, our DReg-NeRF first extracts features from the +occupancy grid in NeRF. Subsequently, our DReg-NeRF utilizes a transformer +architecture with self-attention and cross-attention layers to learn the +relations between pairwise NeRF blocks. In contrast to state-of-the-art (SOTA) +point cloud registration methods, the decoupled correspondences are supervised +by surface fields without any ground truth overlapping labels. We construct a +novel view synthesis dataset with 1,700+ 3D objects obtained from Objaverse to +train our network. When evaluated on the test set, our proposed method beats +the SOTA point cloud registration methods by a large margin, with a mean +$\text{RPE}=9.67^{\circ}$ and a mean $\text{RTE}=0.038$. + Our code is available at https://github.com/AIBluefisher/DReg-NeRF. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Label-Free Event-based Object Recognition via Joint Learning with Image + Reconstruction from Events ICCV 2023 + + +
+ Recognizing objects from sparse and noisy events becomes extremely difficult +when paired images and category labels do not exist. In this paper, we study +label-free event-based object recognition where category labels and paired +images are not available. To this end, we propose a joint formulation of object +recognition and image reconstruction in a complementary manner. Our method +first reconstructs images from events and performs object recognition through +Contrastive Language-Image Pre-training (CLIP), enabling better recognition +through a rich context of images. Since the category information is essential +in reconstructing images, we propose category-guided attraction loss and +category-agnostic repulsion loss to bridge the textual features of predicted +categories and the visual features of reconstructed images using CLIP. +Moreover, we introduce a reliable data sampling strategy and local-global +reconstruction consistency to boost joint learning of two tasks. To enhance the +accuracy of prediction and quality of reconstruction, we also propose a +prototype-based approach using unpaired images. Extensive experiments +demonstrate the superiority of our method and its extensibility for zero-shot +object recognition. Our project code is available at +\url{https://github.com/Chohoonhee/Ev-LaFOR}. + +
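A sketch of the CLIP-based recognition step applied to an image reconstructed from events (the event-to-image reconstruction network is omitted, and the category names are placeholders); this uses the public OpenAI CLIP API, not the authors' pipeline.

```python
import torch
import clip  # https://github.com/openai/CLIP

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def classify_reconstruction(reconstructed_pil_image, category_names):
    image = preprocess(reconstructed_pil_image).unsqueeze(0).to(device)
    text = clip.tokenize([f"a photo of a {c}" for c in category_names]).to(device)
    with torch.no_grad():
        image_feat = model.encode_image(image)
        text_feat = model.encode_text(text)
        image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
        text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
        probs = (100.0 * image_feat @ text_feat.T).softmax(dim=-1)
    return category_names[int(probs.argmax())]
```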
+
+ comment: Accepted to ICCV 2023 (Oral) +
+
+
+
+
+ + ☆ Deciphering knee osteoarthritis diagnostic features with explainable + artificial intelligence: A systematic review + + +
+ Existing artificial intelligence (AI) models for diagnosing knee +osteoarthritis (OA) have faced criticism for their lack of transparency and +interpretability, despite achieving medical-expert-like performance. This +opacity makes them challenging to trust in clinical practice. Recently, +explainable artificial intelligence (XAI) has emerged as a specialized +technique that can provide confidence in the model's prediction by revealing +how the prediction is derived, thus promoting the use of AI systems in +healthcare. This paper presents the first survey of XAI techniques used for +knee OA diagnosis. The XAI techniques are discussed from two perspectives: data +interpretability and model interpretability. The aim of this paper is to +provide valuable insights into XAI's potential towards a more reliable knee OA +diagnosis approach and encourage its adoption in clinical practice. + +
+
+
+
+
+ + ☆ Image Processing and Machine Learning for Hyperspectral Unmixing: An + Overview and the HySUPP Python Package + + +
+ Spectral pixels are often a mixture of the pure spectra of the materials, +called endmembers, due to the low spatial resolution of hyperspectral sensors, +double scattering, and intimate mixtures of materials in the scenes. Unmixing +estimates the fractional abundances of the endmembers within the pixel. +Depending on the prior knowledge of endmembers, linear unmixing can be divided +into three main groups: supervised, semi-supervised, and unsupervised (blind) +linear unmixing. Advances in Image processing and machine learning +substantially affected unmixing. This paper provides an overview of advanced +and conventional unmixing approaches. Additionally, we draw a critical +comparison between advanced and conventional techniques from the three +categories. We compare the performance of the unmixing techniques on three +simulated and two real datasets. The experimental results reveal the advantages +of different unmixing categories for different unmixing scenarios. Moreover, we +provide an open-source Python-based package available at +https://github.com/BehnoodRasti/HySUPP to reproduce the results. + +
+
+
+
+
+ + ☆ Which Transformer to Favor: A Comparative Analysis of Efficiency in + Vision Transformers + + +
+ The growing popularity of Vision Transformers as the go-to models for image +classification has led to an explosion of architectural modifications claiming +to be more efficient than the original ViT. However, a wide diversity of +experimental conditions prevents a fair comparison between all of them, based +solely on their reported results. To address this gap in comparability, we +conduct a comprehensive analysis of more than 30 models to evaluate the +efficiency of vision transformers and related architectures, considering +various performance metrics. Our benchmark provides a comparable baseline +across the landscape of efficiency-oriented transformers, unveiling a plethora +of surprising insights. For example, we discover that ViT is still Pareto +optimal across multiple efficiency metrics, despite the existence of several +alternative approaches claiming to be more efficient. Results also indicate +that hybrid attention-CNN models fare particularly well when it comes to low +inference memory and number of parameters, and also that it is better to scale +the model size, than the image size. Furthermore, we uncover a strong positive +correlation between the number of FLOPS and the training memory, which enables +the estimation of required VRAM from theoretical measurements alone. + Thanks to our holistic evaluation, this study offers valuable insights for +practitioners and researchers, facilitating informed decisions when selecting +models for specific applications. We publicly release our code and data at +https://github.com/tobna/WhatTransformerToFavor + +
+
+
+
+
+ + ☆ Single Frame Semantic Segmentation Using Multi-Modal Spherical Images WACV 2024 + + +
+ In recent years, the research community has shown great interest in +panoramic images, which offer a 360-degree directional perspective. To fully +realize their potential, multiple data modalities can be fed in and their +complementary characteristics utilized for more robust and rich scene +interpretation based on semantic segmentation. Existing research, however, has +mostly concentrated on pinhole RGB-X semantic segmentation. In this study, we propose a +transformer-based cross-modal fusion architecture to bridge the gap between +multi-modal fusion and omnidirectional scene perception. We employ +distortion-aware modules to address extreme object deformations and panorama +distortions that result from the equirectangular representation. Additionally, we +conduct cross-modal interactions for feature rectification and information +exchange before merging the features in order to communicate long-range +contexts for bi-modal and tri-modal feature streams. In thorough tests using +combinations of four different modality types on three indoor panoramic-view +datasets, our technique achieves state-of-the-art mIoU performance: 60.60% on +Stanford2D3DS (RGB-HHA), 71.97% on Structured3D (RGB-D-N), and 35.92% on Matterport3D +(RGB-D). We plan to release all code and trained models soon. + +
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ A tailored Handwritten-Text-Recognition System for Medieval Latin + + +
+ The Bavarian Academy of Sciences and Humanities aims to digitize its Medieval +Latin Dictionary. This dictionary entails record cards referring to lemmas in +medieval Latin, a low-resource language. A crucial step of the digitization +process is the Handwritten Text Recognition (HTR) of the handwritten lemmas +found on these record cards. In our work, we introduce an end-to-end pipeline, +tailored to the medieval Latin dictionary, for locating, extracting, and +transcribing the lemmas. We employ two state-of-the-art (SOTA) image +segmentation models to prepare the initial data set for the HTR task. +Furthermore, we experiment with different transformer-based models and conduct +a set of experiments to explore the capabilities of different combinations of +vision encoders with a GPT-2 decoder. Additionally, we also apply extensive +data augmentation resulting in a highly competitive model. The best-performing +setup achieved a Character Error Rate (CER) of 0.015, which is even superior to +the commercial Google Cloud Vision model, and shows more stable performance. + +
+
+ comment: This paper has been accepted at the First Workshop on Ancient + Language Processing, co-located with RANLP 2023. This is the author's version + of the work. The definite version of record will be published in the + proceedings +
+
+
+
+
+ + ☆ Overlap Bias Matching is Necessary for Point Cloud Registration + + +
+ Point cloud registration is a fundamental problem in many domains. +Practically, the overlap between point clouds to be registered may be +relatively small. Most unsupervised methods lack effective initial evaluation +of overlap, leading to suboptimal registration accuracy. To address this issue, +we propose an unsupervised network Overlap Bias Matching Network (OBMNet) for +partial point cloud registration. Specifically, we propose a plug-and-play +Overlap Bias Matching Module (OBMM) comprising two integral components, overlap +sampling module and bias prediction module. These two components are utilized +to capture the distribution of overlapping regions and predict bias +coefficients of point cloud common structures, respectively. Then, we integrate +OBMM with the neighbor map matching module to robustly identify correspondences +by precisely merging matching scores of points within the neighborhood, which +addresses the ambiguities in single-point features. OBMNet can maintain +efficacy even in pair-wise registration scenarios with low overlap ratios. +Experimental results on extensive datasets demonstrate that our approach's +performance achieves a significant improvement compared to the state-of-the-art +registration approach. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2202.11292 by other authors +
+
+
+
+
+ + ☆ Open-vocabulary Video Question Answering: A New Benchmark for Evaluating + the Generalizability of Video Question Answering Models ICCV 2023 + + +
+ Video Question Answering (VideoQA) is a challenging task that entails complex +multi-modal reasoning. In contrast to multiple-choice VideoQA which aims to +predict the answer given several options, the goal of open-ended VideoQA is to +answer questions without restricting candidate answers. However, the majority +of previous VideoQA models formulate open-ended VideoQA as a classification +task to classify the video-question pairs into a fixed answer set, i.e., +closed-vocabulary, which contains only frequent answers (e.g., top-1000 +answers). This leads the model to be biased toward only frequent answers and +fail to generalize on out-of-vocabulary answers. We hence propose a new +benchmark, Open-vocabulary Video Question Answering (OVQA), to measure the +generalizability of VideoQA models by considering rare and unseen answers. In +addition, in order to improve the model's generalization power, we introduce a +novel GNN-based soft verbalizer that enhances the prediction on rare and unseen +answers by aggregating the information from their similar words. For +evaluation, we introduce new baselines by modifying the existing +(closed-vocabulary) open-ended VideoQA models and improve their performances by +further taking into account rare and unseen answers. Our ablation studies and +qualitative analyses demonstrate that our GNN-based soft verbalizer further +improves the model performance, especially on rare and unseen answers. We hope +that our benchmark OVQA can serve as a guide for evaluating the +generalizability of VideoQA models and inspire future research. Code is +available at https://github.com/mlvlab/OVQA. + +
+
+ comment: Accepted paper at ICCV 2023 +
+
+
+
+
+ + ☆ Multi-scale Target-Aware Framework for Constrained Image Splicing + Detection and Localization + + +
+ Constrained image splicing detection and localization (CISDL) is a +fundamental task of multimedia forensics, which detects splicing operation +between two suspected images and localizes the spliced region on both images. +Recent works regard it as a deep matching problem and have made significant +progress. However, existing frameworks typically perform feature extraction and +correlation matching as separate processes, which may hinder the model's +ability to learn discriminative features for matching and can be susceptible to +interference from ambiguous background pixels. In this work, we propose a +multi-scale target-aware framework to couple feature extraction and correlation +matching in a unified pipeline. In contrast to previous methods, we design a +target-aware attention mechanism that jointly learns features and performs +correlation matching between the probe and donor images. Our approach can +effectively promote the collaborative learning of related patches, and perform +mutual promotion of feature learning and correlation matching. Additionally, in +order to handle scale transformations, we introduce a multi-scale projection +method, which can be readily integrated into our target-aware framework that +enables the attention process to be conducted between tokens containing +information of varying scales. Our experiments demonstrate that our model, +which uses a unified pipeline, outperforms state-of-the-art methods on several +benchmark datasets and is robust against scale transformations. + +
+
+
+
+
+ + ☆ RLIPv2: Fast Scaling of Relational Language-Image Pre-training ICCV 2023 + + +
+ Relational Language-Image Pre-training (RLIP) aims to align vision +representations with relational texts, thereby advancing the capability of +relational reasoning in computer vision tasks. However, hindered by the slow +convergence of RLIPv1 architecture and the limited availability of existing +scene graph data, scaling RLIPv1 is challenging. In this paper, we propose +RLIPv2, a fast converging model that enables the scaling of relational +pre-training to large-scale pseudo-labelled scene graph data. To enable fast +scaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism +that facilitates earlier and deeper gated cross-modal fusion with sparsified +language encoding layers. ALIF leads to comparable or better performance than +RLIPv1 in a fraction of the time for pre-training and fine-tuning. To obtain +scene graph data at scale, we extend object detection datasets with free-form +relation labels by introducing a captioner (e.g., BLIP) and a designed Relation +Tagger. The Relation Tagger assigns BLIP-generated relation texts to region +pairs, thus enabling larger-scale relational pre-training. Through extensive +experiments conducted on Human-Object Interaction Detection and Scene Graph +Generation, RLIPv2 shows state-of-the-art performance on three benchmarks under +fully-finetuning, few-shot and zero-shot settings. Notably, the largest RLIPv2 +achieves 23.29mAP on HICO-DET without any fine-tuning, yields 32.22mAP with +just 1% data and yields 45.09mAP with 100% data. Code and models are publicly +available at https://github.com/JacobYuan7/RLIPv2. + +
+
+ comment: Accepted to ICCV 2023. Code and models: + https://github.com/JacobYuan7/RLIPv2 +
+
+
+
+
+ + ☆ Boosting Few-shot Action Recognition with Graph-guided Hybrid Matching ICCV2023 + + +
+ Class prototype construction and matching are core aspects of few-shot action +recognition. Previous methods mainly focus on designing spatiotemporal relation +modeling modules or complex temporal alignment algorithms. Despite the +promising results, they ignored the value of class prototype construction and +matching, leading to unsatisfactory performance in recognizing similar +categories in every task. In this paper, we propose GgHM, a new framework with +Graph-guided Hybrid Matching. Concretely, we learn task-oriented features by +the guidance of a graph neural network during class prototype construction, +optimizing the intra- and inter-class feature correlation explicitly. Next, we +design a hybrid matching strategy, combining frame-level and tuple-level +matching to classify videos with multivariate styles. We additionally propose a +learnable dense temporal modeling module to enhance the video feature temporal +representation to build a more solid foundation for the matching process. GgHM +shows consistent improvements over other challenging baselines on several +few-shot datasets, demonstrating the effectiveness of our method. The code will +be publicly available at https://github.com/jiazheng-xing/GgHM. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ Denoising diffusion-based MR to CT image translation enables whole spine + vertebral segmentation in 2D and 3D without manual annotations + + +
+ Background: Automated segmentation of spinal MR images plays a vital role +both scientifically and clinically. However, accurately delineating posterior +spine structures presents challenges. + Methods: This retrospective study, approved by the ethical committee, +involved translating T1w and T2w MR image series into CT images in a total of +n=263 pairs of CT/MR series. Landmark-based registration was performed to align +image pairs. We compared 2D paired (Pix2Pix, denoising diffusion implicit +models (DDIM) image mode, DDIM noise mode) and unpaired (contrastive unpaired +translation, SynDiff) image-to-image translation using "peak signal to noise +ratio" (PSNR) as quality measure. A publicly available segmentation network +segmented the synthesized CT datasets, and Dice scores were evaluated on +in-house test sets and the "MRSpineSeg Challenge" volumes. The 2D findings were +extended to 3D Pix2Pix and DDIM. + Results: 2D paired methods and SynDiff exhibited similar translation +performance and Dice scores on paired data. DDIM image mode achieved the +highest image quality. SynDiff, Pix2Pix, and DDIM image mode demonstrated +similar Dice scores (0.77). For craniocaudal axis rotations, at least two +landmarks per vertebra were required for registration. The 3D translation +outperformed the 2D approach, resulting in improved Dice scores (0.80) and +anatomically accurate segmentations in a higher resolution than the original MR +image. + Conclusion: Two landmarks per vertebra registration enabled paired +image-to-image translation from MR to CT and outperformed all unpaired +approaches. The 3D techniques provided anatomically correct segmentations, +avoiding underprediction of small structures like the spinous process. + +
+
+ comment: 35 pages, 7 figures, Code and a model weights available + https://doi.org/10.5281/zenodo.8221159 and + https://doi.org/10.5281/zenodo.8198697 +
+
+
+
+
+ + ☆ Surprise machines: revealing Harvard Art Museums' image collection + + +
+ Surprise Machines is a project of experimental museology that sets out to +visualize the entire image collection of the Harvard Art Museums, intending to +open up unexpected vistas on more than 200,000 objects usually inaccessible to +visitors. Part of the exhibition Curatorial A(i)gents organized by metaLAB (at) +Harvard, the project explores the limits of artificial intelligence to display +a large set of images and create surprise among visitors. To achieve such a +feeling of surprise, a choreographic interface was designed to connect the +audience's movement with several unique views of the collection. + +
+
+ comment: 14 pages and 7 figures +
+
+
+
+
+ + ☆ LSCD: A Large-Scale Screen Content Dataset for Video Compression + + +
+ Multimedia compression allows us to watch videos, see pictures, and hear +sounds within a limited bandwidth, and has helped the internet flourish. +Over the past decades, multimedia compression has achieved great success +using hand-crafted features and systems. With the development of artificial +intelligence and video compression, a growing body of research applies neural +networks to video compression in order to replace these complicated +hand-designed systems. Beyond producing advanced algorithms, researchers have +also extended compression to different content types, such as User +Generated Content (UGC). With the rapid development of mobile devices, screen +content videos have become an important part of multimedia data. However, we +find that the community lacks a large-scale dataset for screen content video +compression, which impedes the fast development of the corresponding +learning-based algorithms. To fill this gap and accelerate research on this +special type of video, we propose the Large-scale Screen +Content Dataset (LSCD), which contains 714 source sequences. We also +provide an analysis of the proposed dataset to highlight characteristics of +screen content videos, which will help researchers better understand how +to explore new algorithms. Besides collecting and post-processing the data to +organize the dataset, we also provide a benchmark containing the performance of +both traditional codecs and learning-based methods. + +
+
+
+
+
+ + ☆ SAMedOCT: Adapting Segment Anything Model (SAM) for Retinal OCT + + +
+ The Segment Anything Model (SAM) has gained significant attention in the +field of image segmentation due to its impressive capabilities and prompt-based +interface. While SAM has already been extensively evaluated in various domains, +its adaptation to retinal OCT scans remains unexplored. To bridge this research +gap, we conduct a comprehensive evaluation of SAM and its adaptations on a +large-scale public dataset of OCTs from RETOUCH challenge. Our evaluation +covers diverse retinal diseases, fluid compartments, and device vendors, +comparing SAM against state-of-the-art retinal fluid segmentation methods. +Through our analysis, we showcase adapted SAM's efficacy as a powerful +segmentation model in retinal OCT scans, although still lagging behind +established methods in some circumstances. The findings highlight SAM's +adaptability and robustness, showcasing its utility as a valuable tool in +retinal OCT image analysis and paving the way for further advancements in this +domain. + +
+
+
+
+
+ + ☆ Unlimited Knowledge Distillation for Action Recognition in the Dark + + +
+ Dark videos often lose essential information, so the knowledge +learned by networks is not sufficient to accurately recognize actions. Existing +knowledge assembling methods require massive GPU memory to distill the +knowledge from multiple teacher models into a student model. In action +recognition, this drawback becomes serious because of the heavy computation +required by video processing. Constrained by limited computational resources, these approaches are +infeasible. To address this issue, we propose unlimited knowledge +distillation (UKD) in this paper. Compared with existing knowledge assembling +methods, our UKD can effectively assemble different knowledge without +introducing high GPU memory consumption, so the number of teacher models +used for distillation is unlimited. With our UKD, the network's learned knowledge +can be remarkably enriched. Our experiments show that a single-stream network +distilled with UKD even surpasses a two-stream network. Extensive +experiments are conducted on the ARID dataset. + +
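One plausible way to keep GPU memory bounded when distilling from many teachers is to load a single teacher at a time, cache its soft targets, and distill offline; this is an assumption for illustration and not necessarily how UKD itself works.

```python
import torch
import torch.nn.functional as F

def cache_soft_targets(teacher, loader, device, T=4.0):
    """Run one teacher over the dataset, store its softened predictions, free GPU."""
    teacher.eval().to(device)
    cached = []
    with torch.no_grad():
        for clips, _ in loader:
            cached.append(F.softmax(teacher(clips.to(device)) / T, dim=1).cpu())
    teacher.cpu()  # release GPU memory before loading the next teacher
    return torch.cat(cached)

def distillation_loss(student_logits, soft_targets, labels, T=4.0, alpha=0.7):
    # Standard Hinton-style KD combining soft-target KL and hard-label CE.
    kd = F.kl_div(F.log_softmax(student_logits / T, dim=1),
                  soft_targets, reduction="batchmean") * (T * T)
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + (1 - alpha) * ce
```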
+
+
+
+
+ + ☆ Audio-Visual Glance Network for Efficient Video Recognition ICCV 2023 + + +
+ Deep learning has made significant strides in video understanding tasks, but +the computation required to classify lengthy and massive videos using +clip-level video classifiers remains impractical and prohibitively expensive. +To address this issue, we propose Audio-Visual Glance Network (AVGN), which +leverages the commonly available audio and visual modalities to efficiently +process the spatio-temporally important parts of a video. AVGN first divides +the video into snippets of image-audio clip pairs and employs lightweight +unimodal encoders to extract global visual features and audio features. To +identify the important temporal segments, we use an Audio-Visual Temporal +Saliency Transformer (AV-TeST) that estimates the saliency scores of each +frame. To further increase efficiency in the spatial dimension, AVGN processes +only the important patches instead of the whole images. We use an +Audio-Enhanced Spatial Patch Attention (AESPA) module to produce a set of +enhanced coarse visual features, which are fed to a policy network that +produces the coordinates of the important patches. This approach enables us to +focus only on the most important spatio-temporal parts of the video, leading +to more efficient video recognition. Moreover, we incorporate various training +techniques and multi-modal feature fusion to enhance the robustness and +effectiveness of our AVGN. By combining these strategies, our AVGN achieves new +state-of-the-art performance in multiple video recognition benchmarks while +achieving faster processing speed. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Retro-FPN: Retrospective Feature Pyramid Network for Point Cloud + Semantic Segmentation ICCV 2023 + + +
+ Learning per-point semantic features from the hierarchical feature pyramid is +essential for point cloud semantic segmentation. However, most previous methods +suffered from ambiguous region features or failed to refine per-point features +effectively, which leads to information loss and ambiguous semantic +identification. To resolve this, we propose Retro-FPN to model the per-point +feature prediction as an explicit and retrospective refining process, which +goes through all the pyramid layers to extract semantic features explicitly for +each point. Its key novelty is a retro-transformer for summarizing semantic +contexts from the previous layer and accordingly refining the features in the +current stage. In this way, the categorization of each point is conditioned on +its local semantic pattern. Specifically, the retro-transformer consists of a +local cross-attention block and a semantic gate unit. The cross-attention +serves to summarize the semantic pattern retrospectively from the previous +layer. And the gate unit carefully incorporates the summarized contexts and +refines the current semantic features. Retro-FPN is a pluggable neural network +that applies to hierarchical decoders. By integrating Retro-FPN with three +representative backbones, including both point-based and voxel-based methods, +we show that Retro-FPN can significantly improve performance over +state-of-the-art backbones. Comprehensive experiments on widely used benchmarks +can justify the effectiveness of our design. The source is available at +https://github.com/AllenXiangX/Retro-FPN + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Lip Reading for Low-resource Languages by Learning and Combining General + Speech Knowledge and Language-specific Knowledge ICCV 2023 + + +
+ This paper proposes a novel lip reading framework, especially for +low-resource languages, which has not been well addressed in the previous +literature. Since low-resource languages do not have enough video-text paired +data to train the model to have sufficient power to model lip movements and +language, it is regarded as challenging to develop lip reading models for +low-resource languages. In order to mitigate the challenge, we try to learn +general speech knowledge, the ability to model lip movements, from a +high-resource language through the prediction of speech units. It is known that +different languages partially share common phonemes, thus general speech +knowledge learned from one language can be extended to other languages. Then, +we try to learn language-specific knowledge, the ability to model language, by +proposing Language-specific Memory-augmented Decoder (LMDecoder). LMDecoder +saves language-specific audio features into memory banks and can be trained on +audio-text paired data which is more easily accessible than video-text paired +data. Therefore, with LMDecoder, we can transform the input speech units into +language-specific audio features and translate them into texts by utilizing the +learned rich language knowledge. Finally, by combining general speech knowledge +and language-specific knowledge, we can efficiently develop lip reading models +even for low-resource languages. Through extensive experiments using five +languages, English, Spanish, French, Italian, and Portuguese, the effectiveness +of the proposed method is evaluated. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Rethinking Image Forgery Detection via Contrastive Learning and + Unsupervised Clustering + + +
+ Image forgery detection aims to detect and locate forged regions in an image. +Most existing forgery detection algorithms formulate classification problems to +classify pixels into forged or pristine. However, the definition of forged and +pristine pixels is only relative within one single image, e.g., a forged region +in image A is actually a pristine one in its source image B (splicing forgery). +Such a relative definition has been severely overlooked by existing methods, +which unnecessarily mix forged (pristine) regions across different images into +the same category. To resolve this dilemma, we propose the FOrensic ContrAstive +cLustering (FOCAL) method, a novel, simple yet very effective paradigm based on +contrastive learning and unsupervised clustering for the image forgery +detection. Specifically, FOCAL 1) utilizes pixel-level contrastive learning to +supervise the high-level forensic feature extraction in an image-by-image +manner, explicitly reflecting the above relative definition; 2) employs an +on-the-fly unsupervised clustering algorithm (instead of a trained one) to +cluster the learned features into forged/pristine categories, further +suppressing the cross-image influence from training data; and 3) allows to +further boost the detection performance via simple feature-level concatenation +without the need of retraining. Extensive experimental results over six public +testing datasets demonstrate that our proposed FOCAL significantly outperforms +the state-of-the-art competing algorithms by big margins: +24.3% on Coverage, ++18.6% on Columbia, +17.5% on FF++, +14.2% on MISD, +13.5% on CASIA and +10.3% +on NIST in terms of IoU. The paradigm of FOCAL could bring fresh insights and +serve as a novel benchmark for the image forgery detection task. The code is +available at https://github.com/HighwayWu/FOCAL. + +
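A sketch of the per-image clustering stage: cluster the per-pixel forensic features of one image into two groups (forged vs. pristine). A simple two-way k-means-like loop is assumed here for illustration; the paper's on-the-fly clustering algorithm may differ in its details.

```python
import torch

def cluster_pixels_two_way(features, n_iter=20):
    """features: (H*W, D) forensic features of a single image.
    Returns 0/1 cluster assignments; which cluster is 'forged' is decided per image."""
    # Initialize two centroids from two random feature vectors.
    c = features[torch.randperm(len(features))[:2]].clone()
    for _ in range(n_iter):
        assign = torch.cdist(features, c).argmin(dim=1)  # (H*W,)
        for k in range(2):
            if (assign == k).any():
                c[k] = features[assign == k].mean(dim=0)
    return assign
```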
+
+
+
+
+ + ☆ DiffDis: Empowering Generative Diffusion Model with Cross-Modal + Discrimination Capability ICCV2023 + + +
+ Recently, large-scale diffusion models, e.g., Stable diffusion and DallE2, +have shown remarkable results on image synthesis. On the other hand, +large-scale cross-modal pre-trained models (e.g., CLIP, ALIGN, and FILIP) are +competent for various downstream tasks by learning to align vision and language +embeddings. In this paper, we explore the possibility of jointly modeling +generation and discrimination. Specifically, we propose DiffDis to unify the +cross-modal generative and discriminative pretraining into one single framework +under the diffusion process. DiffDis first formulates the image-text +discriminative problem as a generative diffusion process of the text embedding +from the text encoder conditioned on the image. Then, we propose a novel +dual-stream network architecture, which fuses the noisy text embedding with the +knowledge of latent images from different scales for image-text discriminative +learning. Moreover, the generative and discriminative tasks can efficiently +share the image-branch network structure in the multi-modality model. +Benefiting from diffusion-based unified training, DiffDis achieves both better +generation ability and cross-modal semantic alignment in one architecture. +Experimental results show that DiffDis outperforms single-task models on both +the image generation and the image-text discriminative tasks, e.g., 1.65% +improvement on average accuracy of zero-shot classification over 12 datasets +and 2.42 improvement on FID of zero-shot image synthesis. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ☆ Human Part-wise 3D Motion Context Learning for Sign Language Recognition ICCV 2023 + + +
+ In this paper, we propose P3D, the human part-wise motion context learning +framework for sign language recognition. Our main contributions lie in two +dimensions: learning the part-wise motion context and employing the pose +ensemble to utilize 2D and 3D pose jointly. First, our empirical observation +implies that part-wise context encoding benefits the performance of sign +language recognition. While previous methods of sign language recognition +learned motion context from the sequence of the entire pose, we argue that such +methods cannot exploit part-specific motion context. In order to utilize +part-wise motion context, we propose the alternating combination of a part-wise +encoding Transformer (PET) and a whole-body encoding Transformer (WET). PET +encodes the motion contexts from a part sequence, while WET merges them into a +unified context. By learning part-wise motion context, our P3D achieves +superior performance on WLASL compared to previous state-of-the-art methods. +Second, our framework is the first to ensemble 2D and 3D poses for sign +language recognition. Since the 3D pose holds rich motion context and depth +information to distinguish the words, our P3D outperformed the previous +state-of-the-art methods employing a pose ensemble. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Online Class Incremental Learning on Stochastic Blurry Task Boundary via + Mask and Visual Prompt Tuning + + +
+ Continual learning aims to learn a model from a continuous stream of data, +but it mainly assumes a fixed number of data and tasks with clear task +boundaries. However, in real-world scenarios, the number of input data and +tasks is constantly changing in a statistical way, not a static way. Although +recently introduced incremental learning scenarios having blurry task +boundaries somewhat address the above issues, they still do not fully reflect +the statistical properties of real-world situations because of the fixed ratio +of disjoint and blurry samples. In this paper, we propose a new Stochastic +incremental Blurry task boundary scenario, called Si-Blurry, which reflects the +stochastic properties of the real-world. We find that there are two major +challenges in the Si-Blurry scenario: (1) inter- and intra-task forgettings and +(2) class imbalance problem. To alleviate them, we introduce Mask and Visual +Prompt tuning (MVP). In MVP, to address the inter- and intra-task forgetting +issues, we propose a novel instance-wise logit masking and contrastive visual +prompt tuning loss. Both of them help our model discern the classes to be +learned in the current batch. It results in consolidating the previous +knowledge. In addition, to alleviate the class imbalance problem, we introduce +a new gradient similarity-based focal loss and adaptive feature scaling to ease +overfitting to the major classes and underfitting to the minor classes. +Extensive experiments show that our proposed MVP significantly outperforms the +existing state-of-the-art methods in our challenging Si-Blurry scenario. + +
+
+
+
+
+ + ☆ V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by + Connecting Foundation Models + + +
+ Building artificial intelligence (AI) systems on top of a set of foundation +models (FMs) is becoming a new paradigm in AI research. Their representative +and generative abilities learnt from vast amounts of data can be easily adapted +and transferred to a wide range of downstream tasks without extra training from +scratch. However, leveraging FMs in cross-modal generation remains +under-researched when audio modality is involved. On the other hand, +automatically generating semantically-relevant sound from visual input is an +important problem in cross-modal generation studies. To solve this +vision-to-audio (V2A) generation problem, existing methods tend to design and +build complex systems from scratch using modestly sized datasets. In this +paper, we propose a lightweight solution to this problem by leveraging +foundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate +the domain gap between the latent space of the visual CLIP and the auditory +CLAP models. Then we propose a simple yet effective mapper mechanism +(V2A-Mapper) to bridge the domain gap by translating the visual input between +CLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained +audio generative FM AudioLDM is adopted to produce high-fidelity and +visually-aligned sound. Compared to previous approaches, our method only +requires a quick training of the V2A-Mapper. We further analyze and conduct +extensive experiments on the choice of the V2A-Mapper and show that a +generative mapper is better at fidelity and variability (FD) while a regression +mapper is slightly better at relevance (CS). Both objective and subjective +evaluation on two V2A datasets demonstrate the superiority of our proposed +method compared to current state-of-the-art approaches - trained with 86% fewer +parameters but achieving 53% and 19% improvement in FD and CS, respectively. + +
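A sketch of a regression-style mapper between the CLIP and CLAP embedding spaces; the dimensions and the small MLP architecture are assumptions, and the generative variant of the mapper discussed in the abstract is not shown.

```python
import torch
import torch.nn as nn

class V2AMapper(nn.Module):
    def __init__(self, clip_dim=512, clap_dim=512, hidden=1024):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(clip_dim, hidden), nn.GELU(),
            nn.Linear(hidden, hidden), nn.GELU(),
            nn.Linear(hidden, clap_dim),
        )

    def forward(self, clip_embedding):
        return self.net(clip_embedding)

# Training sketch: regress CLAP audio embeddings from CLIP image embeddings of
# paired frames; at inference, the predicted CLAP embedding conditions AudioLDM.
def mapper_loss(mapper, clip_emb, clap_emb):
    return nn.functional.mse_loss(mapper(clip_emb), clap_emb)
```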
+
+ comment: 13 pages, 10 figures. Code, demo, and samples: + https://v2a-mapper.github.io/ +
+
+
+
+
+ + ☆ Inferior Alveolar Nerve Segmentation in CBCT images using + Connectivity-Based Selective Re-training + + +
+ Inferior Alveolar Nerve (IAN) canal detection in CBCT is an important step in +many dental and maxillofacial surgery applications to prevent irreversible +damage to the nerve during the procedure. The ToothFairy2023 Challenge aims to +establish a 3D maxillofacial dataset consisting of sparse labels for all cases +and dense labels for part of them, and to improve automatic IAN segmentation. In +this work, in order to avoid the negative impact of sparse labeling, we +transform the mixed-supervision problem into a semi-supervised problem. Inspired +by self-training via pseudo labeling, we propose a selective re-training +framework based on IAN connectivity. Our method is quantitatively evaluated on +the ToothFairy verification cases, achieving a dice similarity coefficient +(DSC) of 0.7956 and a 95% Hausdorff distance (HD95) of 4.4905, and winning +first place in the competition. Code is available at +https://github.com/GaryNico517/SSL-IAN-Retraining. + +
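A sketch of connectivity-based filtering of IAN pseudo-labels, as an assumed simplification of the selective re-training idea: keep only the largest connected components and discard small disconnected fragments before the next self-training round. The thresholds are placeholders.

```python
import numpy as np
from scipy import ndimage

def filter_pseudo_label(pseudo_mask, keep_top=2, min_voxels=100):
    """pseudo_mask: binary 3D array of IAN pseudo-labels."""
    labeled, num = ndimage.label(pseudo_mask.astype(np.uint8))
    if num == 0:
        return pseudo_mask
    # Component sizes for IDs 1..num.
    sizes = ndimage.sum(pseudo_mask, labeled, index=range(1, num + 1))
    keep_ids = np.argsort(sizes)[::-1][:keep_top] + 1
    keep_ids = [i for i in keep_ids if sizes[i - 1] >= min_voxels]
    cleaned = np.isin(labeled, keep_ids)
    return cleaned.astype(pseudo_mask.dtype)
```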
+
+ comment: technical paper for Miccai ToothFairy2023 Challenge +
+
+
+
+
+ + ☆ NAPA-VQ: Neighborhood Aware Prototype Augmentation with Vector + Quantization for Continual Learning ICCV 2023 + + +
+ Catastrophic forgetting; the loss of old knowledge upon acquiring new +knowledge, is a pitfall faced by deep neural networks in real-world +applications. Many prevailing solutions to this problem rely on storing +exemplars (previously encountered data), which may not be feasible in +applications with memory limitations or privacy constraints. Therefore, the +recent focus has been on Non-Exemplar based Class Incremental Learning (NECIL) +where a model incrementally learns about new classes without using any past +exemplars. However, due to the lack of old data, NECIL methods struggle to +discriminate between old and new classes causing their feature representations +to overlap. We propose NAPA-VQ: Neighborhood Aware Prototype Augmentation with +Vector Quantization, a framework that reduces this class overlap in NECIL. We +draw inspiration from Neural Gas to learn the topological relationships in the +feature space, identifying the neighboring classes that are most likely to get +confused with each other. This neighborhood information is utilized to enforce +strong separation between the neighboring classes as well as to generate old +class representative prototypes that can better aid in obtaining a +discriminative decision boundary between old and new classes. Our comprehensive +experiments on CIFAR-100, TinyImageNet, and ImageNet-Subset demonstrate that +NAPA-VQ outperforms the State-of-the-art NECIL methods by an average +improvement of 5%, 2%, and 4% in accuracy and 10%, 3%, and 9% in forgetting +respectively. Our code can be found in https://github.com/TamashaM/NAPA-VQ.git. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Self-Calibrated Cross Attention Network for Few-Shot Segmentation ICCV'23 + + +
+ The key to the success of few-shot segmentation (FSS) lies in how to +effectively utilize support samples. Most solutions compress support foreground +(FG) features into prototypes, but lose some spatial details. Instead, others +use cross attention to fuse query features with uncompressed support FG. Query +FG could be fused with support FG, however, query background (BG) cannot find +matched BG features in support FG, yet inevitably integrates dissimilar +features. Besides, as both query FG and BG are combined with support FG, they +get entangled, thereby leading to ineffective segmentation. To cope with these +issues, we design a self-calibrated cross attention (SCCA) block. For efficient +patch-based attention, query and support features are firstly split into +patches. Then, we design a patch alignment module to align each query patch +with its most similar support patch for better cross attention. Specifically, +SCCA takes a query patch as Q, and groups the patches from the same query image +and the aligned patches from the support image as K&V. In this way, the query +BG features are fused with matched BG features (from query patches), and thus +the aforementioned issues will be mitigated. Moreover, when calculating SCCA, +we design a scaled-cosine mechanism to better utilize the support features for +similarity calculation. Extensive experiments conducted on PASCAL-5^i and +COCO-20^i demonstrate the superiority of our model, e.g., the mIoU score under +5-shot setting on COCO-20^i is 5.6%+ better than previous state-of-the-arts. +The code is available at https://github.com/Sam1224/SCCAN. + +
+
+ comment: This paper is accepted by ICCV'23 +
+
+
+
+
+ + ☆ RFDforFin: Robust Deep Forgery Detection for GAN-generated Fingerprint + Images + + +
+ With the rapid development of image generation technologies, malicious abuse +of GAN-generated fingerprint images poses a significant threat to public +safety in certain circumstances. Although existing universal deep forgery +detection approaches can be applied to detect fake fingerprint images, they are +easily attacked and have poor robustness. Meanwhile, there is no deep forgery +detection method specifically designed for fingerprint images. In this paper, +we propose, to the best of our knowledge, the first deep forgery detection +approach for fingerprint images, which combines unique ridge features of +fingerprints with generation artifacts of GAN-generated images. Specifically, +we first construct a ridge stream, which exploits the grayscale variations +along the ridges to extract unique fingerprint-specific features. Then, we +construct a generation artifact stream, in which FFT-based spectra of the input +fingerprint images are exploited to extract more robust generation artifact +features. Finally, the unique ridge features and generation artifact features +are fused for binary classification (i.e., real or fake). Comprehensive +experiments demonstrate that our proposed approach is effective and robust with +low complexity. + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Diverse Cotraining Makes Strong Semi-Supervised Segmentor ICCV2023 + + +
+ Deep co-training has been introduced to semi-supervised segmentation and
+achieves impressive results, yet few studies have explored the working
+mechanism behind it. In this work, we revisit the core assumption that supports
+co-training: multiple compatible and conditionally independent views. By
+theoretically deriving the generalization upper bound, we prove that the
+prediction similarity between two models negatively impacts the model's
+generalization ability. However, most current co-training models are tightly
+coupled together and violate this assumption. Such coupling leads to the
+homogenization of networks and confirmation bias, which consequently limits the
+performance. To this end, we explore different dimensions of co-training and
+systematically increase the diversity from the aspects of input domains,
+different augmentations and model architectures to counteract homogenization.
+Our Diverse Co-training outperforms the state-of-the-art (SOTA) methods by a
+large margin across different evaluation protocols on the Pascal and Cityscapes
+benchmarks. For example, we achieve the best mIoU of 76.2%, 77.7% and 80.2% on
+Pascal with only 92, 183 and 366 labeled images, surpassing the previous best
+results by more than 5%.
+
+
+ comment: ICCV2023, Camera Ready Version, Code: + \url{https://github.com/williamium3000/diverse-cotraining} +
+
+
+
+
+ + ☆ DiffLLE: Diffusion-guided Domain Calibration for Unsupervised Low-light + Image Enhancement + + +
+ Existing unsupervised low-light image enhancement methods lack enough +effectiveness and generalization in practical applications. We suppose this is +because of the absence of explicit supervision and the inherent gap between +real-world scenarios and the training data domain. In this paper, we develop +Diffusion-based domain calibration to realize more robust and effective +unsupervised Low-Light Enhancement, called DiffLLE. Since the diffusion model +performs impressive denoising capability and has been trained on massive clean +images, we adopt it to bridge the gap between the real low-light domain and +training degradation domain, while providing efficient priors of real-world +content for unsupervised models. Specifically, we adopt a naive unsupervised +enhancement algorithm to realize preliminary restoration and design two +zero-shot plug-and-play modules based on diffusion model to improve +generalization and effectiveness. The Diffusion-guided Degradation Calibration +(DDC) module narrows the gap between real-world and training low-light +degradation through diffusion-based domain calibration and a lightness +enhancement curve, which makes the enhancement model perform robustly even in +sophisticated wild degradation. Due to the limited enhancement effect of the +unsupervised model, we further develop the Fine-grained Target domain +Distillation (FTD) module to find a more visual-friendly solution space. It +exploits the priors of the pre-trained diffusion model to generate +pseudo-references, which shrinks the preliminary restored results from a coarse +normal-light domain to a finer high-quality clean field, addressing the lack of +strong explicit supervision for unsupervised methods. Benefiting from these, +our approach even outperforms some supervised methods by using only a simple +unsupervised baseline. Extensive experiments demonstrate the superior +effectiveness of the proposed DiffLLE. + +
+
+
+
+
+ + ☆ MATLABER: Material-Aware Text-to-3D via LAtent BRDF auto-EncodeR + + +
+ Based on powerful text-to-image diffusion models, text-to-3D generation has +made significant progress in generating compelling geometry and appearance. +However, existing methods still struggle to recover high-fidelity object +materials, either only considering Lambertian reflectance, or failing to +disentangle BRDF materials from the environment lights. In this work, we +propose Material-Aware Text-to-3D via LAtent BRDF auto-EncodeR +(\textbf{MATLABER}) that leverages a novel latent BRDF auto-encoder for +material generation. We train this auto-encoder with large-scale real-world +BRDF collections and ensure the smoothness of its latent space, which +implicitly acts as a natural distribution of materials. During appearance +modeling in text-to-3D generation, the latent BRDF embeddings, rather than BRDF +parameters, are predicted via a material network. Through exhaustive +experiments, our approach demonstrates the superiority over existing ones in +generating realistic and coherent object materials. Moreover, high-quality +materials naturally enable multiple downstream tasks such as relighting and +material editing. Code and model will be publicly available at +\url{https://sheldontsui.github.io/projects/Matlaber}. + +
+
+
+
+
+ + ☆ Progression-Guided Temporal Action Detection in Videos + + +
+ We present a novel framework, Action Progression Network (APN), for temporal
+action detection (TAD) in videos. The framework locates actions in videos by
+detecting the action evolution process. To encode the action evolution, we
+quantify a complete action process into 101 ordered stages (0\%, 1\%, ...,
+100\%), referred to as action progressions. We then train a neural network to
+recognize the action progressions. The framework detects action boundaries by
+detecting complete action processes in the videos, e.g., a video segment whose
+detected action progressions closely follow the sequence 0\%, 1\%, ..., 100\%.
+The framework offers three major advantages: (1) Our neural networks are
+trained end-to-end, contrasting conventional methods that optimize modules
+separately; (2) The APN is trained using action frames exclusively, enabling
+models to be trained on action classification datasets and to be robust to
+videos with temporal background styles differing from those in training; (3)
+Our framework effectively avoids detecting incomplete actions and excels in
+detecting long-lasting actions due to the fine-grained and explicit encoding of
+the temporal structure of actions. Leveraging these advantages, the APN
+achieves competitive performance and significantly surpasses its counterparts
+in detecting long-lasting actions. With an IoU threshold of 0.5, the APN
+achieves a mean Average Precision (mAP) of 58.3\% on the THUMOS14 dataset and
+98.9\% mAP on the DFMAD70 dataset.
+
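To make the progression encoding concrete, here is a small illustrative sketch (assumptions, not the APN code): frames inside an annotated action instance are mapped to the 101 ordered progression labels, and a detected segment can be checked for completeness by requiring its predicted progressions to rise roughly from 0% to 100%. The tolerance value is illustrative.

import numpy as np

def progression_labels(start: int, end: int) -> np.ndarray:
    """Map every frame of an action instance [start, end] to one of the
    101 ordered stages 0..100 (i.e., 0%, 1%, ..., 100%)."""
    n = end - start + 1
    return np.round(np.linspace(0, 100, n)).astype(int)

def looks_complete(pred_progressions: np.ndarray, tol: float = 15.0) -> bool:
    """Heuristic completeness check: keep a segment only if its predicted
    progressions start near 0%, end near 100%, and increase on average."""
    starts_low = pred_progressions[0] <= tol
    ends_high = pred_progressions[-1] >= 100 - tol
    mostly_increasing = np.mean(np.diff(pred_progressions) >= 0) > 0.5
    return bool(starts_low and ends_high and mostly_increasing)

print(progression_labels(10, 20))                      # 11 labels from 0 to 100
print(looks_complete(np.array([2, 15, 40, 70, 97])))   # True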
+
+ comment: Under Review. Code available at https://github.com/makecent/APN +
+
+
+
+
+ + ☆ Point Contrastive Prediction with Semantic Clustering for + Self-Supervised Learning on Point Cloud Videos ICCV 2023 + + +
+ We propose a unified point cloud video self-supervised learning framework for +object-centric and scene-centric data. Previous methods commonly conduct +representation learning at the clip or frame level and cannot well capture +fine-grained semantics. Instead of contrasting the representations of clips or +frames, in this paper, we propose a unified self-supervised framework by +conducting contrastive learning at the point level. Moreover, we introduce a +new pretext task by achieving semantic alignment of superpoints, which further +facilitates the representations to capture semantic cues at multiple scales. In +addition, due to the high redundancy in the temporal dimension of dynamic point +clouds, directly conducting contrastive learning at the point level usually +leads to massive undesired negatives and insufficient modeling of positive +representations. To remedy this, we propose a selection strategy to retain +proper negatives and make use of high-similarity samples from other instances +as positive supplements. Extensive experiments show that our method outperforms +supervised counterparts on a wide range of downstream tasks and demonstrates +the superior transferability of the learned representations. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Masked Spatio-Temporal Structure Prediction for Self-supervised Learning + on Point Cloud Videos ICCV 2023 + + +
+ Recently, the community has made tremendous progress in developing effective +methods for point cloud video understanding that learn from massive amounts of +labeled data. However, annotating point cloud videos is usually notoriously +expensive. Moreover, training via one or only a few traditional tasks (e.g., +classification) may be insufficient to learn subtle details of the +spatio-temporal structure existing in point cloud videos. In this paper, we +propose a Masked Spatio-Temporal Structure Prediction (MaST-Pre) method to +capture the structure of point cloud videos without human annotations. MaST-Pre +is based on spatio-temporal point-tube masking and consists of two +self-supervised learning tasks. First, by reconstructing masked point tubes, +our method is able to capture the appearance information of point cloud videos. +Second, to learn motion, we propose a temporal cardinality difference +prediction task that estimates the change in the number of points within a +point tube. In this way, MaST-Pre is forced to model the spatial and temporal +structure in point cloud videos. Extensive experiments on MSRAction-3D, +NTU-RGBD, NvGesture, and SHREC'17 demonstrate the effectiveness of the proposed +method. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SparseBEV: High-Performance Sparse 3D Object Detection from Multi-Camera + Videos ICCV 2023 + + +
+ Camera-based 3D object detection in BEV (Bird's Eye View) space has drawn +great attention over the past few years. Dense detectors typically follow a +two-stage pipeline by first constructing a dense BEV feature and then +performing object detection in BEV space, which suffers from complex view +transformations and high computation cost. On the other side, sparse detectors +follow a query-based paradigm without explicit dense BEV feature construction, +but achieve worse performance than the dense counterparts. In this paper, we +find that the key to mitigate this performance gap is the adaptability of the +detector in both BEV and image space. To achieve this goal, we propose +SparseBEV, a fully sparse 3D object detector that outperforms the dense +counterparts. SparseBEV contains three key designs, which are (1) +scale-adaptive self attention to aggregate features with adaptive receptive +field in BEV space, (2) adaptive spatio-temporal sampling to generate sampling +locations under the guidance of queries, and (3) adaptive mixing to decode the +sampled features with dynamic weights from the queries. On the test split of +nuScenes, SparseBEV achieves the state-of-the-art performance of 67.5 NDS. On +the val split, SparseBEV achieves 55.8 NDS while maintaining a real-time +inference speed of 23.5 FPS. Code is available at +https://github.com/MCG-NJU/SparseBEV. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ ASAG: Building Strong One-Decoder-Layer Sparse Detectors via Adaptive + Sparse Anchor Generation ICCV 2023 + + +
+ Recent sparse detectors with multiple, e.g. six, decoder layers achieve
+promising performance but incur long inference time due to their complex heads.
+Previous works have explored using dense priors as initialization and built
+one-decoder-layer detectors. Although they gain remarkable acceleration, their
+performance still lags behind their six-decoder-layer counterparts by a large
+margin. In this work, we aim to bridge this performance gap while retaining
+fast speed. We find that the architecture discrepancy between dense and sparse
+detectors leads to feature conflict, hampering the performance of
+one-decoder-layer detectors. Thus we propose Adaptive Sparse Anchor Generator
+(ASAG) which predicts dynamic anchors on patches rather than grids in a sparse
+way so that it alleviates the feature conflict problem. For each image, ASAG
+dynamically selects which feature maps and which locations to predict, forming
+a fully adaptive way to generate image-specific anchors. Further, a simple and
+effective Query Weighting method eases the training instability from
+adaptiveness. Extensive experiments show that our method outperforms
+dense-initialized ones and achieves a better speed-accuracy trade-off. The code
+is available at \url{https://github.com/iSEE-Laboratory/ASAG}.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Improving Buoy Detection with Deep Transfer Learning for Mussel Farm + Automation + + +
+ The aquaculture sector in New Zealand is experiencing rapid expansion, with a +particular emphasis on mussel exports. As the demands of mussel farming +operations continue to evolve, the integration of artificial intelligence and +computer vision techniques, such as intelligent object detection, is emerging +as an effective approach to enhance operational efficiency. This study delves +into advancing buoy detection by leveraging deep learning methodologies for +intelligent mussel farm monitoring and management. The primary objective +centers on improving accuracy and robustness in detecting buoys across a +spectrum of real-world scenarios. A diverse dataset sourced from mussel farms +is captured and labeled for training, encompassing imagery taken from cameras +mounted on both floating platforms and traversing vessels, capturing various +lighting and weather conditions. To establish an effective deep learning model +for buoy detection with a limited number of labeled data, we employ transfer +learning techniques. This involves adapting a pre-trained object detection +model to create a specialized deep learning buoy detection model. We explore +different pre-trained models, including YOLO and its variants, alongside data +diversity to investigate their effects on model performance. Our investigation +demonstrates a significant enhancement in buoy detection performance through +deep learning, accompanied by improved generalization across diverse weather +conditions, highlighting the practical effectiveness of our approach. + +
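As an illustration of the transfer-learning setup described above (the exact checkpoints, dataset file, and hyperparameters used in the study are not given here, so every name below is an assumption), fine-tuning a COCO-pretrained YOLO checkpoint on a custom buoy dataset with the ultralytics package might look roughly like this:

from ultralytics import YOLO  # assumes the ultralytics package is installed

# Start from a COCO-pretrained checkpoint and fine-tune on buoy imagery.
# "buoys.yaml" (train/val paths and a single "buoy" class) is a hypothetical
# dataset file; epochs and image size are illustrative, not the paper's values.
model = YOLO("yolov8n.pt")
model.train(data="buoys.yaml", epochs=50, imgsz=640)

metrics = model.val()                      # evaluate on the validation split
results = model("dock_camera_frame.jpg")   # run inference on a new image
print(metrics)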
+
+ comment: 7 pages, 5 figures, submitted to ICVNZ 2023 conference + https://ivcnz2023.massey.ac.nz/ +
+
+
+
+
+ + ☆ Deep Boosting Multi-Modal Ensemble Face Recognition with Sample-Level + Weighting + + +
+ Deep convolutional neural networks have achieved remarkable success in face +recognition (FR), partly due to the abundant data availability. However, the +current training benchmarks exhibit an imbalanced quality distribution; most +images are of high quality. This poses issues for generalization on hard +samples since they are underrepresented during training. In this work, we +employ the multi-model boosting technique to deal with this issue. Inspired by +the well-known AdaBoost, we propose a sample-level weighting approach to +incorporate the importance of different samples into the FR loss. Individual +models of the proposed framework are experts at distinct levels of sample +hardness. Therefore, the combination of models leads to a robust feature +extractor without losing the discriminability on the easy samples. Also, for +incorporating the sample hardness into the training criterion, we analytically +show the effect of sample mining on the important aspects of current angular +margin loss functions, i.e., margin and scale. The proposed method shows +superior performance in comparison with the state-of-the-art algorithms in +extensive experiments on the CFP-FP, LFW, CPLFW, CALFW, AgeDB, TinyFace, IJB-B, +and IJB-C evaluation datasets. + +
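A minimal sketch of the sample-level weighting idea (the actual angular margin loss, hardness mining rule, and boosting schedule are not reproduced here; plain cross-entropy and an exponential weight are stand-ins): harder samples receive larger weights, and per-sample losses are combined with those weights before summation.

import torch
import torch.nn.functional as F

def weighted_fr_loss(logits, labels, hardness):
    """logits: (B, C) class scores, labels: (B,), hardness: (B,) scores in [0, 1]
    produced by an upstream mining rule (assumed). AdaBoost-style weights grow
    with hardness so underrepresented hard samples influence training more."""
    weights = torch.exp(hardness)            # monotone in hardness; illustrative
    weights = weights / weights.sum()        # normalize to keep the loss scale
    per_sample = F.cross_entropy(logits, labels, reduction="none")
    return (weights * per_sample).sum()

loss = weighted_fr_loss(torch.randn(8, 100), torch.randint(0, 100, (8,)),
                        torch.rand(8))
print(loss.item())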
+
+ comment: 2023 IEEE International Joint Conference on Biometrics (IJCB) +
+
+
+
+
+ + ☆ CCFace: Classification Consistency for Low-Resolution Face Recognition + + +
+ In recent years, deep face recognition methods have demonstrated impressive +results on in-the-wild datasets. However, these methods have shown a +significant decline in performance when applied to real-world low-resolution +benchmarks like TinyFace or SCFace. To address this challenge, we propose a +novel classification consistency knowledge distillation approach that transfers +the learned classifier from a high-resolution model to a low-resolution +network. This approach helps in finding discriminative representations for +low-resolution instances. To further improve the performance, we designed a +knowledge distillation loss using the adaptive angular penalty inspired by the +success of the popular angular margin loss function. The adaptive penalty +reduces overfitting on low-resolution samples and alleviates the convergence +issue of the model integrated with data augmentation. Additionally, we utilize +an asymmetric cross-resolution learning approach based on the state-of-the-art +semi-supervised representation learning paradigm to improve discriminability on +low-resolution instances and prevent them from forming a cluster. Our proposed +method outperforms state-of-the-art approaches on low-resolution benchmarks, +with a three percent improvement on TinyFace while maintaining performance on +high-resolution benchmarks. + +
+
+ comment: 2023 IEEE International Joint Conference on Biometrics (IJCB) +
+
+
+
+
+ + ☆ Generalized Sum Pooling for Metric Learning ICCV + + +
+ A common architectural choice for deep metric learning is a convolutional +neural network followed by global average pooling (GAP). Albeit simple, GAP is +a highly effective way to aggregate information. One possible explanation for +the effectiveness of GAP is considering each feature vector as representing a +different semantic entity and GAP as a convex combination of them. Following +this perspective, we generalize GAP and propose a learnable generalized sum +pooling method (GSP). GSP improves GAP with two distinct abilities: i) the +ability to choose a subset of semantic entities, effectively learning to ignore +nuisance information, and ii) learning the weights corresponding to the +importance of each entity. Formally, we propose an entropy-smoothed optimal +transport problem and show that it is a strict generalization of GAP, i.e., a +specific realization of the problem gives back GAP. We show that this +optimization problem enjoys analytical gradients enabling us to use it as a +direct learnable replacement for GAP. We further propose a zero-shot loss to +ease the learning of GSP. We show the effectiveness of our method with +extensive evaluations on 4 popular metric learning benchmarks. Code is +available at: GSP-DML Framework + +
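The convex-combination view of GAP can be made explicit with a short sketch (an illustration of that perspective, not the entropy-smoothed optimal-transport formulation of GSP itself): GAP uses uniform weights over the spatial feature vectors, while a generalized pooling learns non-negative, non-uniform weights that can down-weight nuisance entities.

import torch

def weighted_pool(features: torch.Tensor, scores: torch.Tensor) -> torch.Tensor:
    """features: (B, N, D) spatial feature vectors, scores: (B, N) learned
    relevance logits. Softmax turns scores into a convex combination; with
    all-zero scores this reduces exactly to global average pooling (GAP)."""
    weights = torch.softmax(scores, dim=1)           # non-negative, sums to 1
    return (weights.unsqueeze(-1) * features).sum(dim=1)

feats = torch.randn(2, 49, 128)
gap = weighted_pool(feats, torch.zeros(2, 49))       # uniform weights == GAP
assert torch.allclose(gap, feats.mean(dim=1), atol=1e-6)
learned = weighted_pool(feats, torch.randn(2, 49))   # generalized, learned pooling
print(learned.shape)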
+
+ comment: Accepted as a conference paper at International Conference on + Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ DMCVR: Morphology-Guided Diffusion Model for 3D Cardiac Volume + Reconstruction MICCAI 2023 + + +
+ Accurate 3D cardiac reconstruction from cine magnetic resonance imaging +(cMRI) is crucial for improved cardiovascular disease diagnosis and +understanding of the heart's motion. However, current cardiac MRI-based +reconstruction technology used in clinical settings is 2D with limited +through-plane resolution, resulting in low-quality reconstructed cardiac +volumes. To better reconstruct 3D cardiac volumes from sparse 2D image stacks, +we propose a morphology-guided diffusion model for 3D cardiac volume +reconstruction, DMCVR, that synthesizes high-resolution 2D images and +corresponding 3D reconstructed volumes. Our method outperforms previous +approaches by conditioning the cardiac morphology on the generative model, +eliminating the time-consuming iterative optimization process of the latent +code, and improving generation quality. The learned latent spaces provide +global semantics, local cardiac morphology and details of each 2D cMRI slice +with highly interpretable value to reconstruct 3D cardiac shape. Our +experiments show that DMCVR is highly effective in several aspects, such as 2D +generation and 3D reconstruction performance. With DMCVR, we can produce +high-resolution 3D cardiac MRI reconstructions, surpassing current techniques. +Our proposed framework has great potential for improving the accuracy of +cardiac disease diagnosis and treatment planning. Code can be accessed at +https://github.com/hexiaoxiao-cs/DMCVR. + +
+
+ comment: Accepted in MICCAI 2023 +
+
+
+
+
+ + ☆ A review of technical factors to consider when designing neural networks + for semantic segmentation of Earth Observation imagery + + +
+ Semantic segmentation (classification) of Earth Observation imagery is a +crucial task in remote sensing. This paper presents a comprehensive review of +technical factors to consider when designing neural networks for this purpose. +The review focuses on Convolutional Neural Networks (CNNs), Recurrent Neural +Networks (RNNs), Generative Adversarial Networks (GANs), and transformer +models, discussing prominent design patterns for these ANN families and their +implications for semantic segmentation. Common pre-processing techniques for +ensuring optimal data preparation are also covered. These include methods for +image normalization and chipping, as well as strategies for addressing data +imbalance in training samples, and techniques for overcoming limited data, +including augmentation techniques, transfer learning, and domain adaptation. By +encompassing both the technical aspects of neural network design and the +data-related considerations, this review provides researchers and practitioners +with a comprehensive and up-to-date understanding of the factors involved in +designing effective neural networks for semantic segmentation of Earth +Observation imagery. + +
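As a concrete instance of the pre-processing steps mentioned above (band statistics and chip size are illustrative assumptions, not recommendations from the review), per-band normalization and chipping of a large Earth Observation scene into fixed-size tiles might look like this:

import numpy as np

def normalize_per_band(scene: np.ndarray) -> np.ndarray:
    """scene: (H, W, B) raster. Standardize each band to zero mean, unit std."""
    mean = scene.mean(axis=(0, 1), keepdims=True)
    std = scene.std(axis=(0, 1), keepdims=True) + 1e-8
    return (scene - mean) / std

def chip(scene: np.ndarray, size: int = 256):
    """Cut the scene into non-overlapping size x size chips (edges dropped)."""
    h, w, _ = scene.shape
    for i in range(0, h - size + 1, size):
        for j in range(0, w - size + 1, size):
            yield scene[i:i + size, j:j + size]

scene = normalize_per_band(np.random.rand(1024, 1024, 4).astype(np.float32))
chips = list(chip(scene, size=256))
print(len(chips), chips[0].shape)   # 16 chips of shape (256, 256, 4)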
+
+ comment: 145 pages with 32 figures +
+
+
+
+
+ + ♻ ☆ Segmenting Known Objects and Unseen Unknowns without Prior Knowledge ICCV 2023 + + +
+ Panoptic segmentation methods assign a known class to each pixel given as
+input. Even for state-of-the-art approaches, this inevitably enforces decisions
+that systematically lead to wrong predictions for objects outside the training
+categories. However, robustness against out-of-distribution samples and corner
+cases is crucial in safety-critical settings to avoid dangerous consequences.
+Since real-world datasets cannot contain enough data points to adequately
+sample the long tail of the underlying distribution, models must be able to
+deal with unseen and unknown scenarios as well. Previous methods targeted this
+by re-identifying already-seen unlabeled objects. In this work, we propose the
+necessary step to extend segmentation with a new setting which we term holistic
+segmentation. Holistic segmentation aims to identify and separate objects of
+unseen, unknown categories into instances without any prior knowledge about
+them, while performing panoptic segmentation of known classes. We tackle this
+new problem with U3HS, which finds unknowns as highly uncertain regions and
+clusters their corresponding instance-aware embeddings into individual objects.
+By doing so, for the first time in panoptic segmentation with unknown objects,
+our U3HS is trained without unknown categories, reducing assumptions and
+leaving the settings as unconstrained as in real-life scenarios. Extensive
+experiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate
+the effectiveness of U3HS for this new, challenging, and assumptions-free
+setting called holistic segmentation. Project page:
+https://holisticseg.github.io.
+
+
+ comment: ICCV 2023. Project page: https://holisticseg.github.io +
+
+
+
+
+ + ♻ ☆ DarSwin: Distortion Aware Radial Swin Transformer + + +
+ Wide-angle lenses are commonly used in perception tasks requiring a large +field of view. Unfortunately, these lenses produce significant distortions +making conventional models that ignore the distortion effects unable to adapt +to wide-angle images. In this paper, we present a novel transformer-based model +that automatically adapts to the distortion produced by wide-angle lenses. We +leverage the physical characteristics of such lenses, which are analytically +defined by the radial distortion profile (assumed to be known), to develop a +distortion aware radial swin transformer (DarSwin). In contrast to conventional +transformer-based architectures, DarSwin comprises a radial patch partitioning, +a distortion-based sampling technique for creating token embeddings, and an +angular position encoding for radial patch merging. We validate our method on +classification tasks using synthetically distorted ImageNet data and show +through extensive experiments that DarSwin can perform zero-shot adaptation to +unseen distortions of different wide-angle lenses. Compared to other baselines, +DarSwin achieves the best results (in terms of Top-1 accuracy) with significant +gains when trained on bounded levels of distortions (very-low, low, medium, and +high) and tested on all including out-of-distribution distortions. The code and +models are publicly available at https://lvsn.github.io/darswin/ + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ ELITE: Encoding Visual Concepts into Textual Embeddings for Customized + Text-to-Image Generation ICCV 2023 + + +
+ In addition to the unprecedented ability in imaginary creation, large
+text-to-image models are expected to take customized concepts in image
+generation. Existing works generally learn such concepts in an
+optimization-based manner, yet bringing excessive computation or memory burden.
+In this paper, we instead propose a learning-based encoder, which consists of a
+global and a local mapping network for fast and accurate customized
+text-to-image generation. Specifically, the global mapping network projects the
+hierarchical features of a given image into multiple new words in the textual
+word embedding space, i.e., one primary word for the well-editable concept and
+other auxiliary words to exclude irrelevant disturbances (e.g., background). In
+the meantime, a local mapping network injects the encoded patch features into
+cross attention layers to provide omitted details, without sacrificing the
+editability of primary concepts. We compare our method with existing
+optimization-based approaches on a variety of user-defined concepts, and
+demonstrate that our method enables high-fidelity inversion and more robust
+editability with a significantly faster encoding process. Our code is publicly
+available at https://github.com/csyxwei/ELITE.
+
+
+ comment: Accepted by ICCV 2023, oral presentation. Code: + https://github.com/csyxwei/ELITE +
+
+
+
+
+ + ♻ ☆ Fairness Continual Learning Approach to Semantic Scene Understanding in + Open-World Environments + + +
+ Continual semantic segmentation aims to learn new classes while maintaining
+the information from the previous classes. Although prior studies have shown
+impressive progress in recent years, the fairness concern in continual
+semantic segmentation needs to be better addressed. Meanwhile, fairness is one
+of the most vital factors in deploying deep learning models, especially in
+human-related or safety applications. In this paper, we present a novel
+Fairness Continual Learning approach to the semantic segmentation problem. In
+particular, under the fairness objective, a new fairness continual learning
+framework is proposed based on class distributions. Then, a novel Prototypical
+Contrastive Clustering loss is proposed to address the significant challenges
+in continual learning, i.e., catastrophic forgetting and background shift. Our
+proposed loss has also been proven to be a novel, generalized learning paradigm
+of the knowledge distillation commonly used in continual learning. Moreover,
+the proposed Conditional Structural Consistency loss further regularizes the
+structural constraint of the predicted segmentation. Our proposed approach has
+achieved state-of-the-art performance on three standard scene understanding
+benchmarks, i.e., ADE20K, Cityscapes, and Pascal VOC, and promoted the fairness
+of the segmentation model.
+
+
+
+
+
+ + ♻ ☆ KeyPosS: Plug-and-Play Facial Landmark Detection through GPS-Inspired + True-Range Multilateration + + +
+ In the realm of facial analysis, accurate landmark detection is crucial for +various applications, ranging from face recognition and expression analysis to +animation. Conventional heatmap or coordinate regression-based techniques, +however, often face challenges in terms of computational burden and +quantization errors. To address these issues, we present the KeyPoint +Positioning System (KeyPosS) - a groundbreaking facial landmark detection +framework that stands out from existing methods. The framework utilizes a fully +convolutional network to predict a distance map, which computes the distance +between a Point of Interest (POI) and multiple anchor points. These anchor +points are ingeniously harnessed to triangulate the POI's position through the +True-range Multilateration algorithm. Notably, the plug-and-play nature of +KeyPosS enables seamless integration into any decoding stage, ensuring a +versatile and adaptable solution. We conducted a thorough evaluation of +KeyPosS's performance by benchmarking it against state-of-the-art models on +four different datasets. The results show that KeyPosS substantially +outperforms leading methods in low-resolution settings while requiring a +minimal time overhead. The code is available at +https://github.com/zhiqic/KeyPosS. + +
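The True-range Multilateration step can be illustrated with a standard linear least-squares solution (a textbook formulation, not the KeyPosS source): subtracting one anchor's distance equation from the others turns the circle constraints into a linear system for the POI coordinates.

import numpy as np

def multilaterate(anchors: np.ndarray, dists: np.ndarray) -> np.ndarray:
    """anchors: (K, 2) known anchor coordinates, dists: (K,) predicted distances
    from the point of interest (POI) to each anchor. Subtracting the first
    distance equation from the rest yields a linear least-squares problem."""
    a0, d0 = anchors[0], dists[0]
    A = 2.0 * (anchors[1:] - a0)
    b = (np.sum(anchors[1:] ** 2, axis=1) - np.sum(a0 ** 2)
         - dists[1:] ** 2 + d0 ** 2)
    poi, *_ = np.linalg.lstsq(A, b, rcond=None)
    return poi

anchors = np.array([[0.0, 0.0], [10.0, 0.0], [0.0, 10.0], [10.0, 10.0]])
true_poi = np.array([3.0, 7.0])
dists = np.linalg.norm(anchors - true_poi, axis=1)
print(multilaterate(anchors, dists))   # ~[3. 7.]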
+
+ comment: Accepted to ACM Multimedia 2023; 10 pages, 7 figures, 6 tables; the + code is at https://github.com/zhiqic/KeyPosS +
+
+
+
+
+ + ♻ ☆ Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation + of rPPG + + +
+ rPPG (Remote photoplethysmography) is a technology that measures and analyzes +BVP (Blood Volume Pulse) by using the light absorption characteristics of +hemoglobin captured through a camera. Analyzing the measured BVP can derive +various physiological signals such as heart rate, stress level, and blood +pressure, which can be applied to various applications such as telemedicine, +remote patient monitoring, and early prediction of cardiovascular disease. rPPG +is rapidly evolving and attracting great attention from both academia and +industry by providing great usability and convenience as it can measure +biosignals using a camera-equipped device without medical or wearable devices. +Despite extensive efforts and advances in this field, serious challenges +remain, including issues related to skin color, camera characteristics, ambient +lighting, and other sources of noise and artifacts, which degrade accuracy +performance. We argue that fair and evaluable benchmarking is urgently required +to overcome these challenges and make meaningful progress from both academic +and commercial perspectives. In most existing work, models are trained, tested, +and validated only on limited datasets. Even worse, some studies lack available +code or reproducibility, making it difficult to fairly evaluate and compare +performance. Therefore, the purpose of this study is to provide a benchmarking +framework to evaluate various rPPG techniques across a wide range of datasets +for fair evaluation and comparison, including both conventional non-deep neural +network (non-DNN) and deep neural network (DNN) methods. GitHub URL: +https://github.com/remotebiosensing/rppg + +
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ YoloCurvSeg: You Only Label One Noisy Skeleton for Vessel-style + Curvilinear Structure Segmentation + + +
+ Weakly-supervised learning (WSL) has been proposed to alleviate the conflict +between data annotation cost and model performance through employing +sparsely-grained (i.e., point-, box-, scribble-wise) supervision and has shown +promising performance, particularly in the image segmentation field. However, +it is still a very challenging task due to the limited supervision, especially +when only a small number of labeled samples are available. Additionally, almost +all existing WSL segmentation methods are designed for star-convex structures +which are very different from curvilinear structures such as vessels and +nerves. In this paper, we propose a novel sparsely annotated segmentation +framework for curvilinear structures, named YoloCurvSeg. A very essential +component of YoloCurvSeg is image synthesis. Specifically, a background +generator delivers image backgrounds that closely match the real distributions +through inpainting dilated skeletons. The extracted backgrounds are then +combined with randomly emulated curves generated by a Space Colonization +Algorithm-based foreground generator and through a multilayer patch-wise +contrastive learning synthesizer. In this way, a synthetic dataset with both +images and curve segmentation labels is obtained, at the cost of only one or a +few noisy skeleton annotations. Finally, a segmenter is trained with the +generated dataset and possibly an unlabeled dataset. The proposed YoloCurvSeg +is evaluated on four publicly available datasets (OCTA500, CORN, DRIVE and +CHASEDB1) and the results show that YoloCurvSeg outperforms state-of-the-art +WSL segmentation methods by large margins. With only one noisy skeleton +annotation (respectively 0.14\%, 0.03\%, 1.40\%, and 0.65\% of the full +annotation), YoloCurvSeg achieves more than 97\% of the fully-supervised +performance on each dataset. Code and datasets will be released at +https://github.com/llmir/YoloCurvSeg. + +
+
+ comment: 20 pages, 15 figures, MEDIA accepted +
+
+
+
+
+ + ♻ ☆ Hard No-Box Adversarial Attack on Skeleton-Based Human Action + Recognition with Skeleton-Motion-Informed Gradient ICCV 2023 + + +
+ Recently, methods for skeleton-based human activity recognition have been +shown to be vulnerable to adversarial attacks. However, these attack methods +require either the full knowledge of the victim (i.e. white-box attacks), +access to training data (i.e. transfer-based attacks) or frequent model queries +(i.e. black-box attacks). All their requirements are highly restrictive, +raising the question of how detrimental the vulnerability is. In this paper, we +show that the vulnerability indeed exists. To this end, we consider a new +attack task: the attacker has no access to the victim model or the training +data or labels, where we coin the term hard no-box attack. Specifically, we +first learn a motion manifold where we define an adversarial loss to compute a +new gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our +gradient contains information of the motion dynamics, which is different from +existing gradient-based attack methods that compute the loss gradient assuming +each dimension in the data is independent. The SMI gradient can augment many +gradient-based attack methods, leading to a new family of no-box attack +methods. Extensive evaluation and comparison show that our method imposes a +real threat to existing classifiers. They also show that the SMI gradient +improves the transferability and imperceptibility of adversarial samples in +both no-box and transfer-based black-box settings. + +
+
+ comment: Camera-ready version for ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Dynamic Snake Convolution based on Topological Geometric Constraints for + Tubular Structure Segmentation ICCV 2023 + + +
+ Accurate segmentation of topological tubular structures, such as blood +vessels and roads, is crucial in various fields, ensuring accuracy and +efficiency in downstream tasks. However, many factors complicate the task, +including thin local structures and variable global morphologies. In this work, +we note the specificity of tubular structures and use this knowledge to guide +our DSCNet to simultaneously enhance perception in three stages: feature +extraction, feature fusion, and loss constraint. First, we propose a dynamic +snake convolution to accurately capture the features of tubular structures by +adaptively focusing on slender and tortuous local structures. Subsequently, we +propose a multi-view feature fusion strategy to complement the attention to +features from multiple perspectives during feature fusion, ensuring the +retention of important information from different global morphologies. Finally, +a continuity constraint loss function, based on persistent homology, is +proposed to constrain the topological continuity of the segmentation better. +Experiments on 2D and 3D datasets show that our DSCNet provides better accuracy +and continuity on the tubular structure segmentation task compared with several +methods. Our codes will be publicly available. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ CroCo v2: Improved Cross-view Completion Pre-training for Stereo + Matching and Optical Flow ICCV 2023 + + +
+ Despite impressive performance for high-level downstream tasks, +self-supervised pre-training methods have not yet fully delivered on dense +geometric vision tasks such as stereo matching or optical flow. The application +of self-supervised concepts, such as instance discrimination or masked image +modeling, to geometric tasks is an active area of research. In this work, we +build on the recent cross-view completion framework, a variation of masked +image modeling that leverages a second view from the same scene which makes it +well suited for binocular downstream tasks. The applicability of this concept +has so far been limited in at least two ways: (a) by the difficulty of +collecting real-world image pairs -- in practice only synthetic data have been +used -- and (b) by the lack of generalization of vanilla transformers to dense +downstream tasks for which relative position is more meaningful than absolute +position. We explore three avenues of improvement. First, we introduce a method +to collect suitable real-world image pairs at large scale. Second, we +experiment with relative positional embeddings and show that they enable vision +transformers to perform substantially better. Third, we scale up vision +transformer based cross-completion architectures, which is made possible by the +use of large amounts of data. With these improvements, we show for the first +time that state-of-the-art results on stereo matching and optical flow can be +reached without using any classical task-specific techniques like correlation +volume, iterative estimation, image warping or multi-scale reasoning, thus +paving the way towards universal vision models. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ LVLane: Deep Learning for Lane Detection and Classification in + Challenging Conditions + + +
+ Lane detection plays a pivotal role in the field of autonomous vehicles and +advanced driving assistant systems (ADAS). Despite advances from image +processing to deep learning based models, algorithm performance is highly +dependent on training data matching the local challenges such as extreme +lighting conditions, partially visible lane markings, and sparse lane markings +like Botts' dots. To address this, we present an end-to-end lane detection and +classification system based on deep learning methodologies. In our study, we +introduce a unique dataset meticulously curated to encompass scenarios that +pose significant challenges for state-of-the-art (SOTA) lane localization +models. Moreover, we propose a CNN-based classification branch, seamlessly +integrated with the detector, facilitating the identification of distinct lane +types. This architecture enables informed lane-changing decisions and empowers +more resilient ADAS capabilities. We also investigate the effect of using mixed +precision training and testing on different models and batch sizes. +Experimental evaluations conducted on the widely-used TuSimple dataset, Caltech +Lane dataset, and our LVLane dataset demonstrate the effectiveness of our model +in accurately detecting and classifying lanes amidst challenging scenarios. Our +method achieves state-of-the-art classification results on the TuSimple +dataset. The code of the work can be found on www.github.com/zillur-av/LVLane. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Quantum Image Denoising: A Framework via Boltzmann Machines, QUBO, and + Quantum Annealing + + +
+ We investigate a framework for binary image denoising via restricted
+Boltzmann machines (RBMs) that introduces a denoising objective in quadratic
+unconstrained binary optimization (QUBO) form and is well-suited for quantum
+annealing. The denoising objective is attained by balancing the distribution
+learned by a trained RBM with a penalty term for deviations from the noisy
+image. We derive the statistically optimal choice of the penalty parameter
+assuming the target distribution has been well-approximated, and further
+suggest an empirically supported modification to make the method robust to that
+idealistic assumption. We also show under additional assumptions that the
+denoised images attained by our method are, in expectation, strictly closer to
+the noise-free images than the noisy images are. While we frame the model as an
+image denoising model, it can be applied to any binary data. As the QUBO
+formulation is well-suited for implementation on quantum annealers, we test the
+model on a D-Wave Advantage machine, and also test on data too large for
+current quantum annealers by approximating QUBO solutions through classical
+heuristics.
+
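To make the QUBO form concrete, here is a small sketch (an illustration under assumptions, not the paper's trained-RBM objective or penalty weighting): for binary pixels the squared deviation (x_i - y_i)^2 is linear in x_i, so the penalty folds into the diagonal of the QUBO matrix alongside the RBM energy terms.

import numpy as np

def denoising_qubo(W, a, b, y, lam):
    """Build a QUBO matrix Q over variables z = [visible; hidden] such that
    z^T Q z equals the RBM energy -a.v - b.h - v^T W h plus a penalty
    lam * sum_i (v_i - y_i)^2, up to a constant, using z_i^2 = z_i for binaries.

    W: (n_vis, n_hid) weights, a: (n_vis,), b: (n_hid,), y: (n_vis,) noisy image."""
    n_vis, n_hid = W.shape
    Q = np.zeros((n_vis + n_hid, n_vis + n_hid))
    Q[np.arange(n_vis), np.arange(n_vis)] = -a + lam * (1.0 - 2.0 * y)
    Q[n_vis + np.arange(n_hid), n_vis + np.arange(n_hid)] = -b
    Q[:n_vis, n_vis:] = -W       # visible-hidden couplings
    return Q                     # pass to a quantum annealer or a QUBO heuristic

rng = np.random.default_rng(0)
Q = denoising_qubo(rng.normal(size=(6, 4)), rng.normal(size=6),
                   rng.normal(size=4), rng.integers(0, 2, size=6), lam=1.5)
print(Q.shape)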
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Window-Based Early-Exit Cascades for Uncertainty Estimation: When Deep + Ensembles are More Efficient than Single Models ICCV 2023 + + +
+ Deep Ensembles are a simple, reliable, and effective method of improving both +the predictive performance and uncertainty estimates of deep learning +approaches. However, they are widely criticised as being computationally +expensive, due to the need to deploy multiple independent models. Recent work +has challenged this view, showing that for predictive accuracy, ensembles can +be more computationally efficient (at inference) than scaling single models +within an architecture family. This is achieved by cascading ensemble members +via an early-exit approach. In this work, we investigate extending these +efficiency gains to tasks related to uncertainty estimation. As many such +tasks, e.g. selective classification, are binary classification, our key novel +insight is to only pass samples within a window close to the binary decision +boundary to later cascade stages. Experiments on ImageNet-scale data across a +number of network architectures and uncertainty tasks show that the proposed +window-based early-exit approach is able to achieve a superior +uncertainty-computation trade-off compared to scaling single models. For +example, a cascaded EfficientNet-B2 ensemble is able to achieve similar +coverage at 5% risk as a single EfficientNet-B4 with <30% the number of MACs. +We also find that cascades/ensembles give more reliable improvements on OOD +data vs scaling models up. Code for this work is available at: +https://github.com/Guoxoug/window-early-exit. + +
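A hedged sketch of the window-based routing rule described above (the window width, the specific binary uncertainty task, and the model pair are illustrative, not the released configuration): only samples whose early-exit probability falls inside a window around the binary decision boundary are forwarded to the larger cascade member.

import torch

@torch.no_grad()
def cascade_predict(x, small_model, big_model, window: float = 0.2):
    """x: (B, ...) batch. small_model/big_model map x -> (B,) probabilities for a
    binary uncertainty task (e.g., accept/reject in selective classification).
    Samples with |p - 0.5| < window are considered uncertain and re-scored."""
    p = small_model(x)
    uncertain = (p - 0.5).abs() < window      # inside the window -> escalate
    if uncertain.any():
        p = p.clone()
        p[uncertain] = big_model(x[uncertain])
    return p

# Toy usage with stand-in "models".
small = lambda t: torch.sigmoid(t.sum(dim=1))
big = lambda t: torch.sigmoid(2.0 * t.sum(dim=1))
print(cascade_predict(torch.randn(16, 8), small, big))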
+
+ comment: Accepted to ICCV 2023 (camera-ready version, 9 pages) +
+
+
+
+
+ + ♻ ☆ From Sky to the Ground: A Large-scale Benchmark and Simple Baseline + Towards Real Rain Removal ICCV 2023 + + +
+ Learning-based image deraining methods have made great progress. However, the
+lack of large-scale high-quality paired training samples is the main bottleneck
+hampering real image deraining (RID). To address this dilemma and advance RID,
+we construct a Large-scale High-quality Paired real rain benchmark (LHP-Rain),
+including 3000 video sequences with 1 million high-resolution (1920*1080) frame
+pairs. The advantages of the proposed dataset over the existing ones are
+three-fold: rain with higher diversity and larger scale, images with higher
+resolution and higher-quality ground truth. Specifically, the real rains in
+LHP-Rain not only contain the classical rain streak/veiling/occlusion in the
+sky, but also the \textbf{splashing on the ground} overlooked by the deraining
+community. Moreover, we propose a novel robust low-rank tensor recovery model
+to generate the GT by better separating the static background from the dynamic
+rain. In addition, we design a simple transformer-based single image deraining
+baseline, which simultaneously utilizes self-attention and cross-layer
+attention within the image and rain layers with discriminative feature
+representation. Extensive experiments verify the superiority of the proposed
+dataset and deraining method over the state of the art.
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Learning to Generate Training Datasets for Robust Semantic Segmentation + + +
+ Semantic segmentation techniques have shown significant progress in recent +years, but their robustness to real-world perturbations and data samples not +seen during training remains a challenge, particularly in safety-critical +applications. In this paper, we propose a novel approach to improve the +robustness of semantic segmentation techniques by leveraging the synergy +between label-to-image generators and image-to-label segmentation models. +Specifically, we design and train Robusta, a novel robust conditional +generative adversarial network to generate realistic and plausible perturbed or +outlier images that can be used to train reliable segmentation models. We +conduct in-depth studies of the proposed generative model, assess the +performance and robustness of the downstream segmentation network, and +demonstrate that our approach can significantly enhance the robustness of +semantic segmentation techniques in the face of real-world perturbations, +distribution shifts, and out-of-distribution samples. Our results suggest that +this approach could be valuable in safety-critical applications, where the +reliability of semantic segmentation techniques is of utmost importance and +comes with a limited computational budget in inference. We will release our +code shortly. + +
+
+
+
+
+ + ♻ ☆ Self-supervised Character-to-Character Distillation for Text Recognition ICCV2023 + + +
+ When handling complicated text images (e.g., irregular structures, low +resolution, heavy occlusion, and uneven illumination), existing supervised text +recognition methods are data-hungry. Although these methods employ large-scale +synthetic text images to reduce the dependence on annotated real images, the +domain gap still limits the recognition performance. Therefore, exploring the +robust text feature representations on unlabeled real images by self-supervised +learning is a good solution. However, existing self-supervised text recognition +methods conduct sequence-to-sequence representation learning by roughly +splitting the visual features along the horizontal axis, which limits the +flexibility of the augmentations, as large geometric-based augmentations may +lead to sequence-to-sequence feature inconsistency. Motivated by this, we +propose a novel self-supervised Character-to-Character Distillation method, +CCD, which enables versatile augmentations to facilitate general text +representation learning. Specifically, we delineate the character structures of +unlabeled real images by designing a self-supervised character segmentation +module. Following this, CCD easily enriches the diversity of local characters +while keeping their pairwise alignment under flexible augmentations, using the +transformation matrix between two augmented views from images. Experiments +demonstrate that CCD achieves state-of-the-art results, with average +performance gains of 1.38% in text recognition, 1.7% in text segmentation, 0.24 +dB (PSNR) and 0.0321 (SSIM) in text super-resolution. Code is available at +https://github.com/TongkunGuan/CCD. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ♻ ☆ Foundation Models in Smart Agriculture: Basics, Opportunities, and + Challenges + + +
+ The past decade has witnessed the rapid development of ML and DL
+methodologies in agricultural systems, showcased by great successes in a
+variety of agricultural applications. However, these conventional ML/DL models
+have certain limitations: they heavily rely on large, costly-to-acquire labeled
+datasets for training, require specialized expertise for development and
+maintenance, and are mostly tailored for specific tasks, thus lacking
+generalizability. Recently, foundation models have demonstrated remarkable
+successes in language and vision tasks across various domains. These models are
+trained on a vast amount of data from multiple domains and modalities. Once
+trained, they can accomplish versatile tasks with just minor fine-tuning and
+minimal task-specific labeled data. Despite their proven effectiveness and huge
+potential, there has been little exploration of applying FMs to agriculture.
+Therefore, this study aims to explore the potential of FMs in the field of
+smart agriculture. In particular, we present conceptual tools and technical
+background to facilitate the understanding of the problem space and uncover new
+research directions in this field. To this end, we first review recent FMs in
+the general computer science domain and categorize them into four categories:
+language FMs, vision FMs, multimodal FMs, and reinforcement learning FMs.
+Subsequently, we outline the process of developing agriculture foundation
+models (AFMs) and discuss their potential applications in smart agriculture. We
+also discuss the unique challenges associated with developing AFMs, including
+model training, validation, and deployment. Through this study, we contribute
+to the advancement of AI in agriculture by introducing AFMs as a promising
+paradigm that can significantly mitigate the reliance on extensive labeled
+datasets and enhance the efficiency, effectiveness, and generalization of
+agricultural AI systems.
+
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A serial dual-channel library occupancy detection system based on Faster + RCNN + + +
+ The phenomenon of seat occupancy in university libraries is a prevalent +issue. However, existing solutions, such as software-based seat reservations +and sensors-based occupancy detection, have proven to be inadequate in +effectively addressing this problem. In this study, we propose a novel +approach: a serial dual-channel object detection model based on Faster RCNN. +This model is designed to discern all instances of occupied seats within the +library and continuously update real-time information regarding seat occupancy +status. To train the neural network, a distinctive dataset is utilized, which +blends virtual images generated using Unreal Engine 5 (UE5) with real-world +images. Notably, our test results underscore the remarkable performance uplift +attained through the application of self-generated virtual datasets in training +Convolutional Neural Networks (CNNs), particularly within specialized +scenarios. Furthermore, this study introduces a pioneering detection model that +seamlessly amalgamates the Faster R-CNN-based object detection framework with a +transfer learning-based object classification algorithm. This amalgamation not +only significantly curtails the computational resources and time investments +needed for neural network training but also considerably heightens the +efficiency of single-frame detection rates. Additionally, a user-friendly web +interface and a mobile application have been meticulously developed, +constituting a computer vision-driven platform for detecting seat occupancy +within library premises. Noteworthy is the substantial enhancement in seat +occupancy recognition accuracy, coupled with a reduction in computational +resources required for neural network training, collectively contributing to a +considerable amplification in the overall efficiency of library seat +management. + +
+
+
+
+
+ + ♻ ☆ G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and + Game Theory ICCV2023 + + +
+ The recent video grounding works attempt to introduce vanilla contrastive +learning into video grounding. However, we claim that this naive solution is +suboptimal. Contrastive learning requires two key properties: (1) +\emph{alignment} of features of similar samples, and (2) \emph{uniformity} of +the induced distribution of the normalized features on the hypersphere. Due to +two annoying issues in video grounding: (1) the co-existence of some visual +entities in both ground truth and other moments, \ie semantic overlapping; (2) +only a few moments in the video are annotated, \ie sparse annotation dilemma, +vanilla contrastive learning is unable to model the correlations between +temporally distant moments and learned inconsistent video representations. Both +characteristics lead to vanilla contrastive learning being unsuitable for video +grounding. In this paper, we introduce Geodesic and Game Localization (G2L), a +semantically aligned and uniform video grounding framework via geodesic and +game theory. We quantify the correlations among moments leveraging the geodesic +distance that guides the model to learn the correct cross-modal +representations. Furthermore, from the novel perspective of game theory, we +propose semantic Shapley interaction based on geodesic distance sampling to +learn fine-grained semantic alignment in similar moments. Experiments on three +benchmarks demonstrate the effectiveness of our method. + +
+
+ comment: ICCV2023 oral +
+
+
+
+
+ + ♻ ☆ 3D Segmentation of Humans in Point Clouds with Synthetic Data + + +
+ Segmenting humans in 3D indoor scenes has become increasingly important with +the rise of human-centered robotics and AR/VR applications. To this end, we +propose the task of joint 3D human semantic segmentation, instance segmentation +and multi-human body-part segmentation. Few works have attempted to directly +segment humans in cluttered 3D scenes, which is largely due to the lack of +annotated training data of humans interacting with 3D scenes. We address this +challenge and propose a framework for generating training data of synthetic +humans interacting with real 3D scenes. Furthermore, we propose a novel +transformer-based model, Human3D, which is the first end-to-end model for +segmenting multiple human instances and their body-parts in a unified manner. +The key advantage of our synthetic data generation framework is its ability to +generate diverse and realistic human-scene interactions, with highly accurate +ground truth. Our experiments show that pre-training on synthetic data improves +performance on a wide variety of 3D human segmentation tasks. Finally, we +demonstrate that Human3D outperforms even task-specific state-of-the-art 3D +segmentation methods. + +
+
+ comment: project page: https://human-3d.github.io/ +
+
+
+
+
+ + ♻ ☆ LVOS: A Benchmark for Long-term Video Object Segmentation ICCV 2023 + + +
+ Existing video object segmentation (VOS) benchmarks focus on short-term
+videos which just last about 3-5 seconds and where objects are visible most of
+the time. These videos are poorly representative of practical applications, and
+the absence of long-term datasets restricts further investigation of VOS in
+realistic scenarios. So, in this paper, we present a new benchmark dataset
+named \textbf{LVOS}, which consists of 220 videos with a total duration of 421
+minutes. To the best of our knowledge, LVOS is the first densely annotated
+long-term VOS dataset. The videos in our LVOS last 1.59 minutes on average,
+which is 20 times longer than videos in existing VOS datasets. Each video
+includes various attributes, especially challenges deriving from the wild, such
+as long-term reappearing and cross-temporally similar objects. Based on LVOS,
+we assess existing video object segmentation algorithms and propose a Diverse
+Dynamic Memory network (DDMemory) that consists of three complementary memory
+banks to exploit temporal information adequately. The experimental results
+demonstrate the strengths and weaknesses of prior methods, pointing to
+promising directions for further study. Data and code are available at
+https://lingyihongfd.github.io/lvos.github.io/.
+
+
+ comment: Accepted by ICCV 2023. Project page: + https://lingyihongfd.github.io/lvos.github.io/ +
+
+
+
+
+ + ♻ ☆ Importance of Aligning Training Strategy with Evaluation for Diffusion + Models in 3D Multiclass Segmentation MICCAI 2023 + + +
+ Recently, denoising diffusion probabilistic models (DDPM) have been applied +to image segmentation by generating segmentation masks conditioned on images, +while the applications were mainly limited to 2D networks without exploiting +potential benefits from the 3D formulation. In this work, we studied the +DDPM-based segmentation model for 3D multiclass segmentation on two large +multiclass data sets (prostate MR and abdominal CT). We observed that the +difference between training and test methods led to inferior performance for +existing DDPM methods. To mitigate the inconsistency, we proposed a recycling +method which generated corrupted masks based on the model's prediction at a +previous time step instead of using ground truth. The proposed method achieved +statistically significantly improved performance compared to existing DDPMs, +independent of a number of other techniques for reducing train-test +discrepancy, including performing mask prediction, using Dice loss, and +reducing the number of diffusion time steps during training. The performance of +diffusion models was also competitive and visually similar to +non-diffusion-based U-net, within the same compute budget. The JAX-based +diffusion framework has been released at +https://github.com/mathpluscode/ImgX-DiffSeg. + +
+
+ comment: Accepted at Deep Generative Models workshop at MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Object Detection in Hyperspectral Image via Unified Spectral-Spatial + Feature Aggregation + + +
+ Deep learning-based hyperspectral image (HSI) classification and object +detection techniques have gained significant attention due to their vital role +in image content analysis, interpretation, and wider HSI applications. However, +current hyperspectral object detection approaches predominantly emphasize +either spectral or spatial information, overlooking the valuable complementary +relationship between these two aspects. In this study, we present a novel +\textbf{S}pectral-\textbf{S}patial \textbf{A}ggregation (S2ADet) object +detector that effectively harnesses the rich spectral and spatial complementary +information inherent in hyperspectral images. S2ADet comprises a hyperspectral +information decoupling (HID) module, a two-stream feature extraction network, +and a one-stage detection head. The HID module processes hyperspectral images +by aggregating spectral and spatial information via band selection and +principal components analysis, consequently reducing redundancy. Based on the +acquired spatial and spectral aggregation information, we propose a feature +aggregation two-stream network for interacting spectral-spatial features. +Furthermore, to address the limitations of existing databases, we annotate an +extensive dataset, designated as HOD3K, containing 3,242 hyperspectral images +captured across diverse real-world scenes and encompassing three object +classes. These images possess a resolution of 512x256 pixels and cover 16 bands +ranging from 470 nm to 620 nm. Comprehensive experiments on two datasets +demonstrate that S2ADet surpasses existing state-of-the-art methods, achieving +robust and reliable results. The demo code and dataset of this work are +publicly available at \url{https://github.com/hexiao-cs/S2ADet}. + +
+
+
+
+
+ + ♻ ☆ GP-PCS: One-shot Feature-Preserving Point Cloud Simplification with + Gaussian Processes on Riemannian Manifolds + + +
+ The processing, storage and transmission of large-scale point clouds is an +ongoing challenge in the computer vision community which hinders progress in +the application of 3D models to real-world settings, such as autonomous +driving, virtual reality and remote sensing. We propose a novel, one-shot point +cloud simplification method which preserves both the salient structural +features and the overall shape of a point cloud without any prior surface +reconstruction step. Our method employs Gaussian processes suitable for +functions defined on Riemannian manifolds, allowing us to model the surface +variation function across any given point cloud. A simplified version of the +original cloud is obtained by sequentially selecting points using a greedy +sparsification scheme. The selection criterion used for this scheme ensures +that the simplified cloud best represents the surface variation of the original +point cloud. We evaluate our method on several benchmark and self-acquired +point clouds, compare it to a range of existing methods, demonstrate its +application in downstream tasks of registration and surface reconstruction, and +show that our method is competitive both in terms of empirical performance and +computational efficiency. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ PatchCT: Aligning Patch Set and Label Set with Conditional Transport for + Multi-Label Image Classification ICCV23 + + +
+ Multi-label image classification is a prediction task that aims to identify +more than one label from a given image. This paper considers the semantic +consistency of the latent space between the visual patch and linguistic label +domains and introduces the conditional transport (CT) theory to bridge the +acknowledged gap. While recent cross-modal attention-based studies have +attempted to align such two representations and achieved impressive +performance, they required carefully-designed alignment modules and extra +complex operations in the attention computation. We find that by formulating +the multi-label classification as a CT problem, we can exploit the interactions +between the image and label efficiently by minimizing the bidirectional CT +cost. Specifically, after feeding the images and textual labels into the +modality-specific encoders, we view each image as a mixture of patch embeddings +and a mixture of label embeddings, which capture the local region features and +the class prototypes, respectively. CT is then employed to learn and align +those two semantic sets by defining the forward and backward navigators. +Importantly, the defined navigators in CT distance model the similarities +between patches and labels, which provides an interpretable tool to visualize +the learned prototypes. Extensive experiments on three public image benchmarks +show that the proposed model consistently outperforms the previous methods. + +
+
+ comment: accepted by ICCV23 +
+
+
+
+
+ + ♻ ☆ Robust Evaluation of Diffusion-Based Adversarial Purification ICCV 2023 + + +
+ We question the current evaluation practice on diffusion-based purification +methods. Diffusion-based purification methods aim to remove adversarial effects +from an input data point at test time. The approach gains increasing attention +as an alternative to adversarial training due to the disentangling between +training and testing. Well-known white-box attacks are often employed to +measure the robustness of the purification. However, it is unknown whether +these attacks are the most effective for the diffusion-based purification since +the attacks are often tailored for adversarial training. We analyze the current +practices and provide a new guideline for measuring the robustness of +purification methods against adversarial attacks. Based on our analysis, we +further propose a new purification strategy improving robustness compared to +the current diffusion-based purification methods. + +
+
+ comment: Accepted by ICCV 2023, Oral presentation +
+
+
+
+
+ + ♻ ☆ Automated Semiconductor Defect Inspection in Scanning Electron + Microscope Images: a Systematic Review + + +
+ A growing need exists for efficient and accurate methods for detecting +defects in semiconductor materials and devices. These defects can have a +detrimental impact on the efficiency of the manufacturing process, because they +cause critical failures and wafer-yield limitations. As nodes and patterns get +smaller, even high-resolution imaging techniques such as Scanning Electron +Microscopy (SEM) produce noisy images due to operating close to sensitivity +levels and due to varying physical properties of different underlayers or +resist materials. This inherent noise is one of the main challenges for defect +inspection. One promising approach is the use of machine learning algorithms, +which can be trained to accurately classify and locate defects in semiconductor +samples. Recently, convolutional neural networks have proved to be particularly +useful in this regard. This systematic review provides a comprehensive overview +of the state of automated semiconductor defect inspection on SEM images, +including the most recent innovations and developments. 38 publications were +selected on this topic, indexed in IEEE Xplore and SPIE databases. For each of +these, the application, methodology, dataset, results, limitations and future +work were summarized. A comprehensive overview and analysis of their methods is +provided. Finally, promising avenues for future work in the field of SEM-based +defect inspection are suggested. + +
+
+ comment: 16 pages, 12 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ REAP: A Large-Scale Realistic Adversarial Patch Benchmark ICCV 2023 + + +
+ Machine learning models are known to be susceptible to adversarial +perturbation. One famous attack is the adversarial patch, a sticker with a +particularly crafted pattern that makes the model incorrectly predict the +object it is placed on. This attack presents a critical threat to +cyber-physical systems that rely on cameras such as autonomous cars. Despite +the significance of the problem, conducting research in this setting has been +difficult; evaluating attacks and defenses in the real world is exceptionally +costly while synthetic data are unrealistic. In this work, we propose the REAP +(REalistic Adversarial Patch) benchmark, a digital benchmark that allows the +user to evaluate patch attacks on real images, and under real-world conditions. +Built on top of the Mapillary Vistas dataset, our benchmark contains over +14,000 traffic signs. Each sign is augmented with a pair of geometric and +lighting transformations, which can be used to apply a digitally generated +patch realistically onto the sign. Using our benchmark, we perform the first +large-scale assessments of adversarial patch attacks under realistic +conditions. Our experiments suggest that adversarial patch attacks may present +a smaller threat than previously believed and that the success rate of an +attack on simpler digital simulations is not predictive of its actual +effectiveness in practice. We release our benchmark publicly at +https://github.com/wagner-group/reap-benchmark. + +
+
+ comment: ICCV 2023. Code and benchmark can be found at + https://github.com/wagner-group/reap-benchmark +
+
+
+
+
+ + ♻ ☆ SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with + Large Language Models ACM MM 2023 + + +
+ Diffusion models, which have emerged to become popular text-to-image
+generation models, can produce high-quality and content-rich images guided by
+textual prompts. However, existing models show limitations in semantic
+understanding and commonsense reasoning when the input prompts are concise
+narratives, resulting in low-quality image generation. To improve the capacity
+to handle narrative prompts, we propose a simple-yet-effective
+parameter-efficient fine-tuning approach called the Semantic Understanding and
+Reasoning adapter (SUR-adapter) for pre-trained diffusion models. To reach this
+goal, we first collect and annotate a new dataset SURD which consists of more
+than 57,000 semantically corrected multi-modal samples. Each sample contains a
+simple narrative prompt, a complex keyword-based prompt, and a high-quality
+image. Then, we align the semantic representation of narrative prompts to the
+complex prompts and transfer knowledge of large language models (LLMs) to our
+SUR-adapter via knowledge distillation so that it can acquire the powerful
+semantic understanding and reasoning capabilities to build a high-quality
+textual semantic representation for text-to-image generation. We conduct
+experiments by integrating multiple LLMs and popular pre-trained diffusion
+models to show the effectiveness of our approach in enabling diffusion models
+to understand and reason over concise natural language without image quality
+degradation. Our approach can make text-to-image diffusion models easier to use
+with a better user experience, which demonstrates that our approach has the
+potential to further advance the development of user-friendly text-to-image
+generation models by bridging the semantic gap between simple narrative prompts
+and complex keyword-based prompts. The code is released at
+https://github.com/Qrange-group/SUR-adapter.
+
+
+ comment: accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Streamlined Lensed Quasar Identification in Multiband Images via + Ensemble Networks + + +
+ Quasars experiencing strong lensing offer unique viewpoints on subjects
+related to the cosmic expansion rate, the dark matter profile within the
+foreground deflectors, and the quasar host galaxies. Unfortunately, identifying
+them in astronomical images is challenging since they are overwhelmed by the
+abundance of non-lenses. To address this, we have developed a novel approach by
+ensembling cutting-edge convolutional networks (CNNs) -- for instance, ResNet,
+Inception, NASNet, MobileNet, EfficientNet, and RegNet -- along with vision
+transformers (ViTs) trained on realistic galaxy-quasar lens simulations based
+on the Hyper Suprime-Cam (HSC) multiband images. While each individual model
+exhibits remarkable performance when evaluated on the test dataset, achieving
+an area under the receiver operating characteristic curve of $>$97.3% and a
+median false positive rate of 3.6%, it struggles to generalize to real data, as
+indicated by the numerous spurious sources picked up by each classifier. A
+significant improvement is achieved by averaging these CNNs and ViTs, resulting
+in the impurities being downsized by factors up to 50. Subsequently, combining
+the HSC images with the UKIRT, VISTA, and unWISE data, we retrieve
+approximately 60 million sources as parent samples and reduce this to 892,609
+after employing a photometry preselection to discover $z>1.5$ lensed quasars
+with Einstein radii of $\theta_\mathrm{E}<5$ arcsec. Afterward, the ensemble
+classifier indicates 3080 sources with a high probability of being lenses,
+which we visually inspect, yielding 210 promising candidates awaiting
+spectroscopic confirmation. These outcomes suggest that automated deep learning
+pipelines hold great potential in effectively detecting strong lenses in vast
+datasets with minimal manual visual inspection involved.
+
+
+ comment: Accepted for publication in the Astronomy & Astrophysics journal. 28 + pages, 11 figures, and 3 tables. We welcome comments from the reader +
+
+
+
+
+ + ♻ ☆ TrajectoryFormer: 3D Object Tracking Transformer with Predictive + Trajectory Hypotheses ICCV 2023 + + +
+ 3D multi-object tracking (MOT) is vital for many applications including
+autonomous driving vehicles and service robots. With the commonly used
+tracking-by-detection paradigm, 3D MOT has made important progress in recent
+years. However, these methods only use the detection boxes of the current frame
+to obtain trajectory-box association results, which makes it impossible for the
+tracker to recover objects missed by the detector. In this paper, we present
+TrajectoryFormer, a novel point-cloud-based 3D MOT framework. To recover
+objects missed by the detector, we generate multiple trajectory hypotheses with
+hybrid candidate boxes, including temporally predicted boxes and current-frame
+detection boxes, for trajectory-box association. The predicted boxes can
+propagate an object's historical trajectory information to the current frame,
+so the network can tolerate short-term missed detections of the tracked
+objects. We combine long-term object motion features and short-term object
+appearance features to create per-hypothesis feature embeddings, which reduces
+the computational overhead of spatial-temporal encoding. Additionally, we
+introduce a Global-Local Interaction Module to conduct information interaction
+among all hypotheses and model their spatial relations, leading to accurate
+estimation of hypotheses. Our TrajectoryFormer achieves state-of-the-art
+performance on the Waymo 3D MOT benchmarks. Code is available at
+https://github.com/poodarchu/EFG .
+
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Too Large; Data Reduction for Vision-Language Pre-Training ICCV2023 + + +
+ This paper examines the problems of severe image-text misalignment and high
+redundancy in the widely-used large-scale Vision-Language Pre-Training (VLP)
+datasets. To address these issues, we propose an efficient and straightforward
+Vision-Language learning algorithm called TL;DR, which aims to compress the
+existing large VLP data into a small, high-quality set. Our approach consists
+of two major steps. First, a codebook-based encoder-decoder captioner is
+developed to select representative samples. Second, a new caption is generated
+to complement the original captions for selected samples, mitigating the
+text-image misalignment problem while maintaining uniqueness. As a result,
+TL;DR enables us to reduce the large dataset into a small set of high-quality
+data, which can serve as an alternative pre-training dataset. This algorithm
+significantly speeds up the time-consuming pretraining process. Specifically,
+TL;DR can compress the mainstream VLP datasets at a high ratio, e.g., reducing
+the well-cleaned CC3M dataset from 2.82M to 0.67M ($\sim$24\%) and the noisy
+YFCC15M from 15M to 2.5M ($\sim$16.7\%). Extensive experiments with three
+popular VLP models over seven downstream tasks show that a VLP model trained on
+the compressed dataset provided by TL;DR can achieve results similar to or even
+better than training on the full-scale dataset. The code will be made
+available at \url{https://github.com/showlab/datacentric.vlp}.
+
+
+ comment: ICCV2023. Code: https://github.com/showlab/datacentric.vlp +
+
+
+
+
+ + ♻ ☆ Deformable Mixer Transformer with Gating for Multi-Task Learning of + Dense Prediction AAAI 2023 + + +
+ CNNs and Transformers have their own advantages and both have been widely
+used for dense prediction in multi-task learning (MTL). Most of the current
+studies on MTL solely rely on CNN or Transformer. In this work, we present a
+novel MTL model by combining both merits of deformable CNN and query-based
+Transformer with shared gating for multi-task learning of dense prediction.
+This combination may offer a simple and efficient solution owing to its
+powerful and flexible task-specific learning and its advantages of lower cost,
+less complexity and fewer parameters than traditional MTL methods. We
+introduce the deformable mixer Transformer with gating (DeMTG), a simple and
+effective up-to-date encoder-decoder architecture that incorporates the
+convolution and attention mechanism in a unified network for MTL. It is
+carefully designed to exploit the advantages of each block and to provide
+deformable and comprehensive features for all tasks from local and global
+perspectives. First, the deformable mixer encoder contains two types of
+operators: the channel-aware mixing operator leveraged to allow communication
+among different channels, and the spatial-aware deformable operator with
+deformable convolution applied to efficiently sample more informative spatial
+locations. Second, the task-aware gating transformer decoder is used to perform
+the task-specific predictions, in which a task interaction block integrated
+with self-attention is applied to capture task interaction features, and a task
+query block integrated with gating attention is leveraged to select
+corresponding task-specific features. Further, the experimental results
+demonstrate that the proposed DeMTG uses fewer GFLOPs and significantly
+outperforms current Transformer-based and CNN-based competitive models on a
+variety of metrics on three dense prediction datasets. Our code and models are
+available at https://github.com/yangyangxu0/DeMTG.
+
+
+ comment: submitted to IJCV; an extension to our previous AAAI 2023 paper + arXiv:2301.03461 +
+
+
+
+
+ + ♻ ☆ Unsupervised Light Field Depth Estimation via Multi-view Feature + Matching with Occlusion Prediction + + +
+ Depth estimation from light field (LF) images is a fundamental step for +numerous applications. Recently, learning-based methods have achieved higher +accuracy and efficiency than the traditional methods. However, it is costly to +obtain sufficient depth labels for supervised training. In this paper, we +propose an unsupervised framework to estimate depth from LF images. First, we +design a disparity estimation network (DispNet) with a coarse-to-fine structure +to predict disparity maps from different view combinations. It explicitly +performs multi-view feature matching to learn the correspondences effectively. +As occlusions may cause the violation of photo-consistency, we introduce an +occlusion prediction network (OccNet) to predict the occlusion maps, which are +used as the element-wise weights of photometric loss to solve the occlusion +issue and assist the disparity learning. With the disparity maps estimated by +multiple input combinations, we then propose a disparity fusion strategy based +on the estimated errors with effective occlusion handling to obtain the final +disparity map with higher accuracy. Experimental results demonstrate that our +method achieves superior performance on both the dense and sparse LF images, +and also shows better robustness and generalization on the real-world LF images +compared to the other methods. + +
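To illustrate how the predicted occlusion maps enter the objective, here is a minimal sketch of an occlusion-weighted photometric loss; tensor shapes and names are illustrative assumptions, not the authors' implementation.

```python
import torch

def occlusion_weighted_photometric_loss(warped_views, center_view, occ_maps, eps=1e-6):
    # warped_views / occ_maps: lists of (B, C, H, W) and (B, 1, H, W) tensors,
    # one per input view warped to the center view with the predicted disparity.
    # occ_maps are in [0, 1]; 1 means "visible", so occluded pixels are down-weighted.
    loss, weight_sum = 0.0, 0.0
    for warped, occ in zip(warped_views, occ_maps):
        diff = (warped - center_view).abs().mean(dim=1, keepdim=True)  # per-pixel L1
        loss = loss + (occ * diff).sum()
        weight_sum = weight_sum + occ.sum()
    return loss / (weight_sum + eps)
```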
+
+
+
+
+ + ♻ ☆ UniVTG: Towards Unified Video-Language Temporal Grounding ICCV 2023 + + +
+ Video Temporal Grounding (VTG), which aims to ground target clips from videos
+(such as consecutive intervals or disjoint shots) according to custom language
+queries (e.g., sentences or words), is key for video browsing on social media.
+Most methods in this direction develop task-specific models that are trained
+with type-specific labels, such as moment retrieval (time interval) and
+highlight detection (worthiness curve), which limits their ability to
+generalize to various VTG tasks and labels. In this paper, we propose to Unify
+the diverse VTG labels and tasks, dubbed UniVTG, along three directions:
+Firstly, we revisit a wide range of VTG labels and tasks and define a unified
+formulation. Based on this, we develop data annotation schemes to create
+scalable pseudo supervision. Secondly, we develop an effective and flexible
+grounding model capable of addressing each task and making full use of each
+label. Lastly, thanks to the unified framework, we are able to unlock temporal
+grounding pretraining from large-scale diverse labels and develop stronger
+grounding abilities, e.g., zero-shot grounding. Extensive experiments on three
+tasks (moment retrieval, highlight detection and video summarization) across
+seven datasets (QVHighlights, Charades-STA, TACoS, Ego4D, YouTube Highlights,
+TVSum, and QFVS) demonstrate the effectiveness and flexibility of our proposed
+framework. The codes are available at https://github.com/showlab/UniVTG.
+
+
+ comment: Accepted by ICCV 2023. 16 pages, 10 figures, 13 tables. Code: + https://github.com/showlab/UniVTG +
+
+
+
+
+ + ♻ ☆ SyncDiffusion: Coherent Montage via Synchronized Joint Diffusions + + +
+ The remarkable capabilities of pretrained image diffusion models have been +utilized not only for generating fixed-size images but also for creating +panoramas. However, naive stitching of multiple images often results in visible +seams. Recent techniques have attempted to address this issue by performing +joint diffusions in multiple windows and averaging latent features in +overlapping regions. However, these approaches, which focus on seamless montage +generation, often yield incoherent outputs by blending different scenes within +a single image. To overcome this limitation, we propose SyncDiffusion, a +plug-and-play module that synchronizes multiple diffusions through gradient +descent from a perceptual similarity loss. Specifically, we compute the +gradient of the perceptual loss using the predicted denoised images at each +denoising step, providing meaningful guidance for achieving coherent montages. +Our experimental results demonstrate that our method produces significantly +more coherent outputs compared to previous methods (66.35% vs. 33.65% in our +user study) while still maintaining fidelity (as assessed by GIQA) and +compatibility with the input prompt (as measured by CLIP score). + +
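A rough sketch of the synchronization step described above: at each denoising step, every window's latent is nudged by the gradient of a perceptual distance between its predicted clean image and that of an anchor window. `predict_x0`, `denoise_fn`, `lpips`, and the gradient weight are placeholders, not the released implementation.

```python
import torch

def syncdiffusion_step(latents, t, denoise_fn, predict_x0, lpips, grad_weight=20.0):
    # latents: list of per-window latents; the first window serves as the anchor.
    anchor_x0 = predict_x0(latents[0], t).detach()
    updated = []
    for z in latents:
        z = z.detach().requires_grad_(True)
        x0 = predict_x0(z, t)                       # predicted denoised image for this window
        sim_loss = lpips(x0, anchor_x0).mean()      # perceptual distance to the anchor
        grad = torch.autograd.grad(sim_loss, z)[0]
        z = (z - grad_weight * grad).detach()       # synchronize via gradient descent
        updated.append(denoise_fn(z, t))            # then take the usual reverse step
    return updated
```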
+
+ comment: Project page: https://syncdiffusion.github.io +
+
+
+
+
+ + ♻ ☆ Implicit Identity Representation Conditioned Memory Compensation Network + for Talking Head video Generation ICCV2023 + + +
+ Talking head video generation aims to animate a human face in a still image
+with dynamic poses and expressions using motion information derived from a
+target-driving video, while maintaining the person's identity in the source
+image. However, dramatic and complex motions in the driving video cause
+ambiguous generation, because the still source image cannot provide sufficient
+appearance information for occluded regions or delicate expression variations,
+which produces severe artifacts and significantly degrades the generation
+quality. To tackle this problem, we propose to learn a global facial
+representation space, and design a novel implicit identity representation
+conditioned memory compensation network, coined as MCNet, for high-fidelity
+talking head generation. Specifically, we devise a network module to learn a
+unified spatial facial meta-memory bank from all training samples, which can
+provide rich facial structure and appearance priors to compensate warped source
+facial features for the generation. Furthermore, we propose an effective query
+mechanism based on implicit identity representations learned from the discrete
+keypoints of the source image. It can greatly facilitate the retrieval of more
+correlated information from the memory bank for the compensation. Extensive
+experiments demonstrate that MCNet can learn representative and complementary
+facial memory, and can clearly outperform previous state-of-the-art talking
+head generation methods on VoxCeleb1 and CelebV datasets. Please check our
+project page at https://github.com/harlanhong/ICCV2023-MCNET.
+
+
+ comment: Accepted by ICCV2023, update the reference and figures +
+
+
+
+
+ + ♻ ☆ GPS-GLASS: Learning Nighttime Semantic Segmentation Using Daytime Video + and GPS data ICCV + + +
+ Semantic segmentation for autonomous driving should be robust against various +in-the-wild environments. Nighttime semantic segmentation is especially +challenging due to a lack of annotated nighttime images and a large domain gap +from daytime images with sufficient annotation. In this paper, we propose a +novel GPS-based training framework for nighttime semantic segmentation. Given +GPS-aligned pairs of daytime and nighttime images, we perform cross-domain +correspondence matching to obtain pixel-level pseudo supervision. Moreover, we +conduct flow estimation between daytime video frames and apply GPS-based +scaling to acquire another pixel-level pseudo supervision. Using these pseudo +supervisions with a confidence map, we train a nighttime semantic segmentation +network without any annotation from nighttime images. Experimental results +demonstrate the effectiveness of the proposed method on several nighttime +semantic segmentation datasets. Our source code is available at +https://github.com/jimmy9704/GPS-GLASS. + +
+
+ comment: ICCVW 2023 +
+
+
+
+
+ + ♻ ☆ What does CLIP know about a red circle? Visual prompt engineering for + VLMs ICCV 2023 + + +
+ Large-scale Vision-Language Models, such as CLIP, learn powerful image-text +representations that have found numerous applications, from zero-shot +classification to text-to-image generation. Despite that, their capabilities +for solving novel discriminative tasks via prompting fall behind those of large +language models, such as GPT-3. Here we explore the idea of visual prompt +engineering for solving computer vision tasks beyond classification by editing +in image space instead of text. In particular, we discover an emergent ability +of CLIP, where, by simply drawing a red circle around an object, we can direct +the model's attention to that region, while also maintaining global +information. We show the power of this simple approach by achieving +state-of-the-art in zero-shot referring expressions comprehension and strong +performance in keypoint localization tasks. Finally, we draw attention to some +potential ethical concerns of large language-vision models. + +
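The red-circle prompt is simple enough to sketch end-to-end with the public OpenAI CLIP package. The snippet below scores hypothetical candidate boxes for a referring expression by drawing a red ellipse around each candidate before encoding; the image path, boxes, and text are made-up examples, and the paper's full pipeline (proposal generation, evaluation protocol) is more involved.

```python
import torch
import clip                                   # pip install git+https://github.com/openai/CLIP.git
from PIL import Image, ImageDraw

def with_red_circle(image, box, width=6):
    # Draw a red ellipse around a candidate region (the visual prompt).
    img = image.copy()
    ImageDraw.Draw(img).ellipse(box, outline=(255, 0, 0), width=width)
    return img

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14", device=device)

image = Image.open("scene.jpg").convert("RGB")             # hypothetical input image
boxes = [(30, 40, 180, 200), (220, 60, 400, 260)]          # hypothetical candidate boxes
text = clip.tokenize(["the person holding a red umbrella"]).to(device)

with torch.no_grad():
    txt = model.encode_text(text)
    txt = txt / txt.norm(dim=-1, keepdim=True)
    scores = []
    for box in boxes:
        img = preprocess(with_red_circle(image, box)).unsqueeze(0).to(device)
        feat = model.encode_image(img)
        feat = feat / feat.norm(dim=-1, keepdim=True)
        scores.append((feat @ txt.T).item())

best_box = boxes[max(range(len(boxes)), key=scores.__getitem__)]
```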
+
+ comment: ICCV 2023 Oral +
+
+
+
+
+ + ♻ ☆ GHN-Q: Parameter Prediction for Unseen Quantized Convolutional + Architectures via Graph Hypernetworks + + +
+ Deep convolutional neural network (CNN) training via iterative optimization +has had incredible success in finding optimal parameters. However, modern CNN +architectures often contain millions of parameters. Thus, any given model for a +single architecture resides in a massive parameter space. Models with similar +loss could have drastically different characteristics such as adversarial +robustness, generalizability, and quantization robustness. For deep learning on +the edge, quantization robustness is often crucial. Finding a model that is +quantization-robust can sometimes require significant efforts. Recent works +using Graph Hypernetworks (GHN) have shown remarkable performance predicting +high-performant parameters of varying CNN architectures. Inspired by these +successes, we wonder if the graph representations of GHN-2 can be leveraged to +predict quantization-robust parameters as well, which we call GHN-Q. We conduct +the first-ever study exploring the use of graph hypernetworks for predicting +parameters of unseen quantized CNN architectures. We focus on a reduced CNN +search space and find that GHN-Q can in fact predict quantization-robust +parameters for various 8-bit quantized CNNs. Decent quantized accuracies are +observed even with 4-bit quantization despite GHN-Q not being trained on it. +Quantized finetuning of GHN-Q at lower bitwidths may bring further improvements +and is currently being explored. + +
+
+ comment: Updated Figure 1 and added additional results in Table 1. Initial + extended abstract version accepted at Edge Intelligence Workshop 2022 for + poster presentation +
+
+
+
+
+ + ♻ ☆ Sketch-based Video Object Localization + + +
+ We introduce Sketch-based Video Object Localization (SVOL), a new task aimed +at localizing spatio-temporal object boxes in video queried by the input +sketch. We first outline the challenges in the SVOL task and build the +Sketch-Video Attention Network (SVANet) with the following design principles: +(i) to consider temporal information of video and bridge the domain gap between +sketch and video; (ii) to accurately identify and localize multiple objects +simultaneously; (iii) to handle various styles of sketches; (iv) to be +classification-free. In particular, SVANet is equipped with a Cross-modal +Transformer that models the interaction between learnable object tokens, query +sketch, and video through attention operations, and learns upon a per-frame set +matching strategy that enables frame-wise prediction while utilizing global +video context. We evaluate SVANet on a newly curated SVOL dataset. By design, +SVANet successfully learns the mapping between the query sketches and video +objects, achieving state-of-the-art results on the SVOL benchmark. We further +confirm the effectiveness of SVANet via extensive ablation studies and +visualizations. Lastly, we demonstrate its transfer capability on unseen +datasets and novel categories, suggesting its high scalability in real-world +applications + +
+
+
+
+
+ + ♻ ☆ Multi-Visual-Inertial System: Analysis, Calibration and Estimation + + +
+ In this paper, we study state estimation of multi-visual-inertial systems
+(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary
+number of asynchronous inertial measurement units (IMUs) or gyroscopes and
+global and/or rolling shutter cameras. We are especially interested in the
+full calibration of the associated visual-inertial sensors, including the IMU
+or camera intrinsics and the IMU-IMU (or camera) spatiotemporal extrinsics as
+well as the image readout time of rolling-shutter cameras (if used). To this
+end, we develop a new analytic combined IMU integration with intrinsics, termed
+ACI3, to preintegrate IMU measurements, which is leveraged to fuse auxiliary
+IMUs and/or gyroscopes alongside a base IMU. We model the multi-inertial
+measurements to include all the necessary inertial intrinsic and IMU-IMU
+spatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body
+constraints to eliminate the necessity of auxiliary inertial poses and thus
+reduce computational complexity. By performing observability analysis of
+MVIS, we prove that the standard four unobservable directions remain, no
+matter how many inertial sensors are used, and also identify, for the first
+time, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary
+inertial intrinsics. In addition to the extensive simulations that validate our
+analysis and algorithms, we have built our own MVIS sensor rig and collected
+over 25 real-world datasets to experimentally verify the proposed calibration
+against state-of-the-art calibration methods such as Kalibr. We show that
+the proposed MVIS calibration achieves competitive accuracy with improved
+convergence and repeatability, and it is open sourced to better benefit
+the community.
+
+
+
+
+
+ + ♻ ☆ Unified Open-Vocabulary Dense Visual Prediction + + +
+ In recent years, open-vocabulary (OV) dense visual prediction (such as OV
+object detection, semantic, instance and panoptic segmentations) has attracted
+increasing research attention. However, most existing approaches are
+task-specific and individually tackle each task. In this paper, we propose a
+Unified Open-Vocabulary Network (UOVN) to jointly address four common dense
+prediction tasks. Compared with separate models, a unified network is more
+desirable for diverse industrial applications. Moreover, OV dense prediction
+training data is relatively scarce. Separate networks can only leverage
+task-relevant training data, while a unified approach can integrate diverse
+training data to boost individual tasks. We address two major challenges in
+unified OV prediction. Firstly, unlike unified methods for fixed-set
+predictions, OV networks are usually trained with multi-modal data. Therefore,
+we propose a multi-modal, multi-scale and multi-task (MMM) decoding mechanism
+to better leverage multi-modal data. Secondly, because UOVN uses data from
+different tasks for training, there are significant domain and task gaps. We
+present a UOVN training mechanism to reduce such gaps. Experiments on four
+datasets demonstrate the effectiveness of our UOVN.
+
+
+
+
+
+ + ♻ ☆ ALIP: Adaptive Language-Image Pre-training with Synthetic Caption ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has significantly boosted the
+performance of various vision-language tasks by scaling up the dataset with
+image-text pairs collected from the web. However, the presence of intrinsic
+noise and unmatched image-text pairs in web data can potentially affect the
+performance of representation learning. To address this issue, we first utilize
+the OFA model to generate synthetic captions that focus on the image content.
+The generated captions contain complementary information that is beneficial for
+pre-training. Then, we propose Adaptive Language-Image Pre-training (ALIP),
+a bi-path model that integrates supervision from both raw text and synthetic
+captions. As the core components of ALIP, the Language Consistency Gate (LCG)
+and Description Consistency Gate (DCG) dynamically adjust the weights of
+samples and image-text/caption pairs during the training process. Meanwhile,
+the adaptive contrastive loss can effectively reduce the impact of noisy data
+and enhance the efficiency of pre-training data. We validate ALIP with
+experiments on different scales of models and pre-training datasets.
+Experimental results show that ALIP achieves state-of-the-art performance on
+multiple downstream tasks including zero-shot image-text retrieval and linear
+probe. To facilitate future research, the code and pre-trained models are
+released at https://github.com/deepglint/ALIP.
+
+
+ comment: 15pages, 10figures, ICCV2023 +
+
+
+
+
+ + ♻ ☆ Robust Single-view Cone-beam X-ray Pose Estimation with Neural Tuned + Tomography (NeTT) and Masked Neural Radiance Fields (mNeRF) + + +
+ Many tasks performed in image-guided, mini-invasive, medical procedures can +be cast as pose estimation problems, where an X-ray projection is utilized to +reach a target in 3D space. Expanding on recent advances in the differentiable +rendering of optically reflective materials, we introduce new methods for pose +estimation of radiolucent objects using X-ray projections, and we demonstrate +the critical role of optimal view synthesis in performing this task. We first +develop an algorithm (DiffDRR) that efficiently computes Digitally +Reconstructed Radiographs (DRRs) and leverages automatic differentiation within +TensorFlow. Pose estimation is performed by iterative gradient descent using a +loss function that quantifies the similarity of the DRR synthesized from a +randomly initialized pose and the true fluoroscopic image at the target pose. +We propose two novel methods for high-fidelity view synthesis, Neural Tuned +Tomography (NeTT) and masked Neural Radiance Fields (mNeRF). Both methods rely +on classic Cone-Beam Computerized Tomography (CBCT); NeTT directly optimizes +the CBCT densities, while the non-zero values of mNeRF are constrained by a 3D +mask of the anatomic region segmented from CBCT. We demonstrate that both NeTT +and mNeRF distinctly improve pose estimation within our framework. By defining +a successful pose estimate to be a 3D angle error of less than 3 deg, we find +that NeTT and mNeRF can achieve similar results, both with overall success +rates more than 93%. However, the computational cost of NeTT is significantly +lower than mNeRF in both training and pose estimation. Furthermore, we show +that a NeTT trained for a single subject can generalize to synthesize +high-fidelity DRRs and ensure robust pose estimations for all other subjects. +Therefore, we suggest that NeTT is an attractive option for robust pose +estimation using fluoroscopic projections. + +
+
+
+
+
+ + ♻ ☆ InfLoR-SNN: Reducing Information Loss for Spiking Neural Networks ECCV2022 + + +
+ The Spiking Neural Network (SNN) has attracted more and more attention
+recently. It adopts binary spike signals to transmit information. Benefitting
+from the information passing paradigm of SNNs, the multiplications of
+activations and weights can be replaced by additions, which are more
+energy-efficient. However, its "Hard Reset" mechanism for the firing activity
+ignores the differences among membrane potentials when the membrane potential
+is above the firing threshold, causing information loss. Meanwhile, quantizing
+the membrane potential to 0/1 spikes at the firing instants inevitably
+introduces quantization error, thus bringing about information loss too. To
+address these problems, we propose to use the "Soft Reset" mechanism for
+supervised training-based SNNs, which drives the membrane potential to a
+dynamic reset potential according to its magnitude, and a Membrane Potential
+Rectifier (MPR) to reduce the quantization error via redistributing the
+membrane potential to a range close to the spikes. Results show that SNNs with
+the "Soft Reset" mechanism and MPR outperform their vanilla counterparts on
+both static and dynamic datasets.
+
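The difference between the two reset rules is easy to see in a toy leaky integrate-and-fire update. The sketch below contrasts a hard reset with the soft reset described above (subtract the threshold, keep the excess); it is a simplified illustration, not the paper's full neuron model, which additionally applies the MPR.

```python
import torch

def lif_step(v, x, threshold=1.0, decay=0.5, soft_reset=True):
    v = decay * v + x                      # leaky integration of the input current
    spike = (v >= threshold).float()       # binary spike output
    if soft_reset:
        v = v - spike * threshold          # soft reset: keep the residue above threshold
    else:
        v = v * (1.0 - spike)              # hard reset: zero out spiking neurons
    return v, spike
```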
+
+ comment: Accepted by ECCV2022 +
+
+
+
+
+ + ♻ ☆ Audio-Visual Spatial Integration and Recursive Attention for Robust + Sound Source Localization ACM MM 2023 + + +
+ The objective of the sound source localization task is to enable machines to +detect the location of sound-making objects within a visual scene. While the +audio modality provides spatial cues to locate the sound source, existing +approaches only use audio as an auxiliary role to compare spatial regions of +the visual modality. Humans, on the other hand, utilize both audio and visual +modalities as spatial cues to locate sound sources. In this paper, we propose +an audio-visual spatial integration network that integrates spatial cues from +both modalities to mimic human behavior when detecting sound-making objects. +Additionally, we introduce a recursive attention network to mimic human +behavior of iterative focusing on objects, resulting in more accurate attention +regions. To effectively encode spatial information from both modalities, we +propose audio-visual pair matching loss and spatial region alignment loss. By +utilizing the spatial cues of audio-visual modalities and recursively focusing +objects, our method can perform more robust sound source localization. +Comprehensive experimental results on the Flickr SoundNet and VGG-Sound Source +datasets demonstrate the superiority of our proposed method over existing +approaches. Our code is available at: https://github.com/VisualAIKHU/SIRA-SSL + +
+
+ comment: Camera-Ready, ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Unbiased Image Synthesis via Manifold-Driven Sampling in Diffusion + Models + + +
+ Diffusion models are a potent class of generative models capable of producing +high-quality images. However, they can face challenges related to data bias, +favoring specific modes of data, especially when the training data does not +accurately represent the true data distribution and exhibits skewed or +imbalanced patterns. For instance, the CelebA dataset contains more female +images than male images, leading to biased generation results and impacting +downstream applications. To address this issue, we propose a novel method that +leverages manifold guidance to mitigate data bias in diffusion models. Our key +idea is to estimate the manifold of the training data using an unsupervised +approach, and then use it to guide the sampling process of diffusion models. +This encourages the generated images to be uniformly distributed on the data +manifold without altering the model architecture or necessitating labels or +retraining. Theoretical analysis and empirical evidence demonstrate the +effectiveness of our method in improving the quality and unbiasedness of image +generation compared to standard diffusion models. + +
+
+
+
+
+ + ♻ ☆ Among Us: Adversarially Robust Collaborative Perception by Consensus ICCV 2023 + + +
+ Multiple robots could perceive a scene (e.g., detect objects) collaboratively
+better than individuals can, although they easily suffer from adversarial
+attacks when using deep learning. This could be addressed by adversarial
+defense, but its training requires knowledge of the often-unknown attacking
+mechanism. Differently, we propose ROBOSAC, a novel sampling-based defense
+strategy generalizable to unseen attackers. Our key idea is that collaborative
+perception should lead to consensus rather than dissensus in results compared
+to individual perception. This leads to our hypothesize-and-verify framework:
+perception results with and without collaboration from a random subset of
+teammates are compared until reaching a consensus. In such a framework, more
+teammates in the sampled subset often entail better perception performance but
+require longer sampling time to reject potential attackers. Thus, we derive how
+many sampling trials are needed to ensure the desired size of an attacker-free
+subset, or equivalently, the maximum size of such a subset that we can
+successfully sample within a given number of trials. We validate our method on
+the task of collaborative 3D object detection in autonomous driving scenarios.
+
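The sampling-trial bound mentioned in the abstract has the same flavor as the classic RANSAC counting argument. A small sketch of that style of computation follows; the paper's exact formulation may differ.

```python
from math import comb, log, ceil

def trials_needed(num_teammates, max_attackers, subset_size, success_prob=0.99):
    # Probability that a random subset of the given size contains no attacker.
    p_clean = comb(num_teammates - max_attackers, subset_size) / comb(num_teammates, subset_size)
    if p_clean == 0:
        raise ValueError("subset_size exceeds the number of benign teammates")
    if p_clean == 1:
        return 1
    # Number of trials so that at least one attacker-free subset is sampled
    # with probability success_prob.
    return ceil(log(1 - success_prob) / log(1 - p_clean))

print(trials_needed(num_teammates=8, max_attackers=2, subset_size=4))  # -> 20
```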
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DeepAccident: A Motion and Accident Prediction Benchmark for V2X + Autonomous Driving + + +
+ Safety is the primary priority of autonomous driving. Nevertheless, no +published dataset currently supports the direct and explainable safety +evaluation for autonomous driving. In this work, we propose DeepAccident, a +large-scale dataset generated via a realistic simulator containing diverse +accident scenarios that frequently occur in real-world driving. The proposed +DeepAccident dataset includes 57K annotated frames and 285K annotated samples, +approximately 7 times more than the large-scale nuScenes dataset with 40k +annotated samples. In addition, we propose a new task, end-to-end motion and +accident prediction, which can be used to directly evaluate the accident +prediction ability for different autonomous driving algorithms. Furthermore, +for each scenario, we set four vehicles along with one infrastructure to record +data, thus providing diverse viewpoints for accident scenarios and enabling V2X +(vehicle-to-everything) research on perception and prediction tasks. Finally, +we present a baseline V2X model named V2XFormer that demonstrates superior +performance for motion and accident prediction and 3D object detection compared +to the single-vehicle model. + +
+
+
+
+
+ + ♻ ☆ CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure + for Vision-Language Retrieval + + +
+ Current vision-language retrieval aims to perform cross-modal instance
+search, in which the core idea is to learn consistent vision-language
+representations. Although the performance of cross-modal retrieval has greatly
+improved with the development of deep models, we unfortunately find that
+traditional hard consistency may destroy the original relationships among
+single-modal instances, leading to performance degradation for single-modal
+retrieval. To address this challenge, in this paper, we experimentally observe
+that the vision-language divergence may cause the existence of strong and weak
+modalities, and the hard cross-modal consistency cannot guarantee that strong
+modal instances' relationships are not affected by the weak modality, resulting
+in the strong modal instances' relationships being perturbed despite the
+learned consistent representations. To this end, we propose a novel Coordinated
+Vision-Language Retrieval method (dubbed CoVLR), which aims to study and
+alleviate the desynchrony problem between the cross-modal alignment and
+single-modal cluster-preserving tasks. CoVLR addresses this challenge by
+developing an effective meta-optimization based strategy, in which the
+cross-modal consistency objective and the intra-modal relation preserving
+objective act as the meta-train and meta-test tasks, so that CoVLR encourages
+both tasks to be optimized in a coordinated way. Consequently, we can
+simultaneously ensure cross-modal consistency and intra-modal structure.
+Experiments on different datasets validate that CoVLR can improve single-modal
+retrieval accuracy whilst preserving cross-modal retrieval capacity compared
+with the baselines.
+
+
+ comment: I apologize for my operational mistake, which has resulted in the + absence of a revised version of the manuscript. Furthermore, I am concerned + that the submission process of this paper may potentially lead to conflicts. + Therefore, I kindly request the withdrawal of the manuscript +
+
+
+
+
+ + ♻ ☆ Iterative Soft Shrinkage Learning for Efficient Image Super-Resolution ICCV 2023 + + +
+ Image super-resolution (SR) has witnessed extensive neural network designs +from CNN to transformer architectures. However, prevailing SR models suffer +from prohibitive memory footprint and intensive computations, which limits +further deployment on edge devices. This work investigates the potential of +network pruning for super-resolution to take advantage of off-the-shelf network +designs and reduce the underlying computational overhead. Two main challenges +remain in applying pruning methods for SR. First, the widely-used filter +pruning technique reflects limited granularity and restricted adaptability to +diverse network structures. Second, existing pruning methods generally operate +upon a pre-trained network for the sparse structure determination, hard to get +rid of dense model training in the traditional SR paradigm. To address these +challenges, we adopt unstructured pruning with sparse models directly trained +from scratch. Specifically, we propose a novel Iterative Soft +Shrinkage-Percentage (ISS-P) method by optimizing the sparse structure of a +randomly initialized network at each iteration and tweaking unimportant weights +with a small amount proportional to the magnitude scale on-the-fly. We observe +that the proposed ISS-P can dynamically learn sparse structures adapting to the +optimization process and preserve the sparse model's trainability by yielding a +more regularized gradient throughput. Experiments on benchmark datasets +demonstrate the effectiveness of the proposed ISS-P over diverse network +architectures. Code is available at +https://github.com/Jiamian-Wang/Iterative-Soft-Shrinkage-SR + +
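A minimal sketch of the soft-shrinkage update described above, applied to one weight tensor: instead of hard-zeroing weights below the magnitude threshold, they are shrunk by a small fraction so the sparse structure can still change in later iterations. The function name and the shrink factor are assumptions, not the released ISS-P code.

```python
import torch

def soft_shrinkage_update(weight, sparsity=0.9, shrink=0.1):
    # weight: a weight tensor of an SR network layer, updated in place during training.
    k = int(weight.numel() * sparsity)
    if k == 0:
        return weight
    threshold = weight.abs().flatten().kthvalue(k).values   # magnitude cut-off
    unimportant = weight.abs() <= threshold
    with torch.no_grad():
        weight[unimportant] *= (1.0 - shrink)                # shrink instead of zeroing
    return weight
```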
+
+ comment: Accepted by ICCV 2023, code released at + https://github.com/Jiamian-Wang/Iterative-Soft-Shrinkage-SR +
+
+
+
+
+ + ♻ ☆ Dual-Stream Diffusion Net for Text-to-Video Generation + + +
+ With the emerging diffusion models, text-to-video generation has recently
+aroused increasing attention. However, an important bottleneck is that
+generated videos often carry flickers and artifacts. In this work, we propose a
+dual-stream diffusion net (DSDN) to improve the consistency of content
+variations in generating videos. In particular, the designed two diffusion
+streams, video content and motion branches, can not only run separately in
+their private spaces for producing personalized video variations as well as
+content, but also be well-aligned between the content and motion domains
+through leveraging our designed cross-transformer interaction module, which
+benefits the smoothness of generated videos. Besides, we also introduce a
+motion decomposer and combiner to facilitate the operation on video motion.
+Qualitative and quantitative experiments demonstrate that our method can
+produce continuous videos with fewer flickers.
+
+
+ comment: 8pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Deformer: Dynamic Fusion Transformer for Robust Hand Pose Estimation ICCV 2023 + + +
+ Accurately estimating 3D hand pose is crucial for understanding how humans +interact with the world. Despite remarkable progress, existing methods often +struggle to generate plausible hand poses when the hand is heavily occluded or +blurred. In videos, the movements of the hand allow us to observe various parts +of the hand that may be occluded or blurred in a single frame. To adaptively +leverage the visual clue before and after the occlusion or blurring for robust +hand pose estimation, we propose the Deformer: a framework that implicitly +reasons about the relationship between hand parts within the same image +(spatial dimension) and different timesteps (temporal dimension). We show that +a naive application of the transformer self-attention mechanism is not +sufficient because motion blur or occlusions in certain frames can lead to +heavily distorted hand features and generate imprecise keys and queries. To +address this challenge, we incorporate a Dynamic Fusion Module into Deformer, +which predicts the deformation of the hand and warps the hand mesh predictions +from nearby frames to explicitly support the current frame estimation. +Furthermore, we have observed that errors are unevenly distributed across +different hand parts, with vertices around fingertips having disproportionately +higher errors than those around the palm. We mitigate this issue by introducing +a new loss function called maxMSE that automatically adjusts the weight of +every vertex to focus the model on critical hand parts. Extensive experiments +show that our method significantly outperforms state-of-the-art methods by 10%, +and is more robust to occlusions (over 14%). + +
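The abstract's maxMSE idea, re-weighting each vertex by its own error so that high-error parts such as fingertips dominate, can be sketched as follows; the exact weighting the authors use may differ, so treat this as an illustration only.

```python
import torch

def max_mse_loss(pred_verts, gt_verts, eps=1e-8):
    # pred_verts, gt_verts: (B, V, 3) hand-mesh vertices.
    sq_err = ((pred_verts - gt_verts) ** 2).sum(dim=-1)                      # (B, V)
    weights = (sq_err / (sq_err.sum(dim=-1, keepdim=True) + eps)).detach()   # error-proportional
    return (weights * sq_err).sum(dim=-1).mean()
```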
+
+ comment: In ICCV 2023. Project: https://fuqichen1998.github.io/Deformer/ +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ MUSE: Music Recommender System with Shuffle Play Recommendation + Enhancement CIKM 2023 + + +
+ Recommender systems have become indispensable in music streaming services, +enhancing user experiences by personalizing playlists and facilitating the +serendipitous discovery of new music. However, the existing recommender systems +overlook the unique challenges inherent in the music domain, specifically +shuffle play, which provides subsequent tracks in a random sequence. Based on +our observation that the shuffle play sessions hinder the overall training +process of music recommender systems mainly due to the high unique transition +rates of shuffle play sessions, we propose a Music Recommender System with +Shuffle Play Recommendation Enhancement (MUSE). MUSE employs the +self-supervised learning framework that maximizes the agreement between the +original session and the augmented session, which is augmented by our novel +session augmentation method, called transition-based augmentation. To further +facilitate the alignment of the representations between the two views, we +devise two fine-grained matching strategies, i.e., item- and similarity-based +matching strategies. Through rigorous experiments conducted across diverse +environments, we demonstrate MUSE's efficacy over 12 baseline models on a +large-scale Music Streaming Sessions Dataset (MSSD) from Spotify. The source +code of MUSE is available at \url{https://github.com/yunhak0/MUSE}. + +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ☆ ReCon: Reducing Congestion in Job Recommendation using Optimal Transport + + +
+ Recommender systems may suffer from congestion, meaning that there is an +unequal distribution of the items in how often they are recommended. Some items +may be recommended much more than others. Recommenders are increasingly used in +domains where items have limited availability, such as the job market, where +congestion is especially problematic: Recommending a vacancy -- for which +typically only one person will be hired -- to a large number of job seekers may +lead to frustration for job seekers, as they may be applying for jobs where +they are not hired. This may also leave vacancies unfilled and result in job +market inefficiency. + We propose a novel approach to job recommendation called ReCon, accounting +for the congestion problem. Our approach is to use an optimal transport +component to ensure a more equal spread of vacancies over job seekers, combined +with a job recommendation model in a multi-objective optimization problem. We +evaluated our approach on two real-world job market datasets. The evaluation +results show that ReCon has good performance on both congestion-related (e.g., +Congestion) and desirability (e.g., NDCG) measures. + +
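The congestion-reducing ingredient is an optimal transport step that spreads recommendation mass more evenly over vacancies. A small NumPy sketch with entropy-regularized (Sinkhorn) transport and uniform vacancy marginals is shown below; combining this with the recommendation objective in a multi-objective problem, as ReCon does, is not shown, and the names and constants are assumptions.

```python
import numpy as np

def sinkhorn(scores, row_marginals, col_marginals, reg=0.05, n_iters=200):
    # scores: (n_seekers, n_vacancies) relevance scores from any recommender.
    K = np.exp(scores / reg)                      # Gibbs kernel
    u = np.ones_like(row_marginals)
    for _ in range(n_iters):
        v = col_marginals / (K.T @ u)
        u = row_marginals / (K @ v)
    return u[:, None] * K * v[None, :]            # transport plan used to re-rank

scores = np.random.rand(5, 4)
plan = sinkhorn(scores, np.full(5, 1 / 5), np.full(4, 1 / 4))  # uniform marginals spread load
```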
+
+
+
+
+ + ☆ Attention Calibration for Transformer-based Sequential Recommendation CIKM2023 + + +
+ Transformer-based sequential recommendation (SR) has been booming in recent
+years, with the self-attention mechanism as its key component. Self-attention
+has been widely believed to be able to effectively select those informative and
+relevant items from a sequence of interacted items for next-item prediction via
+learning larger attention weights for these items. However, this may not always
+be true in reality. Our empirical analysis of some representative
+Transformer-based SR models reveals that it is not uncommon for large attention
+weights to be assigned to less relevant items, which can result in inaccurate
+recommendations. Through further in-depth analysis, we find two factors that
+may contribute to such inaccurate assignment of attention weights: sub-optimal
+position encoding and noisy input. To this end, in this paper, we aim to
+address this significant yet challenging gap in existing works. To be specific,
+we propose a simple yet effective framework called Attention Calibration for
+Transformer-based Sequential Recommendation (AC-TSR). In AC-TSR, a novel
+spatial calibrator and an adversarial calibrator are designed to directly
+calibrate those incorrectly assigned attention weights. The former is devised
+to explicitly capture the spatial relationships (i.e., order and distance)
+among items for more precise calculation of attention weights. The latter aims
+to redistribute the attention weights based on each item's contribution to the
+next-item prediction. AC-TSR is readily adaptable and can be seamlessly
+integrated into various existing transformer-based SR models. Extensive
+experimental results on four benchmark real-world datasets demonstrate the
+superiority of our proposed AC-TSR via significant recommendation performance
+enhancements. The source code is available at
+https://github.com/AIM-SE/AC-TSR.
+
+
+ comment: Accepted by CIKM2023 +
+
+
+
+
+ + ☆ SHARK: A Lightweight Model Compression Approach for Large-scale + Recommender Systems + + +
+ Increasing the size of embedding layers has been shown to be effective in
+improving the performance of recommendation models, yet it gradually causes
+their sizes to exceed terabytes in industrial recommender systems and hence
+increases computing and storage costs. To save resources while maintaining
+model performance, we propose SHARK, the model compression practice we have
+summarized from recommender systems in industrial scenarios. SHARK consists of
+two main components. First, we use the novel first-order component of the
+Taylor expansion as an importance score to prune the number of embedding tables
+(feature fields). Second, we introduce a new row-wise quantization method to
+apply different quantization strategies to each embedding. We conduct extensive
+experiments on both public and industrial datasets, demonstrating that each
+component of our proposed SHARK framework outperforms previous approaches. We
+conduct A/B tests on multiple models on Kuaishou, such as short video,
+e-commerce, and advertising recommendation models. The results of the online
+A/B tests show that SHARK can effectively reduce the memory footprint of the
+embedding layer. For the short-video scenarios, the compressed model saves 70%
+of storage and thousands of machines without any performance drop, improves
+queries per second (QPS) by 30%, and has been deployed to serve hundreds of
+millions of users and process tens of billions of requests every day.
+
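For the first component, a common way to realize a first-order Taylor importance score is to accumulate |weight x gradient| per embedding table; tables with the lowest scores become pruning candidates. The sketch below assumes a hypothetical `model.embedding_tables` dict and is only an illustration of that idea, not the production SHARK code.

```python
import torch

def embedding_table_importance(model, loss):
    # First-order Taylor estimate of the loss change if a whole table were removed:
    # |sum_i g_i * w_i| over the table's weights (accumulate over many batches in practice).
    loss.backward()
    scores = {}
    for name, table in model.embedding_tables.items():
        scores[name] = (table.weight * table.weight.grad).sum().abs().item()
    return scores
```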
+
+ comment: accepted by cikm 2023 +
+
+
+
+
+ + ☆ How Discriminative Are Your Qrels? How To Study the Statistical + Significance of Document Adjudication Methods + + +
+ Creating test collections for offline retrieval evaluation requires human +effort to judge documents' relevance. This expensive activity motivated much +work in developing methods for constructing benchmarks with lower assessment +costs. In this respect, adjudication methods actively decide both which +documents experts review and the order in which they review them, in order to better +exploit the assessment budget or to lower it. Researchers evaluate the quality +of those methods by measuring the correlation between the known gold ranking of +systems under the full collection and the observed ranking of systems under the +lower-cost one. This traditional analysis ignores whether and how the low-cost +judgements impact the statistically significant differences among systems +with respect to the full collection. We fill this void by proposing a novel +methodology to evaluate how well the low-cost adjudication methods preserve the +pairwise significant differences between systems observed under the full collection. In +other words, while traditional approaches look for stability in answering the +question "is system A better than system B?", our proposed approach looks for +stability in answering the question "is system A significantly better than +system B?", which is the ultimate question researchers need to answer to +guarantee the generalisability of their results. Among other results, we found +that the best methods in terms of system-ranking correlation do not always +match those preserving statistical significance. + +
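The methodology can be pictured with a small sketch: compute per-topic scores for each system under the full and the low-cost judgements, run a paired significance test on every system pair, and compare which pairs remain significant. The synthetic scores and the choice of a paired t-test below are assumptions, not the paper's exact setup.

```python
# Hedged sketch: which system pairs stay significantly different under low-cost qrels?
from itertools import combinations
import numpy as np
from scipy.stats import ttest_rel

def significant_pairs(scores: dict, alpha: float = 0.05) -> set:
    """scores: system name -> per-topic effectiveness array (same topic order)."""
    pairs = set()
    for a, b in combinations(sorted(scores), 2):
        if ttest_rel(scores[a], scores[b]).pvalue < alpha:
            pairs.add((a, b))
    return pairs

rng = np.random.default_rng(0)
full = {s: rng.uniform(0.2, 0.6, size=50) for s in ["sysA", "sysB", "sysC"]}
cheap = {s: v + rng.normal(0, 0.02, size=50) for s, v in full.items()}  # noisy low-cost scores
agreement = significant_pairs(full) & significant_pairs(cheap)
print("pairs significant under both judgement sets:", agreement)
```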
+
+
+
+
+ + ☆ Meta-learning enhanced next POI recommendation by leveraging check-ins + from auxiliary cities + + +
+ Most existing point-of-interest (POI) recommenders aim to capture user +preference by employing city-level user historical check-ins, thus facilitating +users' exploration of the city. However, the scarcity of city-level user +check-ins brings a significant challenge to user preference learning. Although +prior studies attempt to mitigate this challenge by exploiting various contextual +information, e.g., spatio-temporal information, they fail to transfer the +knowledge (i.e., common behavioral patterns) from other relevant cities (i.e., +auxiliary cities). In this paper, we investigate the effect of knowledge +distilled from auxiliary cities and thus propose a novel Meta-learning Enhanced +next POI Recommendation framework (MERec). MERec incorporates the correlation +of check-in behaviors among various cities into the meta-learning paradigm to +help infer user preference in the target city, by holding the principle of +"paying more attention to more correlated knowledge". Particularly, a +city-level correlation strategy is devised to attentively capture common +patterns among cities, so as to transfer more relevant knowledge from more +correlated cities. Extensive experiments verify the superiority of the proposed +MERec against state-of-the-art algorithms. + +
+
+
+
+
+ + ☆ Differentiable Retrieval Augmentation via Generative Language Modeling + for E-commerce Query Intent Classification CIKM2023 + + +
+ Retrieval augmentation, which enhances downstream models with a knowledge +retriever and an external corpus instead of merely increasing the number of +model parameters, has been successfully applied to many natural language +processing (NLP) tasks such as text classification, question answering, and so +on. However, existing methods train the retriever and downstream model separately or asynchronously, mainly due to the non-differentiability between +the two parts, which usually leads to degraded performance compared to end-to-end +joint training. + +
+
+ comment: 5 pages, 2 figures; accepted by CIKM2023 +
+
+
+
+
+ + ☆ Graph-based Alignment and Uniformity for Recommendation + + +
+ Collaborative filtering-based recommender systems (RecSys) rely on learning +representations for users and items to predict preferences accurately. +Representation learning on the hypersphere is a promising approach due to its +desirable properties, such as alignment and uniformity. However, the sparsity +issue arises when this approach is applied to RecSys. To address this issue, we propose a +novel approach, graph-based alignment and uniformity (GraphAU), that explicitly +considers high-order connectivities in the user-item bipartite graph. GraphAU +aligns the user/item embeddings to the dense vector representations of +high-order neighbors using a neighborhood aggregator, eliminating the need to +compute the burdensome alignment to high-order neighborhoods individually. To +address the discrepancy in alignment losses, GraphAU includes a layer-wise +alignment pooling module to integrate alignment losses layer-wise. Experiments +on four datasets show that GraphAU significantly alleviates the sparsity issue +and achieves state-of-the-art performance. We open-source GraphAU at +https://github.com/YangLiangwei/GraphAU. + +
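For reference, the standard alignment and uniformity losses on the hypersphere (the building blocks that GraphAU extends to high-order neighbors) look roughly as follows; the NumPy sketch uses random embeddings and is not the released implementation.

```python
# Hedged sketch: alignment / uniformity losses for normalized user/item embeddings.
import numpy as np

def normalize(x):
    return x / np.linalg.norm(x, axis=1, keepdims=True)

def alignment(user_emb, item_emb):
    """Mean squared distance between normalized embeddings of interacting pairs."""
    diff = normalize(user_emb) - normalize(item_emb)
    return float((np.linalg.norm(diff, axis=1) ** 2).mean())

def uniformity(emb, t=2.0):
    """Log of the mean Gaussian potential over all pairs; lower means more uniform."""
    x = normalize(emb)
    d2 = ((x[:, None, :] - x[None, :, :]) ** 2).sum(-1)
    iu = np.triu_indices(len(x), k=1)
    return float(np.log(np.exp(-t * d2[iu]).mean()))

rng = np.random.default_rng(0)
users, items = rng.normal(size=(256, 64)), rng.normal(size=(256, 64))
print("alignment:", alignment(users, items), "uniformity:", uniformity(users))
```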
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ Taken by Surprise: Contrast effect for Similarity Scores + + +
+ Accurately evaluating the similarity of object vector embeddings is of +critical importance for natural language processing, information retrieval and +classification tasks. Popular similarity scores (e.g., cosine similarity) are +based on pairs of embedding vectors and disregard the distribution of the +ensemble from which objects are drawn. Human perception of object similarity +significantly depends on the context in which the objects appear. In this work +we propose the surprise score, an ensemble-normalized similarity metric +that encapsulates the contrast effect of human perception and significantly +improves the classification performance on zero- and few-shot document +classification tasks. This score quantifies the surprise of finding a given +similarity between two elements relative to the pairwise ensemble similarities. +We evaluate this metric on zero-/few-shot classification and clustering tasks +and typically find 10-15% better performance compared to raw cosine +similarity. Our code is available at +https://github.com/MeetElise/surprise-similarity. + +
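One plausible reading of an ensemble-normalized score (not necessarily the authors' exact definition) is to rank a pair's cosine similarity against the distribution of similarities each element has with the rest of the ensemble, as sketched below.

```python
# Hedged sketch: an ensemble-normalized "surprise" of a pairwise cosine similarity.
import numpy as np

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

def surprise(a, b, ensemble):
    """Fraction of ensemble similarities that the pair (a, b) exceeds."""
    ref = np.array([cosine(a, x) for x in ensemble] + [cosine(b, x) for x in ensemble])
    return float((ref < cosine(a, b)).mean())

rng = np.random.default_rng(0)
docs = rng.normal(size=(500, 128))          # stand-in document embeddings
print("raw cosine:", cosine(docs[0], docs[1]))
print("surprise score:", surprise(docs[0], docs[1], docs))
```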
+
+ comment: 9 pages, 2 figures and 4 tables +
+
+
+
+
+ + ☆ Eigenvalue-based Incremental Spectral Clustering + + +
+ Our previous experiments demonstrated that subsets of collections of (short) +documents (with several hundred entries) share a common, suitably normalized +eigenvalue spectrum of the combinatorial Laplacian. Based on this insight, we +propose a method of incremental spectral clustering. The method consists of the +following steps: (1) split the data into manageable subsets, (2) cluster each +of the subsets, (3) merge clusters from different subsets based on the +eigenvalue spectrum similarity to form clusters of the entire set. This method +can be especially useful for clustering methods whose complexity increases strongly +with the size of the data sample, as in the case of typical spectral +clustering. Experiments show that clustering the subsets and merging them indeed yields clusters close to those obtained by clustering the entire dataset. + +
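A minimal sketch of the three-step pipeline, with assumed choices for the affinity (RBF kernel), the number of clusters per subset, and the spectrum normalization:

```python
# Hedged sketch: split -> cluster subsets -> merge clusters by Laplacian-spectrum similarity.
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.metrics.pairwise import rbf_kernel
from scipy.sparse.csgraph import laplacian

def cluster_spectrum(points, k_eigs=10):
    """Smallest eigenvalues of the combinatorial Laplacian of a cluster's affinity graph."""
    lap = laplacian(rbf_kernel(points))
    vals = np.sort(np.linalg.eigvalsh(lap))
    vals = np.pad(vals, (0, max(0, k_eigs - len(vals))), mode="edge")[:k_eigs]
    return vals / (vals.max() + 1e-12)                 # simple normalization (assumed)

rng = np.random.default_rng(0)
data = rng.normal(size=(600, 20))
subsets = np.array_split(data, 3)                      # step (1)
spectra = []                                           # (subset id, spectrum)
for s_id, sub in enumerate(subsets):                   # step (2)
    labels = SpectralClustering(n_clusters=4, affinity="rbf", random_state=0).fit_predict(sub)
    for c in np.unique(labels):
        spectra.append((s_id, cluster_spectrum(sub[labels == c])))
# step (3): find the closest-spectrum pair of clusters coming from different subsets
pairs = [(i, j) for i in range(len(spectra)) for j in range(i + 1, len(spectra))
         if spectra[i][0] != spectra[j][0]]
i, j = min(pairs, key=lambda p: np.linalg.norm(spectra[p[0]][1] - spectra[p[1]][1]))
print("most similar cross-subset cluster pair:", i, j)
```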
+
+ comment: 14 tables, 6 figures +
+
+
+
+
+ + ♻ ☆ Heterogeneous Knowledge Fusion: A Novel Approach for Personalized + Recommendation via LLM RecSys 2023 + + +
+ The analysis and mining of user heterogeneous behavior are of paramount +importance in recommendation systems. However, the conventional approach of +incorporating various types of heterogeneous behavior into recommendation +models leads to feature sparsity and knowledge fragmentation issues. To address +this challenge, we propose a novel approach for personalized recommendation via +Large Language Model (LLM), by extracting and fusing heterogeneous knowledge +from user heterogeneous behavior information. In addition, by combining +heterogeneous knowledge and recommendation tasks, instruction tuning is +performed on LLM for personalized recommendations. The experimental results +demonstrate that our method can effectively integrate user heterogeneous +behavior and significantly improve recommendation performance. + +
+
+ comment: Accepted at RecSys 2023 +
+
+
+
+
+ + ♻ ☆ A Survey on Large Language Models for Recommendation + + +
+ Large Language Models (LLMs) have emerged as powerful tools in the field of +Natural Language Processing (NLP) and have recently gained significant +attention in the domain of Recommendation Systems (RS). These models, trained +on massive amounts of data using self-supervised learning, have demonstrated +remarkable success in learning universal representations and have the potential +to enhance various aspects of recommendation systems by some effective transfer +techniques such as fine-tuning and prompt tuning, and so on. The crucial aspect +of harnessing the power of language models in enhancing recommendation quality +is the utilization of their high-quality representations of textual features +and their extensive coverage of external knowledge to establish correlations +between items and users. To provide a comprehensive understanding of the +existing LLM-based recommendation systems, this survey presents a taxonomy that +categorizes these models into two major paradigms, respectively Discriminative +LLM for Recommendation (DLLM4Rec) and Generative LLM for Recommendation +(GLLM4Rec), with the latter being systematically sorted out for the first time. +Furthermore, we systematically review and analyze existing LLM-based +recommendation systems within each paradigm, providing insights into their +methodologies, techniques, and performance. Additionally, we identify key +challenges and several valuable findings to provide researchers and +practitioners with inspiration. We have also created a GitHub repository to +index relevant papers on LLMs for recommendation, +https://github.com/WLiK/LLM4Rec. + +
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Dimension Independent Mixup for Hard Negative Sample in Collaborative + Filtering + + +
+ Collaborative filtering (CF) is a widely employed technique that predicts +user preferences based on past interactions. Negative sampling plays a vital +role in training CF-based models with implicit feedback. In this paper, we +propose a novel perspective based on the sampling area to revisit existing +sampling methods. We point out that current sampling methods mainly focus on +Point-wise or Line-wise sampling, lacking flexibility and leaving a significant +portion of the hard sampling area un-explored. To address this limitation, we +propose Dimension Independent Mixup for Hard Negative Sampling (DINS), which is +the first Area-wise sampling method for training CF-based models. DINS +comprises three modules: Hard Boundary Definition, Dimension Independent Mixup, +and Multi-hop Pooling. Experiments with real-world datasets on both matrix +factorization and graph-based models demonstrate that DINS outperforms other +negative sampling methods, establishing its effectiveness and superiority. Our +work contributes a new perspective, introduces Area-wise sampling, and presents +DINS as a novel approach that achieves state-of-the-art performance for +negative sampling. Our implementations are available in PyTorch. + +
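A rough sketch of what an area-wise, per-dimension mixup of hard negatives could look like; the hard-boundary and pooling choices below are assumptions, not the paper's exact modules.

```python
# Hedged sketch: synthesize a hard negative with an independent mixing weight per dimension.
import numpy as np

def dimension_independent_mixup(pos, negs, rng):
    """Mix the positive item with pooled hard negatives using one coefficient per
    embedding dimension instead of a single scalar (area-wise sampling)."""
    hard = negs[np.argsort(negs @ pos)[-2:]]          # two hardest negatives (largest inner product)
    anchor = hard.mean(axis=0)                        # simple pooling of hard negatives (assumed)
    alpha = rng.uniform(0.0, 1.0, size=pos.shape)     # per-dimension mixing weights
    return alpha * pos + (1.0 - alpha) * anchor

rng = np.random.default_rng(0)
pos_item = rng.normal(size=64)
negatives = rng.normal(size=(100, 64))
synthetic_negative = dimension_independent_mixup(pos_item, negatives, rng)
print(synthetic_negative.shape)
```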
+
+
+
+
+
+
+
+ + Machine Learning 112 + +
+
+
+ + ☆ Robust Monocular Depth Estimation under Challenging Conditions ICCV 2023 + + +
+ While state-of-the-art monocular depth estimation approaches achieve +impressive results in ideal settings, they are highly unreliable under +challenging illumination and weather conditions, such as at nighttime or in the +presence of rain. In this paper, we uncover these safety-critical issues and +tackle them with md4all: a simple and effective solution that works reliably +under both adverse and ideal conditions, as well as for different types of +learning supervision. We achieve this by exploiting the efficacy of existing +methods under perfect settings. Therefore, we provide valid training signals +independently of what is in the input. First, we generate a set of complex +samples corresponding to the normal training ones. Then, we train the model by +guiding its self- or full-supervision by feeding the generated samples and +computing the standard losses on the corresponding original images. Doing so +enables a single model to recover information across diverse conditions without +modifications at inference time. Extensive experiments on two challenging +public datasets, namely nuScenes and Oxford RobotCar, demonstrate the +effectiveness of our techniques, outperforming prior works by a large margin in +both standard and challenging conditions. Source code and data are available +at: https://md4all.github.io. + +
+
+ comment: ICCV 2023. Source code and data: https://md4all.github.io +
+
+
+
+
+ + ☆ Neural-network quantum state study of the long-range antiferromagnetic + Ising chain + + +
+ We investigate quantum phase transitions in the transverse field Ising chain +with algebraically decaying long-range antiferromagnetic interactions by using +the variational Monte Carlo method with the restricted Boltzmann machine being +employed as a trial wave function ansatz. In the finite-size scaling analysis +with the order parameter and the second R\'enyi entropy, we find that the +central charge deviates from 1/2 at a small decay exponent $\alpha_\mathrm{LR}$ +in contrast to the critical exponents staying very close to the short-range +(SR) Ising values regardless of $\alpha_\mathrm{LR}$ examined, supporting the +previously proposed scenario of conformal invariance breakdown. To identify the +threshold of the Ising universality and the conformal symmetry, we perform two +additional tests for the universal Binder ratio and the conformal field theory +(CFT) description of the correlation function. It turns out that both indicate +a noticeable deviation from the SR Ising class at $\alpha_\mathrm{LR} < 2$. +However, a closer look at the scaled correlation function for +$\alpha_\mathrm{LR} \ge 2$ shows a gradual change from the asymptotic line of +the CFT verified at $\alpha_\mathrm{LR} = 3$, providing a rough estimate of the +threshold being in the range of $2 \lesssim \alpha_\mathrm{LR} < 3$. + +
+
+
+
+
+ + ☆ Do you know what q-means? + + +
+ Clustering is one of the most important tools for analysis of large datasets, +and perhaps the most popular clustering algorithm is Lloyd's iteration for +$k$-means. This iteration takes $N$ vectors $v_1,\dots,v_N\in\mathbb{R}^d$ and +outputs $k$ centroids $c_1,\dots,c_k\in\mathbb{R}^d$; these partition the +vectors into clusters based on which centroid is closest to a particular +vector. We present an overall improved version of the "$q$-means" algorithm, +the quantum algorithm originally proposed by Kerenidis, Landman, Luongo, and +Prakash (2019) which performs $\varepsilon$-$k$-means, an approximate version +of $k$-means clustering. This algorithm does not rely on the quantum linear +algebra primitives of prior work, instead only using its QRAM to prepare and +measure simple states based on the current iteration's clusters. The time +complexity is $O\big(\frac{k^{2}}{\varepsilon^2}(\sqrt{k}d + \log(Nd))\big)$ +and maintains the polylogarithmic dependence on $N$ while improving the +dependence on most of the other parameters. We also present a "dequantized" +algorithm for $\varepsilon$-$k$-means which runs in +$O\big(\frac{k^{2}}{\varepsilon^2}(kd + \log(Nd))\big)$ time. Notably, this +classical algorithm matches the polylogarithmic dependence on $N$ attained by +the quantum algorithms. + +
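For reference, the classical Lloyd's iteration that the quantum and dequantized algorithms approximate can be written in a few lines of NumPy:

```python
# Minimal classical Lloyd's iteration for k-means (the routine being approximated).
import numpy as np

def lloyd(vectors, k, iters=20, seed=0):
    rng = np.random.default_rng(seed)
    centroids = vectors[rng.choice(len(vectors), size=k, replace=False)]
    for _ in range(iters):
        # assign each vector to its closest centroid
        dists = ((vectors[:, None, :] - centroids[None]) ** 2).sum(-1)
        labels = np.argmin(dists, axis=1)
        # recompute each centroid as the mean of its cluster (keep old one if empty)
        centroids = np.stack([vectors[labels == j].mean(axis=0) if (labels == j).any()
                              else centroids[j] for j in range(k)])
    return centroids, labels

X = np.random.default_rng(1).normal(size=(1000, 8))
centroids, labels = lloyd(X, k=3)
print(centroids.shape, np.bincount(labels))
```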
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ A Lightweight Transformer for Faster and Robust EBSD Data Collection + + +
+ Three dimensional electron back-scattered diffraction (EBSD) microscopy is a +critical tool in many applications in materials science, yet its data quality +can fluctuate greatly during the arduous collection process, particularly via +serial-sectioning. Fortunately, 3D EBSD data is inherently sequential, opening +up the opportunity to use transformers, state-of-the-art deep learning +architectures that have made breakthroughs in a plethora of domains, for data +processing and recovery. To be more robust to errors and accelerate this 3D +EBSD data collection, we introduce a two-step method that recovers missing +slices in a 3D EBSD volume, using an efficient transformer model and a +projection algorithm to process the transformer's outputs. Overcoming the +computational and practical hurdles of deep learning with scarce high +dimensional data, we train this model using only synthetic 3D EBSD data with +self-supervision and obtain superior recovery accuracy on real 3D EBSD data, +compared to existing methods. + +
+
+
+
+
+ + ☆ Reduced Order Modeling of a MOOSE-based Advanced Manufacturing Model + with Operator Learning + + +
+ Advanced Manufacturing (AM) has gained significant interest in the nuclear +community for its potential application on nuclear materials. One challenge is +to obtain desired material properties via controlling the manufacturing process +during runtime. Intelligent AM based on deep reinforcement learning (DRL) +relies on an automated process-level control mechanism to generate optimal +design variables and adaptive system settings for improved end-product +properties. A high-fidelity thermo-mechanical model for direct energy +deposition has recently been developed within the MOOSE framework at the Idaho +National Laboratory (INL). The goal of this work is to develop an accurate and +fast-running reduced order model (ROM) for this MOOSE-based AM model that can +be used in a DRL-based process control and optimization method. Operator +learning (OL)-based methods will be employed due to their capability to learn a +family of differential equations, in this work, produced by changing process +variables in the Gaussian point heat source for the laser. We will develop +OL-based ROM using Fourier neural operator, and perform a benchmark comparison +of its performance with a conventional deep neural network-based ROM. + +
+
+ comment: 10 Pages, 7 Figures, 2 Tables. arXiv admin note: text overlap with + arXiv:2308.02462 +
+
+
+
+
+ + ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting +capabilities in large language models (LLMs) beyond those offered by paradigms +such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary +advantage of GoT is the ability to model the information generated by an LLM as +an arbitrary graph, where units of information ("LLM thoughts") are vertices, +and edges correspond to dependencies between these vertices. This approach +enables combining arbitrary LLM thoughts into synergistic outcomes, distilling +the essence of whole networks of thoughts, or enhancing thoughts using feedback +loops. We illustrate that GoT offers advantages over the state of the art on +different tasks, for example increasing the quality of sorting by 62% over ToT, +while simultaneously reducing costs by >31%. We ensure that GoT is extensible +with new thought transformations and thus can be used to spearhead new +prompting schemes. This work brings LLM reasoning closer to human thinking +or brain mechanisms such as recurrence, both of which form complex networks. + +
+
+
+
+
+ + ☆ Audiovisual Moments in Time: A Large-Scale Annotated Dataset of + Audiovisual Actions + + +
+ We present Audiovisual Moments in Time (AVMIT), a large-scale dataset of +audiovisual action events. In an extensive annotation task, 11 participants +labelled a subset of 3-second audiovisual videos from the Moments in Time +dataset (MIT). For each trial, participants assessed whether the labelled +audiovisual action event was present and whether it was the most prominent +feature of the video. The dataset includes the annotation of 57,177 audiovisual +videos, each independently evaluated by 3 of 11 trained participants. From this +initial collection, we created a curated test set of 16 distinct action +classes, with 60 videos each (960 videos). We also offer 2 sets of pre-computed +audiovisual feature embeddings, using VGGish/YamNet for audio data and +VGG16/EfficientNetB0 for visual data, thereby lowering the barrier to entry for +audiovisual DNN research. We explored the advantages of AVMIT annotations and +feature embeddings to improve performance on audiovisual event recognition. A +series of 6 Recurrent Neural Networks (RNNs) were trained on either +AVMIT-filtered audiovisual events or modality-agnostic events from MIT, and +then tested on our audiovisual test set. In all RNNs, top-1 accuracy was +increased by 2.71-5.94% by training exclusively on audiovisual events, even +outweighing a three-fold increase in training data. We anticipate that the +newly annotated AVMIT dataset will serve as a valuable resource for research +and comparative experiments involving computational models and human +participants, specifically when addressing research questions where audiovisual +correspondence is of critical importance. + +
+
+
+
+
+ + ☆ Variational optimization of the amplitude of neural-network quantum + many-body ground states + + +
+ Neural-network quantum states (NQSs), variationally optimized by combining +traditional methods and deep learning techniques, are a new way to find quantum +many-body ground states and are gradually becoming a competitor of traditional +variational methods. However, there are still some difficulties in the +optimization of NQSs, such as local minima, slow convergence, and sign +structure optimization. Here, we split a quantum many-body variational wave +function into a product of a real-valued amplitude neural network and a +sign structure, and focus on the optimization of the amplitude network while +keeping the sign structure fixed. The amplitude network is a convolutional +neural network (CNN) with residual blocks, namely a ResNet. Our method is +tested on three typical quantum many-body systems. The obtained ground state +energies are lower than or comparable to those from traditional variational +Monte Carlo (VMC) methods and density matrix renormalization group (DMRG). +Surprisingly, for the frustrated Heisenberg $J_1$-$J_2$ model, our results are +better than those of the complex-valued CNN in the literature, implying that +the sign structure of the complex-valued NQS is difficult to optimize. We +will study the optimization of the sign structure of NQSs in the future. + +
+
+ comment: 7 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ GiGaMAE: Generalizable Graph Masked Autoencoder via Collaborative Latent + Space Reconstruction CIKM 2023 + + +
+ Self-supervised learning with masked autoencoders has recently gained +popularity for its ability to produce effective image or textual +representations, which can be applied to various downstream tasks without +retraining. However, we observe that the current masked autoencoder models lack +good generalization ability on graph data. To tackle this issue, we propose a +novel graph masked autoencoder framework called GiGaMAE. Different from +existing masked autoencoders that learn node representations by explicitly +reconstructing the original graph components (e.g., features or edges), in this +paper, we propose to collaboratively reconstruct informative and integrated +latent embeddings. By considering embeddings encompassing graph topology and +attribute information as reconstruction targets, our model could capture more +generalized and comprehensive knowledge. Furthermore, we introduce a mutual +information based reconstruction loss that enables the effective reconstruction +of multiple targets. This learning objective allows us to differentiate between +the exclusive knowledge learned from a single target and common knowledge +shared by multiple targets. We evaluate our method on three downstream tasks +with seven datasets as benchmarks. Extensive experiments demonstrate the +superiority of GiGaMAE against state-of-the-art baselines. We hope our results +will shed light on the design of foundation models on graph-structured data. +Our code is available at: https://github.com/sycny/GiGaMAE. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Robust Uncertainty Quantification using Conformalised Monte Carlo + Prediction + + +
+ Deploying deep learning models in safety-critical applications remains a very +challenging task, mandating the provision of assurances for the dependable +operation of these models. Uncertainty quantification (UQ) methods estimate the +model's confidence per prediction, informing decision-making by considering the +effect of randomness and model misspecification. Despite the advances in +state-of-the-art UQ methods, they are computationally expensive or produce +conservative prediction sets/intervals. We introduce MC-CP, a novel hybrid UQ +method that combines a new adaptive Monte Carlo (MC) dropout method with +conformal prediction (CP). MC-CP adaptively modulates the traditional MC +dropout at runtime to save memory and computation resources, enabling +predictions to be consumed by CP, yielding robust prediction sets/intervals. +Through comprehensive experiments, we show that MC-CP delivers significant +improvements over advanced UQ methods, like MC dropout, RAPS and CQR, both in +classification and regression benchmarks. MC-CP can be easily added to existing +models, making its deployment simple. + +
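The conformal-prediction half of such a hybrid can be illustrated with plain split conformal prediction on softmax scores; the adaptive MC-dropout part is omitted and the calibration data below is synthetic.

```python
# Hedged sketch: split conformal prediction sets from (stand-in) softmax outputs.
import numpy as np

def conformal_threshold(cal_probs, cal_labels, alpha=0.1):
    """Calibrated quantile of the nonconformity score 1 - p(true class)."""
    scores = 1.0 - cal_probs[np.arange(len(cal_labels)), cal_labels]
    level = np.ceil((len(scores) + 1) * (1 - alpha)) / len(scores)
    return np.quantile(scores, level)

def prediction_set(probs, q):
    """All classes whose nonconformity score stays below the threshold."""
    return np.where(1.0 - probs <= q)[0]

rng = np.random.default_rng(0)
cal_probs = rng.dirichlet(np.ones(10), size=500)    # stand-in for model softmax outputs
cal_labels = rng.integers(0, 10, size=500)
q = conformal_threshold(cal_probs, cal_labels, alpha=0.1)
print("prediction set:", prediction_set(rng.dirichlet(np.ones(10)), q))
```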
+
+
+
+
+ + ☆ biquality-learn: a Python library for Biquality Learning + + +
+ The democratization of Data Mining has been widely successful thanks in part +to powerful and easy-to-use Machine Learning libraries. These libraries have +been particularly tailored to tackle Supervised Learning. However, strong +supervision signals are scarce in practice, and practitioners must resort to +weak supervision. In addition to weaknesses of supervision, dataset shifts are +another kind of phenomenon that occurs when deploying machine learning models +in the real world. That is why Biquality Learning has been proposed as a +machine learning framework to design algorithms capable of handling multiple +weaknesses of supervision and dataset shifts without assumptions on their +nature and level by relying on the availability of a small trusted dataset +composed of cleanly labeled and representative samples. Thus we propose +biquality-learn: a Python library for Biquality Learning with an intuitive and +consistent API to learn machine learning models from biquality data, with +well-proven algorithms, accessible and easy to use for everyone, and enabling +researchers to experiment in a reproducible way on biquality data. + +
+
+
+
+
+ + ☆ Revisiting Skin Tone Fairness in Dermatological Lesion Classification MICCAI + + +
+ Addressing fairness in lesion classification from dermatological images is +crucial due to variations in how skin diseases manifest across skin tones. +However, the absence of skin tone labels in public datasets hinders building a +fair classifier. To date, such skin tone labels have been estimated prior to +fairness analysis in independent studies using the Individual Typology Angle +(ITA). Briefly, ITA calculates an angle based on pixels extracted from skin +images taking into account the lightness and yellow-blue tints. These angles +are then categorised into skin tones that are subsequently used to analyse +fairness in skin cancer classification. In this work, we review and compare +four ITA-based approaches of skin tone classification on the ISIC18 dataset, a +common benchmark for assessing skin cancer classification fairness in the +literature. Our analyses reveal a high disagreement among previously published +studies demonstrating the risks of ITA-based skin tone estimation methods. +Moreover, we investigate the causes of such large discrepancy among these +approaches and find that the lack of diversity in the ISIC18 dataset limits its +use as a testbed for fairness analysis. Finally, we recommend further research +on robust ITA estimation and diverse dataset acquisition with skin tone +annotation to facilitate conclusive fairness assessments of artificial +intelligence tools in dermatology. Our code is available at +https://github.com/tkalbl/RevisitingSkinToneFairness. + +
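For concreteness, the ITA formula that the compared approaches build on, together with one commonly used set of category thresholds (thresholds differ across the studies being compared, which is part of the disagreement discussed above):

```python
# Hedged sketch: Individual Typology Angle (ITA) from CIELAB values and one common
# categorization; the thresholds below are illustrative, not a single agreed standard.
import numpy as np

def ita_degrees(L, b):
    """ITA = arctan((L* - 50) / b*) in degrees, from mean lightness L* and yellow-blue b*."""
    return np.degrees(np.arctan2(L - 50.0, b))

def ita_category(ita):
    bounds = [(55, "very light"), (41, "light"), (28, "intermediate"),
              (10, "tan"), (-30, "brown")]
    for threshold, name in bounds:
        if ita > threshold:
            return name
    return "dark"

# e.g. mean L* = 65, mean b* = 18 over the non-lesion skin pixels of an image
ita = ita_degrees(65.0, 18.0)
print(round(float(ita), 1), ita_category(ita))
```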
+
+ comment: Accepted at 2023 MICCAI FAIMI Workshop +
+
+
+
+
+ + ☆ Development of a Neural Network-based Method for Improved Imputation of + Missing Values in Time Series Data by Repurposing DataWig + + +
+ Time series data are observations collected over time intervals. Successful +analysis of time series data captures patterns such as trends, cyclicity and +irregularity, which are crucial for decision making in research, business, and +governance. However, missing values in time series data occur often and present +obstacles to successful analysis, thus they need to be filled with alternative +values, a process called imputation. Although various approaches have been +attempted for robust imputation of time series data, even the most advanced +methods still face challenges including limited scalability, poor capacity to +handle heterogeneous data types and inflexibility due to requiring strong +assumptions of data missing mechanisms. Moreover, the imputation accuracy of +these methods still has room for improvement. In this study, I developed +tsDataWig (time-series DataWig) by modifying DataWig, a neural network-based +method that possesses the capacity to process large datasets and heterogeneous +data types but was designed for non-time series data imputation. Unlike the +original DataWig, tsDataWig can directly handle values of time variables and +impute missing values in complex time series datasets. Using one simulated and +three different complex real-world time series datasets, I demonstrated that +tsDataWig outperforms the original DataWig and the current state-of-the-art +methods for time series data imputation and potentially has broad application +due to not requiring strong assumptions of data missing mechanisms. This study +provides a valuable solution for robustly imputing missing values in +challenging time series datasets, which often contain millions of samples, high +dimensional variables, and heterogeneous data types. + +
+
+ comment: 16 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ VALERIE22 -- A photorealistic, richly metadata annotated dataset of + urban environments + + +
+ The VALERIE tool pipeline is a synthetic data generator developed with the +goal of contributing to the understanding of domain-specific factors that +influence perception performance of DNNs (deep neural networks). This work was +carried out under the German research project KI Absicherung in order to +develop a methodology for the validation of DNNs in the context of pedestrian +detection in urban environments for automated driving. The VALERIE22 dataset +was generated with the VALERIE procedural tools pipeline providing a +photorealistic sensor simulation rendered from automatically synthesized +scenes. The dataset provides a uniquely rich set of metadata, allowing +extraction of specific scene and semantic features (like pixel-accurate +occlusion rates, positions in the scene and distance + angle to the camera). +This enables a multitude of possible tests on the data and we hope to stimulate +research on understanding performance of DNNs. Based on performance metrics, a +comparison with several other publicly available datasets is provided, +demonstrating that VALERIE22 is one of the best-performing synthetic datasets +currently available in the open domain. + +
+
+
+
+
+ + ☆ Learning Computational Efficient Bots with Costly Features + + +
+ Deep reinforcement learning (DRL) techniques have become increasingly used in +various fields for decision-making processes. However, a challenge that often +arises is the trade-off between the computational efficiency of the +decision-making process and the ability of the learned agent to solve a +particular task. This is particularly critical in real-time settings such as +video games where the agent needs to make relevant decisions at a very high +frequency, with a very limited inference time. + In this work, we propose a generic offline learning approach where the +computation cost of the input features is taken into account. We derive the +Budgeted Decision Transformer as an extension of the Decision Transformer that +incorporates cost constraints to limit its cost at inference. As a result, the +model can dynamically choose the best input features at each timestep. We +demonstrate the effectiveness of our method on several tasks, including D4RL +benchmarks and complex 3D environments similar to those found in video games, +and show that it can achieve similar performance while using significantly +fewer computational resources compared to classical approaches. + +
+
+
+
+
+ + ☆ Constrained Bayesian Optimization Using a Lagrange Multiplier Applied to + Power Transistor Design + + +
+ We propose a novel constrained Bayesian Optimization (BO) algorithm +optimizing the design process of Laterally-Diffused Metal-Oxide-Semiconductor +(LDMOS) transistors while realizing a target Breakdown Voltage (BV). We convert +the constrained BO problem into a conventional BO problem using a Lagrange +multiplier. Instead of directly optimizing the traditional Figure-of-Merit +(FOM), we set the Lagrangian as the objective function of BO. This adaptive +objective function with a changeable Lagrange multiplier can address +constrained BO problems which have constraints that require costly evaluations, +without the need for additional surrogate models to approximate constraints. +Our algorithm enables a device designer to set the target BV in the design +space, and obtain a device that satisfies the optimized FOM and the target BV +constraint automatically. Utilizing this algorithm, we have also explored the +physical limits of the FOM for our devices in 30 - 50 V range within the +defined design space. + +
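A toy sketch of the Lagrangian reformulation: the constrained problem (maximize the FOM subject to hitting a target BV) becomes a single scalar objective. The FOM/BV functions and the random search below merely stand in for the real device simulator and the BO loop.

```python
# Hedged sketch: fold a BV constraint into the objective via a Lagrange multiplier.
import numpy as np

def fom(x):                      # stand-in figure of merit (higher is better)
    return -(x[0] - 1.5) ** 2 - (x[1] - 0.5) ** 2 + 4.0

def breakdown_voltage(x):        # stand-in breakdown-voltage model
    return 20.0 + 15.0 * x[0] + 5.0 * x[1]

def lagrangian(x, target_bv=40.0, lam=2.0):
    """Single objective: FOM minus a penalty on missing the target BV."""
    return fom(x) - lam * abs(breakdown_voltage(x) - target_bv)

rng = np.random.default_rng(0)
candidates = rng.uniform([0.0, 0.0], [2.0, 2.0], size=(2000, 2))  # random search as a BO stand-in
best = max(candidates, key=lagrangian)
print("best design:", best, "FOM:", round(fom(best), 3), "BV:", round(breakdown_voltage(best), 2))
```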
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ Solving PDEs on Spheres with Physics-Informed Convolutional Neural + Networks + + +
+ Physics-informed neural networks (PINNs) have been demonstrated to be +efficient in solving partial differential equations (PDEs) from a variety of +experimental perspectives. Some recent studies have also proposed PINN +algorithms for PDEs on surfaces, including spheres. However, theoretical +understanding of the numerical performance of PINNs, especially PINNs on +surfaces or manifolds, is still lacking. In this paper, we establish rigorous +analysis of the physics-informed convolutional neural network (PICNN) for +solving PDEs on the sphere. By using and improving the latest approximation +results of deep convolutional neural networks and spherical harmonic analysis, +we prove an upper bound for the approximation error with respect to the Sobolev +norm. Subsequently, we integrate this with innovative localization complexity +analysis to establish fast convergence rates for PICNN. Our theoretical results +are also confirmed and supplemented by our experiments. In light of these +findings, we explore potential strategies for circumventing the curse of +dimensionality that arises when solving high-dimensional PDEs. + +
+
+
+
+
+ + ☆ Breaking the Complexity Barrier in Compositional Minimax Optimization + + +
+ Compositional minimax optimization is a pivotal yet under-explored challenge +across machine learning, including distributionally robust training and policy +evaluation for reinforcement learning. Current techniques exhibit suboptimal +complexity or rely heavily on large batch sizes. This paper proposes Nested +STOchastic Recursive Momentum (NSTORM), attaining the optimal sample complexity +of $O(\kappa^3/\epsilon^3)$ for finding an $\epsilon$-accurate solution. +However, NSTORM requires low learning rates, potentially limiting +applicability. Thus we introduce ADAptive NSTORM (ADA-NSTORM) with adaptive +learning rates, proving it achieves the same sample complexity while +experiments demonstrate greater effectiveness. Our methods match lower bounds +for minimax optimization without large batch requirements, validated through +extensive experiments. This work significantly advances compositional minimax +optimization, a crucial capability for distributional robustness and policy +evaluation. + +
+
+
+
+
+ + ☆ Disparity, Inequality, and Accuracy Tradeoffs in Graph Neural Networks + for Node Classification CIKM 2023 + + +
+ Graph neural networks (GNNs) are increasingly used in critical human +applications for predicting node labels in attributed graphs. Their ability to +aggregate features from nodes' neighbors for accurate classification also has +the capacity to exacerbate existing biases in data or to introduce new ones +towards members from protected demographic groups. Thus, it is imperative to +quantify how GNNs may be biased and to what extent their harmful effects may be +mitigated. To this end, we propose two new GNN-agnostic interventions namely, +(i) PFR-AX which decreases the separability between nodes in protected and +non-protected groups, and (ii) PostProcess which updates model predictions +based on a blackbox policy to minimize differences between error rates across +demographic groups. Through a large set of experiments on four datasets, we +frame the efficacies of our approaches (and three variants) in terms of their +algorithmic fairness-accuracy tradeoff and benchmark our results against three +strong baseline interventions on three state-of-the-art GNN models. Our results +show that no single intervention offers a universally optimal tradeoff, but +PFR-AX and PostProcess provide granular control and improve model confidence +when correctly predicting positive outcomes for nodes in protected groups. + +
+
+ comment: Accepted to CIKM 2023 +
+
+
+
+
+ + ☆ WizardMath: Empowering Mathematical Reasoning for Large Language Models + via Reinforced Evol-Instruct + + +
+ Large language models (LLMs), such as GPT-4, have shown remarkable +performance in natural language processing (NLP) tasks, including challenging +mathematical reasoning. However, most existing open-source models are only +pre-trained on large-scale internet data, without math-related optimization. +In this paper, we present WizardMath, which enhances the mathematical reasoning +abilities of Llama-2, by applying our proposed Reinforcement Learning from +Evol-Instruct Feedback (RLEIF) method to the domain of math. Through extensive +experiments on two mathematical reasoning benchmarks, namely GSM8k and MATH, we +reveal the extraordinary capabilities of our model. WizardMath surpasses all +other open-source LLMs by a substantial margin. Furthermore, our model even +outperforms ChatGPT-3.5, Claude Instant-1, PaLM-2 and Minerva on GSM8k, and +simultaneously surpasses Text-davinci-002, PaLM-1 and GPT-3 on MATH. More +details and model weights are publicly available at https://github.com/nlpxucan/WizardLM +and https://huggingface.co/WizardLM. + +
+
+ comment: LLM, Mathematical Reasoning +
+
+
+
+
+ + ☆ Physics-Informed Boundary Integral Networks (PIBI-Nets): A Data-Driven + Approach for Solving Partial Differential Equations + + +
+ Partial differential equations (PDEs) can describe many relevant phenomena in +dynamical systems. In real-world applications, we commonly need to combine +formal PDE models with (potentially noisy) observations. This is especially +relevant in settings where we lack information about boundary or initial +conditions, or where we need to identify unknown model parameters. In recent +years, Physics-informed neural networks (PINNs) have become a popular tool for +problems of this kind. In high-dimensional settings, however, PINNs often +suffer from computational problems because they usually require dense +collocation points over the entire computational domain. To address this +problem, we present Physics-Informed Boundary Integral Networks (PIBI-Nets) as +a data-driven approach for solving PDEs in one dimension less than the original +problem space. PIBI-Nets only need collocation points at the computational +domain boundary, while still achieving highly accurate results, and in several +practical settings, they clearly outperform PINNs. Exploiting elementary +properties of fundamental solutions of linear differential operators, we +present a principled and simple way to handle point sources in inverse +problems. We demonstrate the excellent performance of PIBI-Nets for the Laplace +and Poisson equations, both on artificial data sets and within a real-world +application concerning the reconstruction of groundwater flows. + +
+
+ comment: Preprint. Submitted to Journal of Computational Science, Elsevier, + for special issue "Machine Learning and Data Assimilation for Dynamical + Systems" +
+
+
+
+
+ + ☆ Investigating the Interplay between Features and Structures in Graph + Learning + + +
+ In the past, the dichotomy between homophily and heterophily has inspired +research contributions toward a better understanding of Deep Graph Networks' +inductive bias. In particular, it was believed that homophily strongly +correlates with better node classification predictions of message-passing +methods. More recently, however, researchers pointed out that such dichotomy is +too simplistic as we can construct node classification tasks where graphs are +completely heterophilic but the performances remain high. Most of these works +have also proposed new quantitative metrics to understand when a graph +structure is useful, which implicitly or explicitly assume the correlation +between node features and target labels. Our work empirically investigates what +happens when this strong assumption does not hold, by formalising two +generative processes for node classification tasks that allow us to build and +study ad-hoc problems. To quantitatively measure the influence of the node +features on the target labels, we also use a metric we call Feature +Informativeness. We construct six synthetic tasks and evaluate the performance +of six models, including structure-agnostic ones. Our findings reveal that +previously defined metrics are not adequate when we relax the above assumption. +Our contribution to the workshop aims at presenting novel research findings +that could help advance our understanding of the field. + +
+
+
+
+
+ + ☆ Normalization Is All You Need: Understanding Layer-Normalized Federated + Learning under Extreme Label Shift + + +
+ Layer normalization (LN) is a widely adopted deep learning technique, +especially in the era of foundation models. Recently, LN has been shown to be +surprisingly effective in federated learning (FL) with non-i.i.d. data. +However, exactly why and how it works remains mysterious. In this work, we +reveal the profound connection between layer normalization and the label shift +problem in federated learning. To understand layer normalization better in FL, +we identify the key contributing mechanism of normalization methods in FL, +called feature normalization (FN), which applies normalization to the latent +feature representation before the classifier head. Although LN and FN do not +improve expressive power, they control feature collapse and local overfitting +to heavily skewed datasets, and thus accelerate global training. Empirically, +we show that normalization leads to drastic improvements on standard benchmarks +under extreme label shift. Moreover, we conduct extensive ablation studies to +understand the critical factors of layer normalization in FL. Our results +verify that FN is an essential ingredient inside LN to significantly improve +the convergence of FL while remaining robust to learning rate choices, +especially under extreme label shift where each client has access to few +classes. + +
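A minimal sketch of feature normalization as described above, with the penultimate representation normalized before the classifier head (the model sizes are arbitrary and not from the paper):

```python
# Hedged sketch: feature normalization (FN) applied right before the classifier head.
import torch
import torch.nn as nn
import torch.nn.functional as F

class FNClassifier(nn.Module):
    def __init__(self, in_dim=32, hidden=64, num_classes=10):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU())
        self.head = nn.Linear(hidden, num_classes)

    def forward(self, x):
        z = self.body(x)
        z = F.normalize(z, dim=1)   # FN: normalize latent features before the head
        return self.head(z)

model = FNClassifier()
logits = model(torch.randn(8, 32))
print(logits.shape)  # torch.Size([8, 10])
```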
+
+ comment: 23 pages, 9 figures +
+
+
+
+
+ + ☆ Attesting Distributional Properties of Training Data for Machine + Learning + + +
+ The success of machine learning (ML) has been accompanied by increased +concerns about its trustworthiness. Several jurisdictions are preparing ML +regulatory frameworks. One such concern is ensuring that model training data +has desirable distributional properties for certain sensitive attributes. For +example, draft regulations indicate that model trainers are required to show +that training datasets have specific distributional properties, such as +reflecting diversity of the population. + We propose the notion of property attestation allowing a prover (e.g., model +trainer) to demonstrate relevant distributional properties of training data to +a verifier (e.g., a customer) without revealing the data. We present an +effective hybrid property attestation combining property inference with +cryptographic mechanisms. + +
+
+
+
+
+ + ☆ Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free + Continual Learning ICCV 2023 + + +
+ In this work, we investigate exemplar-free class incremental learning (CIL) +with knowledge distillation (KD) as a regularization strategy, aiming to +prevent forgetting. KD-based methods are successfully used in CIL, but they +often struggle to regularize the model without access to exemplars of the +training data from previous tasks. Our analysis reveals that this issue +originates from substantial representation shifts in the teacher network when +dealing with out-of-distribution data. This causes large errors in the KD loss +component, leading to performance degradation in CIL. Inspired by recent +test-time adaptation methods, we introduce Teacher Adaptation (TA), a method +that concurrently updates the teacher and the main model during incremental +training. Our method seamlessly integrates with KD-based CIL approaches and +allows for consistent enhancement of their performance across multiple +exemplar-free CIL benchmarks. + +
+
+ comment: VCL workshop at ICCV 2023 +
+
+
+
+
+ + ☆ Latent State Models of Training Dynamics + + +
+ The impact of randomness on model training is poorly understood. How do +differences in data order and initialization actually manifest in the model, +such that some training runs outperform others or converge faster? Furthermore, +how can we interpret the resulting training dynamics and the phase transitions +that characterize different trajectories? To understand the effect of +randomness on the dynamics and outcomes of neural network training, we train +models multiple times with different random seeds and compute a variety of +metrics throughout training, such as the $L_2$ norm, mean, and variance of the +neural network's weights. We then fit a hidden Markov model (HMM) over the +resulting sequences of metrics. The HMM represents training as a stochastic +process of transitions between latent states, providing an intuitive overview +of significant changes during training. Using our method, we produce a +low-dimensional, discrete representation of training dynamics on grokking +tasks, image classification, and masked language modeling. We use the HMM +representation to study phase transitions and identify latent "detour" states +that slow down convergence. + +
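A minimal sketch of this analysis with the hmmlearn package, using synthetic metric sequences in place of real training runs (the metric choice and the number of latent states are assumptions):

```python
# Hedged sketch: fit a Gaussian HMM over per-step training metrics from several runs.
# Requires `pip install hmmlearn`.
import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.default_rng(0)
runs = []
for seed in range(5):                                  # stand-ins for real training runs
    steps = 200
    l2 = np.cumsum(rng.normal(0.05, 0.02, steps))      # e.g. weight L2 norm over training
    var = np.abs(rng.normal(1.0, 0.1, steps)) / (1 + np.arange(steps) / 50)
    runs.append(np.stack([l2, var], axis=1))

X = np.concatenate(runs)                               # hmmlearn expects concatenated sequences
lengths = [len(r) for r in runs]
hmm = GaussianHMM(n_components=4, covariance_type="diag", n_iter=50, random_state=0)
hmm.fit(X, lengths)
states = hmm.predict(runs[0])                          # latent-state trajectory of one run
print("visited latent states:", np.unique(states))
```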
+
+
+
+
+ + ☆ Decoupled conditional contrastive learning with variable metadata for + prostate lesion detection MICCAI + + +
+ Early diagnosis of prostate cancer is crucial for efficient treatment. +Multi-parametric Magnetic Resonance Images (mp-MRI) are widely used for lesion +detection. The Prostate Imaging Reporting and Data System (PI-RADS) has +standardized interpretation of prostate MRI by defining a score for lesion +malignancy. PI-RADS data is readily available from radiology reports but is +subject to high inter-reports variability. We propose a new contrastive loss +function that leverages weak metadata with multiple annotators per sample and +takes advantage of inter-reports variability by defining metadata confidence. +By combining metadata of varying confidence with unannotated data into a single +conditional contrastive loss function, we report a 3% AUC increase on lesion +detection on the public PI-CAI challenge dataset. + Code is available at: https://github.com/camilleruppli/decoupled_ccl + +
+
+ comment: Accepted at MILLanD workshop (MICCAI) +
+
+
+
+
+ + ☆ Privacy-Preserving 3-Layer Neural Network Training using Mere + Homomorphic Encryption Technique + + +
+ In this manuscript, we consider the problem of privacy-preserving training of +neural networks in the mere homomorphic encryption setting. We combine several +existing techniques, extend some of them, and finally enable the +training of 3-layer neural networks for both the regression and classification +problems using the mere homomorphic encryption technique. + +
+
+
+
+
+ + ☆ Transitivity-Preserving Graph Representation Learning for Bridging Local + Connectivity and Role-based Similarity + + +
+ Graph representation learning (GRL) methods, such as graph neural networks +and graph transformer models, have been successfully used to analyze +graph-structured data, mainly focusing on node classification and link +prediction tasks. However, the existing studies mostly only consider local +connectivity while ignoring long-range connectivity and the roles of nodes. In +this paper, we propose Unified Graph Transformer Networks (UGT) that +effectively integrate local and global structural information into fixed-length +vector representations. First, UGT learns local structure by identifying the +local substructures and aggregating features of the $k$-hop neighborhoods of +each node. Second, we construct virtual edges, bridging distant nodes with +structural similarity to capture the long-range dependencies. Third, UGT learns +unified representations through self-attention, encoding structural distance +and $p$-step transition probability between node pairs. Furthermore, we propose +a self-supervised learning task that effectively learns transition probability +to fuse local and global structural features, which could then be transferred +to other downstream tasks. Experimental results on real-world benchmark +datasets over various downstream tasks showed that UGT significantly +outperformed baselines that consist of state-of-the-art models. In addition, +UGT reaches the expressive power of the third-order Weisfeiler-Lehman +isomorphism test (3d-WL) in distinguishing non-isomorphic graph pairs. The +source code is available at +https://github.com/NSLab-CUK/Unified-Graph-Transformer. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Spatial LibriSpeech: An Augmented Dataset for Spatial Audio Learning + + +
+ We present Spatial LibriSpeech, a spatial audio dataset with over 650 hours +of 19-channel audio, first-order ambisonics, and optional distractor noise. +Spatial LibriSpeech is designed for machine learning model training, and it +includes labels for source position, speaking direction, room acoustics and +geometry. Spatial LibriSpeech is generated by augmenting LibriSpeech samples +with 200k+ simulated acoustic conditions across 8k+ synthetic rooms. To +demonstrate the utility of our dataset, we train models on four spatial audio +tasks, resulting in a median absolute error of 6.60° on 3D source +localization, 0.43m on distance, 90.66ms on T30, and 2.74dB on DRR estimation. +We show that the same models generalize well to widely-used evaluation +datasets, e.g., obtaining a median absolute error of 12.43° on 3D source +localization on TUT Sound Events 2018, and 157.32ms on T30 estimation on ACE +Challenge. + +
+
+
+
+
+ + ☆ Bridged-GNN: Knowledge Bridge Learning for Effective Knowledge Transfer CIKM2023 + + +
+ The data-hungry problem, characterized by insufficient and low-quality +data, poses obstacles for deep learning models. Transfer learning has been a +feasible way to transfer knowledge from high-quality external data of source +domains to limited data of target domains, which follows a domain-level +knowledge transfer to learn a shared posterior distribution. However, such methods are +usually built on strong assumptions, e.g., the domain invariant posterior +distribution, which is usually unsatisfied and may introduce noises, resulting +in poor generalization ability on target domains. Inspired by Graph Neural +Networks (GNNs) that aggregate information from neighboring nodes, we redefine +the paradigm as learning a knowledge-enhanced posterior distribution for target +domains, namely Knowledge Bridge Learning (KBL). KBL first learns the scope of +knowledge transfer by constructing a Bridged-Graph that connects knowledgeable +samples to each target sample and then performs sample-wise knowledge transfer +via GNNs. KBL is free from strong assumptions and is robust to noises in the +source data. Guided by KBL, we propose the Bridged-GNN, including an Adaptive +Knowledge Retrieval module to build the Bridged-Graph and a Graph Knowledge +Transfer module. Comprehensive experiments on both non-relational and relational +data-hungry scenarios demonstrate the significant improvements of Bridged-GNN +compared with SOTA methods. + +
+
+ comment: Accepted by CIKM2023 +
+
+
+
+
+ + ☆ Predictive Authoring for Brazilian Portuguese Augmentative and + Alternative Communication + + +
+ Individuals with complex communication needs (CCN) often rely on augmentative +and alternative communication (AAC) systems to have conversations and +communicate their wants. Such systems allow message authoring by arranging +pictograms in sequence. However, the difficulty of finding the desired item to +complete a sentence can increase as the user's vocabulary increases. This paper +proposes using BERTimbau, a Brazilian Portuguese version of BERT, for pictogram +prediction in AAC systems. To finetune BERTimbau, we constructed an AAC corpus +for Brazilian Portuguese to use as a training corpus. We tested different +approaches to representing a pictogram for prediction: as a word (using +pictogram captions), as a concept (using a dictionary definition), and as a set +of synonyms (using related terms). We also evaluated the usage of images for +pictogram prediction. The results demonstrate that using embeddings computed +from the pictograms' captions, synonyms, or definitions yields similar +performance. Using synonyms leads to lower perplexity, but using captions leads +to the highest accuracies. This paper provides insight into how to represent a +pictogram for prediction using a BERT-like model and the potential of using +images for pictogram prediction. + +
+
+
+
+
+ + ☆ Balancing Transparency and Risk: The Security and Privacy Risks of + Open-Source Machine Learning Models + + +
+ The field of artificial intelligence (AI) has experienced remarkable progress +in recent years, driven by the widespread adoption of open-source machine +learning models in both research and industry. Considering the +resource-intensive nature of training on vast datasets, many applications opt +for models that have already been trained. Hence, a small number of key players +undertake the responsibility of training and publicly releasing large +pre-trained models, providing a crucial foundation for a wide range of +applications. However, the adoption of these open-source models carries +inherent privacy and security risks that are often overlooked. To provide a +concrete example, an inconspicuous model may conceal hidden functionalities +that, when triggered by specific input patterns, can manipulate the behavior of +the system, such as instructing self-driving cars to ignore the presence of +other vehicles. The implications of successful privacy and security attacks +encompass a broad spectrum, ranging from relatively minor damage like service +interruptions to highly alarming scenarios, including physical harm or the +exposure of sensitive user data. In this work, we present a comprehensive +overview of common privacy and security threats associated with the use of +open-source models. By raising awareness of these dangers, we strive to promote +the responsible and secure use of AI systems. + +
+
+
+
+
+ + ☆ Data augmentation and explainability for bias discovery and mitigation + in deep learning + + +
+ This dissertation explores the impact of bias in deep neural networks and +presents methods for reducing its influence on model performance. The first +part begins by categorizing and describing potential sources of bias and errors +in data and models, with a particular focus on bias in machine learning +pipelines. The next chapter outlines a taxonomy and methods of Explainable AI +as a way to justify predictions and control and improve the model. Then, as an +example of a laborious manual data inspection and bias discovery process, a +skin lesion dataset is manually examined. A Global Explanation for the Bias +Identification method is proposed as an alternative semi-automatic approach to +manual data exploration for discovering potential biases in data. Relevant +numerical methods and metrics are discussed for assessing the effects of the +identified biases on the model. Whereas identifying errors and bias is +critical, improving the model and reducing the number of flaws in the future is +an absolute priority. Hence, the second part of the thesis focuses on +mitigating the influence of bias on ML models. Three approaches are proposed +and discussed: Style Transfer Data Augmentation, Targeted Data Augmentations, +and Attribution Feedback. Style Transfer Data Augmentation aims to address +shape and texture bias by merging a style of a malignant lesion with a +conflicting shape of a benign one. Targeted Data Augmentations randomly insert +possible biases into all images in the dataset during the training, as a way to +make the process random and, thus, destroy spurious correlations. Lastly, +Attribution Feedback is used to fine-tune the model to improve its accuracy by +eliminating obvious mistakes and teaching it to ignore insignificant input +parts via an attribution loss. The goal of these approaches is to reduce the +influence of bias on machine learning models, rather than eliminate it +entirely. + +
+
+ comment: A PhD Thesis +
+
+
+
+
+ + ☆ Reconstructing $S$-matrix Phases with Machine Learning + + +
+ An important element of the $S$-matrix bootstrap program is the relationship +between the modulus of an $S$-matrix element and its phase. Unitarity relates +them by an integral equation. Even in the simplest case of elastic scattering, +this integral equation cannot be solved analytically and numerical approaches +are required. We apply modern machine learning techniques to studying the +unitarity constraint. We find that for a given modulus, when a phase exists it +can generally be reconstructed to good accuracy with machine learning. +Moreover, the loss of the reconstruction algorithm provides a good proxy for +whether a given modulus can be consistent with unitarity at all. In addition, +we study the question of whether multiple phases can be consistent with a +single modulus, finding novel phase-ambiguous solutions. In particular, we find +a new phase-ambiguous solution which pushes the known limit on such solutions +significantly beyond the previous bound. + +
+
+ comment: 43 pages, 21 figures +
+
+
+
+
+ + ☆ Defending Label Inference Attacks in Split Learning under Regression + Setting + + +
+ As a privacy-preserving method for implementing Vertical Federated Learning, +Split Learning has been extensively researched. However, numerous studies have +indicated that the privacy-preserving capability of Split Learning is +insufficient. In this paper, we primarily focus on label inference attacks in +Split Learning under regression setting, which are mainly implemented through +the gradient inversion method. To defend against label inference attacks, we +propose Random Label Extension (RLE), where labels are extended to obfuscate +the label information contained in the gradients, thereby preventing the +attacker from utilizing gradients to train an attack model that can infer the +original labels. To further minimize the impact on the original task, we +propose Model-based adaptive Label Extension (MLE), where original labels are +preserved in the extended labels and dominate the training process. The +experimental results show that compared to the basic defense methods, our +proposed defense methods can significantly reduce the attack model's +performance while preserving the original task's performance. + +
+
+
+
+
+ + ☆ An Efficient 1 Iteration Learning Algorithm for Gaussian Mixture Model + And Gaussian Mixture Embedding For Neural Network + + +
+ We propose a Gaussian Mixture Model (GMM) learning algorithm based on our previous work on the GMM expansion idea. The new algorithm is more robust and simpler than the classic Expectation Maximization (EM) algorithm, improves accuracy, and requires only one iteration for learning. We theoretically prove that the new algorithm is guaranteed to converge regardless of the parameter initialisation. Comparing our GMM expansion method with classic probability layers in neural networks shows a demonstrably better capability to overcome data uncertainty and inverse problems. Finally, we test a GMM-based generator, which shows potential for further applications that use distribution random sampling for stochastic variation as well as variation control.
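+ The abstract does not spell out the one-iteration update itself, so it is not reproduced here; as a point of reference, the classic EM baseline the paper compares against can be run with scikit-learn as follows (data and component count are purely illustrative).

import numpy as np
from sklearn.mixture import GaussianMixture

# Classic EM baseline for GMM fitting; the paper's one-iteration expansion
# algorithm would replace this iterative procedure.
rng = np.random.default_rng(0)
data = np.concatenate([rng.normal(-2.0, 0.5, 500),
                       rng.normal(3.0, 1.0, 500)]).reshape(-1, 1)

em_gmm = GaussianMixture(n_components=2, max_iter=100, random_state=0).fit(data)
print(em_gmm.means_.ravel(), em_gmm.weights_)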
+
+
+
+
+ + ☆ From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the + Right Reasons in Latent Space + + +
+ Deep Neural Networks are prone to learning spurious correlations embedded in +the training data, leading to potentially biased predictions. This poses risks +when deploying these models for high-stake decision-making, such as in medical +applications. Current methods for post-hoc model correction either require +input-level annotations, which are only possible for spatially localized +biases, or augment the latent feature space, thereby hoping to enforce the +right reasons. We present a novel method ensuring the right reasons on the +concept level by reducing the model's sensitivity towards biases through the +gradient. When modeling biases via Concept Activation Vectors, we highlight the +importance of choosing robust directions, as traditional regression-based +approaches such as Support Vector Machines tend to result in diverging +directions. We effectively mitigate biases in controlled and real-world +settings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet +and EfficientNet architectures. + +
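+ A minimal sketch of the idea described above, assuming a mean-difference concept direction and a squared directional-derivative penalty; the paper's exact choice of robust CAV direction and loss is not given in the abstract, so the names and the form of the penalty below are assumptions.

import torch

def mean_difference_cav(acts_with_bias, acts_without_bias):
    # One robust choice of concept direction: difference of class-conditional means.
    v = acts_with_bias.mean(dim=0) - acts_without_bias.mean(dim=0)
    return v / v.norm()

def cav_sensitivity_penalty(latent, logits, target_class, cav, lam=1.0):
    # latent: (B, D) activations with requires_grad=True; cav: (D,) unit vector.
    # Penalise the model's gradient sensitivity along the bias concept direction.
    score = logits[:, target_class].sum()
    grads, = torch.autograd.grad(score, latent, create_graph=True)
    sensitivity = grads @ cav
    return lam * (sensitivity ** 2).mean()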
+
+
+
+
+ + ☆ Can ultrasound confidence maps predict sonographers' labeling + variability? + + +
+ Measuring cross-sectional areas in ultrasound images is a standard tool to evaluate disease progress or treatment response. Often addressed today with supervised deep-learning segmentation approaches, existing solutions highly depend upon the quality of experts' annotations. However, the annotation quality in ultrasound is anisotropic and position-variant due to the inherent physical imaging principles, including attenuation, shadows, and missing boundaries, commonly exacerbated with depth. This work proposes a novel approach that guides ultrasound segmentation networks to account for sonographers' uncertainties and generate predictions with variability similar to the experts'. We claim that realistic variability can reduce overconfident predictions and improve physicians' acceptance of deep-learning cross-sectional segmentation solutions. Our method provides the confidence map's (CM) certainty for each pixel at minimal computational overhead, as it can be precalculated directly from the image. We show that there is a correlation between low values in the confidence maps and the experts' label uncertainty. Therefore, we propose to give the confidence maps as additional information to the networks. We study the effect of the proposed use of ultrasound CMs in combination with four state-of-the-art neural networks and in two configurations: as a second input channel and as part of the loss. We evaluate our method on 3D ultrasound datasets of the thyroid and lower limb muscles. Our results show ultrasound CMs increase the Dice score, improve the Hausdorff and Average Surface Distances, and decrease the number of isolated pixel predictions. Furthermore, our findings suggest that ultrasound CMs improve the penalization of uncertain areas in the ground truth data, thereby improving problematic interpolations. Our code and example data will be made public at https://github.com/IFL-CAMP/Confidence-segmentation.
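+ Of the two configurations mentioned above, the "second input channel" variant is simple to sketch: the precomputed confidence map is stacked with the image before it enters the segmentation network (the single convolution below is only a stand-in for a U-Net; sizes are illustrative).

import torch
import torch.nn as nn

def with_confidence_channel(image, confidence_map):
    # image, confidence_map: (B, 1, H, W); the CM is precomputed from the image.
    return torch.cat([image, confidence_map], dim=1)

seg_net = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=3, padding=1)  # stand-in
image = torch.rand(4, 1, 64, 64)
cm = torch.rand(4, 1, 64, 64)
logits = seg_net(with_confidence_channel(image, cm))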
+
+
+
+
+ + ☆ End-to-end topographic networks as models of cortical map formation and + human visual behaviour: moving beyond convolutions + + +
+ Computational models are an essential tool for understanding the origin and +functions of the topographic organisation of the primate visual system. Yet, +vision is most commonly modelled by convolutional neural networks that ignore +topography by learning identical features across space. Here, we overcome this +limitation by developing All-Topographic Neural Networks (All-TNNs). Trained on +visual input, several features of primate topography emerge in All-TNNs: smooth +orientation maps and cortical magnification in their first layer, and +category-selective areas in their final layer. In addition, we introduce a +novel dataset of human spatial biases in object recognition, which enables us +to directly link models to behaviour. We demonstrate that All-TNNs +significantly better align with human behaviour than previous state-of-the-art +convolutional models due to their topographic nature. All-TNNs thereby mark an +important step forward in understanding the spatial organisation of the visual +brain and how it mediates visual behaviour. + +
+
+
+
+
+ + ☆ Towards Understanding the Generalizability of Delayed Stochastic + Gradient Descent + + +
+ Stochastic gradient descent (SGD) performed in an asynchronous manner plays a crucial role in training large-scale machine learning models. However, the generalization performance of asynchronous delayed SGD, which is an essential metric for assessing machine learning algorithms, has rarely been explored. Existing generalization error bounds are rather pessimistic and cannot reveal the correlation between asynchronous delays and generalization. In this paper, we investigate sharper generalization error bounds for SGD with asynchronous delay $\tau$. Leveraging the generating function analysis tool, we first establish the average stability of the delayed gradient algorithm. Based on this algorithmic stability, we provide upper bounds on the generalization error of $\tilde{\mathcal{O}}(\frac{T-\tau}{n\tau})$ and $\tilde{\mathcal{O}}(\frac{1}{n})$ for quadratic convex and strongly convex problems, respectively, where $T$ refers to the iteration number and $n$ is the amount of training data. Our theoretical results indicate that asynchronous delays reduce the generalization error of the delayed SGD algorithm. Analogous analysis can be generalized to the random delay setting, and the experimental results validate our theoretical findings.
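+ The object of study, SGD with an asynchronous delay $\tau$, can be simulated directly: the gradient applied at step $t$ was computed on the iterate from step $t-\tau$. A toy least-squares sketch, purely illustrative and not the paper's experimental setup:

import numpy as np
from collections import deque

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
w_true = rng.normal(size=5)
y = X @ w_true + 0.1 * rng.normal(size=1000)

def delayed_sgd(tau, lr=0.05, steps=2000):
    w = np.zeros(5)
    pending = deque()                       # gradients computed on stale iterates
    for _ in range(steps):
        i = rng.integers(len(y))
        pending.append((X[i] @ w - y[i]) * X[i])
        if len(pending) > tau:              # applied only tau steps later
            w -= lr * pending.popleft()
    return np.linalg.norm(w - w_true)

print(delayed_sgd(tau=1), delayed_sgd(tau=32))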
+
+
+
+
+ + ☆ Self-Supervised Single-Image Deconvolution with Siamese Neural Networks MICCAI 2023 + + +
+ Inverse problems in image reconstruction are fundamentally complicated by +unknown noise properties. Classical iterative deconvolution approaches amplify +noise and require careful parameter selection for an optimal trade-off between +sharpness and grain. Deep learning methods allow for flexible parametrization +of the noise and learning its properties directly from the data. Recently, +self-supervised blind-spot neural networks were successfully adopted for image +deconvolution by including a known point-spread function in the end-to-end +training. However, their practical application has been limited to 2D images in +the biomedical domain because it implies large kernels that are poorly +optimized. We tackle this problem with Fast Fourier Transform convolutions that +provide training speed-up in 3D microscopy deconvolution tasks. Further, we +propose to adopt a Siamese invariance loss for deconvolution and empirically +identify its optimal position in the neural network between blind-spot and full +image branches. The experimental results show that our improved framework +outperforms the previous state-of-the-art deconvolution methods with a known +point spread function. + +
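+ The FFT convolution at the core of the reported speed-up is standard and easy to illustrate: a 3D volume is convolved with a point-spread function in the Fourier domain instead of with a large spatial kernel. The Gaussian PSF below is only a placeholder, and the learned blind-spot network itself is not sketched.

import numpy as np
from scipy.signal import fftconvolve

rng = np.random.default_rng(0)
volume = rng.random((64, 64, 64))

# Placeholder Gaussian-like point-spread function.
z, y, x = np.mgrid[-4:5, -4:5, -4:5]
psf = np.exp(-(x**2 + y**2 + z**2) / 4.0)
psf /= psf.sum()

blurred = fftconvolve(volume, psf, mode="same")  # O(N log N) instead of a large spatial kernel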
+
+ comment: Accepted for DALI @ MICCAI 2023 +
+
+
+
+
+ + ☆ Machine-Learning Solutions for the Analysis of Single-Particle Diffusion + Trajectories + + +
+ Single-particle traces of the diffusive motion of molecules, cells, or +animals are by-now routinely measured, similar to stochastic records of stock +prices or weather data. Deciphering the stochastic mechanism behind the +recorded dynamics is vital in understanding the observed systems. Typically, +the task is to decipher the exact type of diffusion and/or to determine system +parameters. The tools used in this endeavor are currently revolutionized by +modern machine-learning techniques. In this Perspective we provide an overview +over recently introduced methods in machine-learning for diffusive time series, +most notably, those successfully competing in the +Anomalous-Diffusion-Challenge. As such methods are often criticized for their +lack of interpretability, we focus on means to include uncertainty estimates +and feature-based approaches, both improving interpretability and providing +concrete insight into the learning process of the machine. We expand the +discussion by examining predictions on different out-of-distribution data. We +also comment on expected future developments. + +
+
+ comment: 25 pages, 11 figures +
+
+
+
+
+ + ☆ Metadata Improves Segmentation Through Multitasking Elicitation MICCAI 2023 + + +
+ Metainformation is a common companion to biomedical images. However, this +potentially powerful additional source of signal from image acquisition has had +limited use in deep learning methods, for semantic segmentation in particular. +Here, we incorporate metadata by employing a channel modulation mechanism in +convolutional networks and study its effect on semantic segmentation tasks. We +demonstrate that metadata as additional input to a convolutional network can +improve segmentation results while being inexpensive in implementation as a +nimble add-on to popular models. We hypothesize that this benefit of metadata +can be attributed to facilitating multitask switching. This aspect of +metadata-driven systems is explored and discussed in detail. + +
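+ The abstract does not specify the exact form of the channel modulation mechanism; a FiLM-style reading of it, in which metadata predicts a per-channel scale and shift, might look as follows (all names and sizes are illustrative assumptions).

import torch
import torch.nn as nn

class MetadataModulation(nn.Module):
    """Modulate feature maps with per-channel scale/shift predicted from metadata."""
    def __init__(self, meta_dim, channels):
        super().__init__()
        self.to_scale_shift = nn.Linear(meta_dim, 2 * channels)

    def forward(self, feats, meta):          # feats: (B, C, H, W), meta: (B, meta_dim)
        gamma, beta = self.to_scale_shift(meta).chunk(2, dim=1)
        return feats * (1 + gamma[..., None, None]) + beta[..., None, None]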
+
+ comment: Accepted for DART @ MICCAI 2023 +
+
+
+
+
+ + ☆ Learning MDL logic programs from noisy data + + +
+ Many inductive logic programming approaches struggle to learn programs from +noisy data. To overcome this limitation, we introduce an approach that learns +minimal description length programs from noisy data, including recursive +programs. Our experiments on several domains, including drug design, game +playing, and program synthesis, show that our approach can outperform existing +approaches in terms of predictive accuracies and scale to moderate amounts of +noise. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2206.01614 +
+
+
+
+
+ + ☆ On Gradient-like Explanation under a Black-box Setting: When Black-box + Explanations Become as Good as White-box + + +
+ Attribution methods shed light on the explainability of data-driven +approaches such as deep learning models by revealing the most contributing +features to decisions that have been made. A widely accepted way of deriving +feature attributions is to analyze the gradients of the target function with +respect to input features. Analysis of gradients requires full access to the +target system, meaning that solutions of this kind treat the target system as a +white-box. However, the white-box assumption may be untenable due to security +and safety concerns, thus limiting their practical applications. As an answer +to the limited flexibility, this paper presents GEEX (gradient-estimation-based +explanation), an explanation method that delivers gradient-like explanations +under a black-box setting. Furthermore, we integrate the proposed method with a +path method. The resulting approach iGEEX (integrated GEEX) satisfies the four +fundamental axioms of attribution methods: sensitivity, insensitivity, +implementation invariance, and linearity. With a focus on image data, the +exhaustive experiments empirically show that the proposed methods outperform +state-of-the-art black-box methods and achieve competitive performance compared +to the ones with full access. + +
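+ The abstract says GEEX delivers gradient-like explanations from forward queries alone; one standard way to obtain such an estimate, shown here as a hedged illustration rather than the paper's exact estimator, is Gaussian smoothing with antithetic samples.

import numpy as np

def estimate_gradient(f, x, sigma=0.1, n_samples=64, seed=0):
    # f: black-box scalar function (e.g. the target-class score); only queries are used.
    rng = np.random.default_rng(seed)
    grad = np.zeros_like(x, dtype=float)
    for _ in range(n_samples):
        u = rng.normal(size=x.shape)
        grad += (f(x + sigma * u) - f(x - sigma * u)) / (2 * sigma) * u
    return grad / n_samples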
+
+
+
+
+ + ☆ Deciphering knee osteoarthritis diagnostic features with explainable + artificial intelligence: A systematic review + + +
+ Existing artificial intelligence (AI) models for diagnosing knee +osteoarthritis (OA) have faced criticism for their lack of transparency and +interpretability, despite achieving medical-expert-like performance. This +opacity makes them challenging to trust in clinical practice. Recently, +explainable artificial intelligence (XAI) has emerged as a specialized +technique that can provide confidence in the model's prediction by revealing +how the prediction is derived, thus promoting the use of AI systems in +healthcare. This paper presents the first survey of XAI techniques used for +knee OA diagnosis. The XAI techniques are discussed from two perspectives: data +interpretability and model interpretability. The aim of this paper is to +provide valuable insights into XAI's potential towards a more reliable knee OA +diagnosis approach and encourage its adoption in clinical practice. + +
+
+
+
+
+ + ☆ Image Processing and Machine Learning for Hyperspectral Unmixing: An + Overview and the HySUPP Python Package + + +
+ Spectral pixels are often a mixture of the pure spectra of the materials, +called endmembers, due to the low spatial resolution of hyperspectral sensors, +double scattering, and intimate mixtures of materials in the scenes. Unmixing +estimates the fractional abundances of the endmembers within the pixel. +Depending on the prior knowledge of endmembers, linear unmixing can be divided +into three main groups: supervised, semi-supervised, and unsupervised (blind) +linear unmixing. Advances in Image processing and machine learning +substantially affected unmixing. This paper provides an overview of advanced +and conventional unmixing approaches. Additionally, we draw a critical +comparison between advanced and conventional techniques from the three +categories. We compare the performance of the unmixing techniques on three +simulated and two real datasets. The experimental results reveal the advantages +of different unmixing categories for different unmixing scenarios. Moreover, we +provide an open-source Python-based package available at +https://github.com/BehnoodRasti/HySUPP to reproduce the results. + +
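+ For the supervised (known-endmember) case mentioned above, a minimal linear-unmixing baseline is non-negative least squares per pixel followed by normalisation to fractional abundances; the HySUPP package wraps far more advanced methods, so this is only a conceptual sketch.

import numpy as np
from scipy.optimize import nnls

def unmix(pixels, endmembers):
    # pixels: (N, bands); endmembers: (K, bands) known pure spectra.
    abundances = np.array([nnls(endmembers.T, p)[0] for p in pixels])
    return abundances / np.clip(abundances.sum(axis=1, keepdims=True), 1e-12, None)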
+
+
+
+
+ + ☆ Noise Sensitivity and Stability of Deep Neural Networks for Binary + Classification + + +
+ A first step is taken towards understanding often observed non-robustness phenomena of deep neural net (DNN) classifiers. This is done from the perspective of Boolean functions by asking if certain sequences of Boolean functions represented by common DNN models are noise sensitive or noise stable, concepts defined in the Boolean function literature. Due to the natural randomness in DNN models, these concepts are extended to annealed and quenched versions. Here we sort out the relation between these definitions and investigate the properties of two standard DNN architectures, the fully connected and convolutional models, when initialized with Gaussian weights.
+
+
+
+
+ + ☆ Which Transformer to Favor: A Comparative Analysis of Efficiency in + Vision Transformers + + +
+ The growing popularity of Vision Transformers as the go-to models for image classification has led to an explosion of architectural modifications claiming to be more efficient than the original ViT. However, a wide diversity of experimental conditions prevents a fair comparison between all of them, based solely on their reported results. To address this gap in comparability, we conduct a comprehensive analysis of more than 30 models to evaluate the efficiency of vision transformers and related architectures, considering various performance metrics. Our benchmark provides a comparable baseline across the landscape of efficiency-oriented transformers, unveiling a plethora of surprising insights. For example, we discover that ViT is still Pareto optimal across multiple efficiency metrics, despite the existence of several alternative approaches claiming to be more efficient. Results also indicate that hybrid attention-CNN models fare particularly well when it comes to low inference memory and number of parameters, and that it is better to scale the model size than the image size. Furthermore, we uncover a strong positive correlation between the number of FLOPS and the training memory, which enables the estimation of required VRAM from theoretical measurements alone. Thanks to our holistic evaluation, this study offers valuable insights for practitioners and researchers, facilitating informed decisions when selecting models for specific applications. We publicly release our code and data at https://github.com/tobna/WhatTransformerToFavor
+
+
+
+
+ + ☆ A tailored Handwritten-Text-Recognition System for Medieval Latin + + +
+ The Bavarian Academy of Sciences and Humanities aims to digitize its Medieval Latin Dictionary. This dictionary comprises record cards referring to lemmas in medieval Latin, a low-resource language. A crucial step of the digitization process is the Handwritten Text Recognition (HTR) of the handwritten lemmas found on these record cards. In our work, we introduce an end-to-end pipeline, tailored to the medieval Latin dictionary, for locating, extracting, and transcribing the lemmas. We employ two state-of-the-art (SOTA) image segmentation models to prepare the initial data set for the HTR task. Furthermore, we experiment with different transformer-based models and conduct a set of experiments to explore the capabilities of different combinations of vision encoders with a GPT-2 decoder. Additionally, we apply extensive data augmentation, resulting in a highly competitive model. The best-performing setup achieved a Character Error Rate (CER) of 0.015, which is even superior to the commercial Google Cloud Vision model, and shows more stable performance.
+
+ comment: This paper has been accepted at the First Workshop on Ancient + Language Processing, co-located with RANLP 2023. This is the author's version + of the work. The definite version of record will be published in the + proceedings +
+
+
+
+
+ + ☆ On the Approximation of Bi-Lipschitz Maps by Invertible Neural Networks + + +
+ Invertible neural networks (INNs) represent an important class of deep neural +network architectures that have been widely used in several applications. The +universal approximation properties of INNs have also been established recently. +However, the approximation rate of INNs is largely missing. In this work, we +provide an analysis of the capacity of a class of coupling-based INNs to +approximate bi-Lipschitz continuous mappings on a compact domain, and the +result shows that it can well approximate both forward and inverse maps +simultaneously. Furthermore, we develop an approach for approximating +bi-Lipschitz maps on infinite-dimensional spaces that simultaneously +approximate the forward and inverse maps, by combining model reduction with +principal component analysis and INNs for approximating the reduced map, and we +analyze the overall approximation error of the approach. Preliminary numerical +results show the feasibility of the approach for approximating the solution +operator for parameterized second-order elliptic problems. + +
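+ The coupling-based INNs analysed above are built from blocks like the affine coupling layer below, which is exactly invertible by construction; this generic block is shown for orientation only and is not claimed to be the specific architecture studied in the paper.

import torch
import torch.nn as nn

class AffineCoupling(nn.Module):
    """One coupling block: transform one half of the input conditioned on the other half."""
    def __init__(self, dim, hidden=64):
        super().__init__()
        assert dim % 2 == 0
        self.net = nn.Sequential(nn.Linear(dim // 2, hidden), nn.ReLU(),
                                 nn.Linear(hidden, dim))

    def forward(self, x):
        x1, x2 = x.chunk(2, dim=-1)
        s, t = self.net(x1).chunk(2, dim=-1)
        return torch.cat([x1, x2 * torch.exp(s) + t], dim=-1)

    def inverse(self, y):
        y1, y2 = y.chunk(2, dim=-1)
        s, t = self.net(y1).chunk(2, dim=-1)
        return torch.cat([y1, (y2 - t) * torch.exp(-s)], dim=-1)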
+
+ comment: 32 pages +
+
+
+
+
+ + ☆ Multi-feature concatenation and multi-classifier stacking: an + interpretable and generalizable machine learning method for MDD + discrimination with rsfMRI + + +
+ Major depressive disorder is a serious and heterogeneous psychiatric disorder +that needs accurate diagnosis. Resting-state functional MRI (rsfMRI), which +captures multiple perspectives on brain structure, function, and connectivity, +is increasingly applied in the diagnosis and pathological research of mental +diseases. Different machine learning algorithms are then developed to exploit +the rich information in rsfMRI and discriminate MDD patients from normal +controls. Despite recent advances reported, the discrimination accuracy has +room for further improvement. The generalizability and interpretability of the +method are not sufficiently addressed either. Here, we propose a machine +learning method (MFMC) for MDD discrimination by concatenating multiple +features and stacking multiple classifiers. MFMC is tested on the REST-meta-MDD +data set that contains 2428 subjects collected from 25 different sites. MFMC +yields 96.9% MDD discrimination accuracy, demonstrating a significant +improvement over existing methods. In addition, the generalizability of MFMC is +validated by the good performance when the training and testing subjects are +from independent sites. The use of XGBoost as the meta classifier allows us to +probe the decision process of MFMC. We identify 13 feature values related to 9 +brain regions including the posterior cingulate gyrus, superior frontal gyrus +orbital part, and angular gyrus, which contribute most to the classification +and also demonstrate significant differences at the group level. The use of +these 13 feature values alone can reach 87% of MFMC's full performance when +taking all feature values. These features may serve as clinically useful +diagnostic and prognostic biomarkers for mental disorders in the future. + +
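+ The concatenate-features-then-stack-classifiers recipe maps directly onto scikit-learn's stacking API; GradientBoostingClassifier stands in below for the XGBoost meta-classifier used in the paper, and the base estimators are illustrative choices, not the paper's.

from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier,
                              StackingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# X would hold the concatenated rsfMRI feature sets, y the MDD/control labels.
mfmc_like = StackingClassifier(
    estimators=[("lr", LogisticRegression(max_iter=1000)),
                ("svm", SVC(probability=True)),
                ("rf", RandomForestClassifier(n_estimators=200))],
    final_estimator=GradientBoostingClassifier(),
    cv=5,
)
# mfmc_like.fit(X_train, y_train); mfmc_like.predict(X_test)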
+
+
+
+
+ + ☆ RLIPv2: Fast Scaling of Relational Language-Image Pre-training ICCV 2023 + + +
+ Relational Language-Image Pre-training (RLIP) aims to align vision +representations with relational texts, thereby advancing the capability of +relational reasoning in computer vision tasks. However, hindered by the slow +convergence of RLIPv1 architecture and the limited availability of existing +scene graph data, scaling RLIPv1 is challenging. In this paper, we propose +RLIPv2, a fast converging model that enables the scaling of relational +pre-training to large-scale pseudo-labelled scene graph data. To enable fast +scaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism +that facilitates earlier and deeper gated cross-modal fusion with sparsified +language encoding layers. ALIF leads to comparable or better performance than +RLIPv1 in a fraction of the time for pre-training and fine-tuning. To obtain +scene graph data at scale, we extend object detection datasets with free-form +relation labels by introducing a captioner (e.g., BLIP) and a designed Relation +Tagger. The Relation Tagger assigns BLIP-generated relation texts to region +pairs, thus enabling larger-scale relational pre-training. Through extensive +experiments conducted on Human-Object Interaction Detection and Scene Graph +Generation, RLIPv2 shows state-of-the-art performance on three benchmarks under +fully-finetuning, few-shot and zero-shot settings. Notably, the largest RLIPv2 +achieves 23.29mAP on HICO-DET without any fine-tuning, yields 32.22mAP with +just 1% data and yields 45.09mAP with 100% data. Code and models are publicly +available at https://github.com/JacobYuan7/RLIPv2. + +
+
+ comment: Accepted to ICCV 2023. Code and models: + https://github.com/JacobYuan7/RLIPv2 +
+
+
+
+
+ + ☆ Denoising diffusion-based MR to CT image translation enables whole spine + vertebral segmentation in 2D and 3D without manual annotations + + +
+ Background: Automated segmentation of spinal MR images plays a vital role +both scientifically and clinically. However, accurately delineating posterior +spine structures presents challenges. + Methods: This retrospective study, approved by the ethical committee, +involved translating T1w and T2w MR image series into CT images in a total of +n=263 pairs of CT/MR series. Landmark-based registration was performed to align +image pairs. We compared 2D paired (Pix2Pix, denoising diffusion implicit +models (DDIM) image mode, DDIM noise mode) and unpaired (contrastive unpaired +translation, SynDiff) image-to-image translation using "peak signal to noise +ratio" (PSNR) as quality measure. A publicly available segmentation network +segmented the synthesized CT datasets, and Dice scores were evaluated on +in-house test sets and the "MRSpineSeg Challenge" volumes. The 2D findings were +extended to 3D Pix2Pix and DDIM. + Results: 2D paired methods and SynDiff exhibited similar translation +performance and Dice scores on paired data. DDIM image mode achieved the +highest image quality. SynDiff, Pix2Pix, and DDIM image mode demonstrated +similar Dice scores (0.77). For craniocaudal axis rotations, at least two +landmarks per vertebra were required for registration. The 3D translation +outperformed the 2D approach, resulting in improved Dice scores (0.80) and +anatomically accurate segmentations in a higher resolution than the original MR +image. + Conclusion: Two landmarks per vertebra registration enabled paired +image-to-image translation from MR to CT and outperformed all unpaired +approaches. The 3D techniques provided anatomically correct segmentations, +avoiding underprediction of small structures like the spinous process. + +
+
+ comment: 35 pages, 7 figures, Code and a model weights available + https://doi.org/10.5281/zenodo.8221159 and + https://doi.org/10.5281/zenodo.8198697 +
+
+
+
+
+ + ☆ Surprise machines: revealing Harvard Art Museums' image collection + + +
+ Surprise Machines is a project of experimental museology that sets out to +visualize the entire image collection of the Harvard Art Museums, intending to +open up unexpected vistas on more than 200,000 objects usually inaccessible to +visitors. Part of the exhibition Curatorial A(i)gents organized by metaLAB (at) +Harvard, the project explores the limits of artificial intelligence to display +a large set of images and create surprise among visitors. To achieve such a +feeling of surprise, a choreographic interface was designed to connect the +audience's movement with several unique views of the collection. + +
+
+ comment: 14 pages and 7 figures +
+
+
+
+
+ + ☆ Document Automation Architectures: Updated Survey in Light of Large + Language Models + + +
+ This paper surveys the current state of the art in document automation (DA). +The objective of DA is to reduce the manual effort during the generation of +documents by automatically creating and integrating input from different +sources and assembling documents conforming to defined templates. There have +been reviews of commercial solutions of DA, particularly in the legal domain, +but to date there has been no comprehensive review of the academic research on +DA architectures and technologies. The current survey of DA reviews the +academic literature and provides a clearer definition and characterization of +DA and its features, identifies state-of-the-art DA architectures and +technologies in academic research, and provides ideas that can lead to new +research opportunities within the DA field in light of recent advances in +generative AI and large language models. + +
+
+ comment: The current paper is the updated version of an earlier survey on + document automation [Ahmadi Achachlouei et al. 2021]. Updates in the current + paper are as follows: We shortened almost all sections to reduce the size of + the main paper (without references) from 28 pages to 10 pages, added a review + of selected papers on large language models, removed certain sections and + most of diagrams. arXiv admin note: substantial text overlap with + arXiv:2109.11603 +
+
+
+
+
+ + ☆ Towards Attack-tolerant Federated Learning via Critical Parameter + Analysis ICCV'23 + + +
+ Federated learning is used to train a shared model in a decentralized way +without clients sharing private data with each other. Federated learning +systems are susceptible to poisoning attacks when malicious clients send false +updates to the central server. Existing defense strategies are ineffective +under non-IID data settings. This paper proposes a new defense strategy, FedCPA +(Federated learning with Critical Parameter Analysis). Our attack-tolerant +aggregation method is based on the observation that benign local models have +similar sets of top-k and bottom-k critical parameters, whereas poisoned local +models do not. Experiments with different attack scenarios on multiple datasets +demonstrate that our model outperforms existing defense strategies in defending +against poisoning attacks. + +
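+ The key observation above, that benign updates share their top-k and bottom-k critical parameters, can be quantified with a simple index-overlap score; how FedCPA ranks criticality and folds the score into aggregation is not given in the abstract, so the ranking below is an assumption.

import numpy as np

def critical_overlap(update_a, update_b, k):
    # Jaccard overlap of the top-k and bottom-k coordinates of two flattened local updates.
    def jaccard(i, j):
        i, j = set(i.tolist()), set(j.tolist())
        return len(i & j) / len(i | j)
    order_a, order_b = np.argsort(update_a), np.argsort(update_b)
    top = jaccard(order_a[-k:], order_b[-k:])
    bottom = jaccard(order_a[:k], order_b[:k])
    return 0.5 * (top + bottom)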
+
+ comment: ICCV'23 Accepted +
+
+
+
+
+ + ☆ Path Signatures for Seizure Forecasting + + +
+ Forecasting the state of a system from an observed time series is the subject +of research in many domains, such as computational neuroscience. Here, the +prediction of epileptic seizures from brain measurements is an unresolved +problem. There are neither complete models describing underlying brain +dynamics, nor do individual patients exhibit a single seizure onset pattern, +which complicates the development of a `one-size-fits-all' solution. Based on a +longitudinal patient data set, we address the automated discovery and +quantification of statistical features (biomarkers) that can be used to +forecast seizures in a patient-specific way. We use existing and novel feature +extraction algorithms, in particular the path signature, a recent development +in time series analysis. Of particular interest is how this set of complex, +nonlinear features performs compared to simpler, linear features on this task. +Our inference is based on statistical classification algorithms with in-built +subset selection to discern time series with and without an impending seizure +while selecting only a small number of relevant features. This study may be +seen as a step towards a generalisable pattern recognition pipeline for time +series in a broader context. + +
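+ The path signature mentioned above is a collection of iterated integrals of the time series; for a piecewise-linear path the first two levels can be computed directly (dedicated libraries compute higher depths, but the sketch below keeps things self-contained).

import numpy as np

def signature_level2(path):
    # path: (T, d) piecewise-linear path. Level 1 is the total increment; level 2
    # collects the pairwise iterated integrals (Chen's identity, one segment at a time).
    d = path.shape[1]
    s1, s2 = np.zeros(d), np.zeros((d, d))
    for dx in np.diff(path, axis=0):
        s2 += np.outer(s1, dx) + 0.5 * np.outer(dx, dx)
        s1 += dx
    return s1, s2

t = np.linspace(0.0, 1.0, 200)
lvl1, lvl2 = signature_level2(np.column_stack([t, np.sin(6 * t)]))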
+
+
+
+
+ + ☆ Variance reduction techniques for stochastic proximal point algorithms + + +
+ In the context of finite-sum minimization, variance reduction techniques are widely used to improve the performance of state-of-the-art stochastic gradient methods. Their practical impact is clear, and so are their theoretical properties. Stochastic proximal point algorithms have been studied as an alternative to stochastic gradient algorithms since they are more stable with respect to the choice of the stepsize, but a properly variance-reduced version is missing. In this work, we propose the first study of variance reduction techniques for stochastic proximal point algorithms. We introduce a stochastic proximal version of SVRG, SAGA, and some of their variants for smooth and convex functions. We provide several convergence results for the iterates and the objective function values. In addition, under the Polyak-{\L}ojasiewicz (PL) condition, we obtain linear convergence rates for the iterates and the function values. Our numerical experiments demonstrate the advantages of the proximal variance reduction methods over their gradient counterparts, especially regarding stability with respect to the choice of the stepsize.
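+ For intuition, the proximal point step is cheap whenever the per-sample proximal operator has a closed form, e.g. for least-squares terms; the plain stochastic iteration below is only the starting point that the paper then equips with SVRG/SAGA-style variance reduction (not shown here).

import numpy as np

def prox_step(w, x_i, y_i, gamma):
    # argmin_u 0.5*(x_i @ u - y_i)**2 + ||u - w||^2 / (2*gamma), in closed form.
    r = (x_i @ w - y_i) / (1.0 + gamma * (x_i @ x_i))
    return w - gamma * r * x_i

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
w_star = rng.normal(size=10)
y = X @ w_star

w = np.zeros(10)
for _ in range(5000):
    i = rng.integers(len(y))
    w = prox_step(w, X[i], y[i], gamma=0.5)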
+
+
+
+
+ + ☆ Meta-learning enhanced next POI recommendation by leveraging check-ins + from auxiliary cities + + +
+ Most existing point-of-interest (POI) recommenders aim to capture user preference by employing city-level user historical check-ins, thus facilitating users' exploration of the city. However, the scarcity of city-level user check-ins brings a significant challenge to user preference learning. Although prior studies attempt to mitigate this challenge by exploiting various context information, e.g., spatio-temporal information, they neglect to transfer the knowledge (i.e., common behavioral patterns) from other relevant cities (i.e., auxiliary cities). In this paper, we investigate the effect of knowledge distilled from auxiliary cities and thus propose a novel Meta-learning Enhanced next POI Recommendation framework (MERec). MERec leverages the correlation of check-in behaviors among various cities within the meta-learning paradigm to help infer user preference in the target city, by holding the principle of "paying more attention to more correlated knowledge". Particularly, a city-level correlation strategy is devised to attentively capture common patterns among cities, so as to transfer more relevant knowledge from more correlated cities. Extensive experiments verify the superiority of the proposed MERec against state-of-the-art algorithms.
+
+
+
+
+ + ☆ Online Class Incremental Learning on Stochastic Blurry Task Boundary via + Mask and Visual Prompt Tuning + + +
+ Continual learning aims to learn a model from a continuous stream of data, +but it mainly assumes a fixed number of data and tasks with clear task +boundaries. However, in real-world scenarios, the number of input data and +tasks is constantly changing in a statistical way, not a static way. Although +recently introduced incremental learning scenarios having blurry task +boundaries somewhat address the above issues, they still do not fully reflect +the statistical properties of real-world situations because of the fixed ratio +of disjoint and blurry samples. In this paper, we propose a new Stochastic +incremental Blurry task boundary scenario, called Si-Blurry, which reflects the +stochastic properties of the real-world. We find that there are two major +challenges in the Si-Blurry scenario: (1) inter- and intra-task forgettings and +(2) class imbalance problem. To alleviate them, we introduce Mask and Visual +Prompt tuning (MVP). In MVP, to address the inter- and intra-task forgetting +issues, we propose a novel instance-wise logit masking and contrastive visual +prompt tuning loss. Both of them help our model discern the classes to be +learned in the current batch. It results in consolidating the previous +knowledge. In addition, to alleviate the class imbalance problem, we introduce +a new gradient similarity-based focal loss and adaptive feature scaling to ease +overfitting to the major classes and underfitting to the minor classes. +Extensive experiments show that our proposed MVP significantly outperforms the +existing state-of-the-art methods in our challenging Si-Blurry scenario. + +
+
+
+
+
+ + ☆ Learning Reward Machines through Preference Queries over Sequences + + +
+ Reward machines have shown great promise at capturing non-Markovian reward +functions for learning tasks that involve complex action sequencing. However, +no algorithm currently exists for learning reward machines with realistic weak +feedback in the form of preferences. We contribute REMAP, a novel algorithm for +learning reward machines from preferences, with correctness and termination +guarantees. REMAP introduces preference queries in place of membership queries +in the L* algorithm, and leverages a symbolic observation table along with +unification and constraint solving to narrow the hypothesis reward machine +search space. In addition to the proofs of correctness and termination for +REMAP, we present empirical evidence measuring correctness: how frequently the +resulting reward machine is isomorphic under a consistent yet inexact teacher, +and the regret between the ground truth and learned reward machines. + +
+
+ comment: 24 pages, 10 figures +
+
+
+
+
+ + ☆ CARLA: A Self-supervised Contrastive Representation Learning Approach + for Time Series Anomaly Detection + + +
+ We introduce CARLA, a Self-supervised Contrastive Representation Learning Approach for Time Series Anomaly Detection: an innovative end-to-end self-supervised framework carefully developed to identify anomalous patterns in both univariate and multivariate time series data. By taking advantage of contrastive representation learning, CARLA effectively generates robust representations for time series windows. It achieves this by 1) learning similar representations for temporally close windows and dissimilar representations for windows and their equivalent anomalous windows and 2) employing a self-supervised approach to classify normal/anomalous representations of windows based on their nearest/furthest neighbours in the representation space. Most of the existing models focus on learning normal behaviour. The normal boundary is often tightly defined, which can result in slight deviations being classified as anomalies, resulting in a high false positive rate and limited ability to generalise normal patterns. CARLA's contrastive learning methodology promotes the production of highly consistent and discriminative predictions, thereby empowering us to adeptly address the inherent challenges associated with anomaly detection in time series data. Through extensive experimentation on 7 standard real-world time series anomaly detection benchmark datasets, CARLA demonstrates F1 and AU-PR superior to existing state-of-the-art results. Our research highlights the immense potential of contrastive representation learning in advancing the field of time series anomaly detection, thus paving the way for novel applications and in-depth exploration in this domain.
+
+ comment: 33 pages, 9 figures, 10 tables +
+
+
+
+
+ + ☆ How important are specialized transforms in Neural Operators? + + +
+ Simulating physical systems using Partial Differential Equations (PDEs) has become an indispensable part of modern industrial process optimization. Traditionally, numerical solvers have been used to solve the associated PDEs; however, recently, transform-based neural operators such as the Fourier Neural Operator and the Wavelet Neural Operator have received a lot of attention for their potential to provide fast solutions for systems of PDEs. In this work, we investigate the importance of the transform layers to the reported success of transform-based neural operators. In particular, we record the cost in terms of performance if all the transform layers are replaced by learnable linear layers. Surprisingly, we observe that linear layers suffice to provide performance comparable to the best-known transform-based layers and seem to do so with a compute time advantage as well. We believe that this observation can have significant implications for future work on Neural Operators, and might point to other sources of efficiencies for these architectures.
+
+ comment: 8 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ Graph-based Alignment and Uniformity for Recommendation + + +
+ Collaborative filtering-based recommender systems (RecSys) rely on learning +representations for users and items to predict preferences accurately. +Representation learning on the hypersphere is a promising approach due to its +desirable properties, such as alignment and uniformity. However, the sparsity +issue arises when it encounters RecSys. To address this issue, we propose a +novel approach, graph-based alignment and uniformity (GraphAU), that explicitly +considers high-order connectivities in the user-item bipartite graph. GraphAU +aligns the user/item embedding to the dense vector representations of +high-order neighbors using a neighborhood aggregator, eliminating the need to +compute the burdensome alignment to high-order neighborhoods individually. To +address the discrepancy in alignment losses, GraphAU includes a layer-wise +alignment pooling module to integrate alignment losses layer-wise. Experiments +on four datasets show that GraphAU significantly alleviates the sparsity issue +and achieves state-of-the-art performance. We open-source GraphAU at +https://github.com/YangLiangwei/GraphAU. + +
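+ The alignment and uniformity objectives that GraphAU builds on have well-known hypersphere formulations, sketched below; the graph-specific part (aligning against aggregated multi-hop neighbourhood representations and pooling the losses layer-wise) is not reproduced here.

import torch

def alignment(u, v):
    # u, v: L2-normalised embeddings of positive (user, item) pairs, shape (B, D).
    return (u - v).pow(2).sum(dim=1).mean()

def uniformity(x, t=2.0):
    # Log of the average Gaussian potential over all pairs of normalised embeddings.
    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()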
+
+ comment: 4 pages +
+
+
+
+
+ + ☆ HyperLoRA for PDEs + + +
+ Physics-informed neural networks (PINNs) have been widely used to develop neural surrogates for solutions of Partial Differential Equations. A drawback of PINNs is that they have to be retrained with every change in initial-boundary conditions and PDE coefficients. The Hypernetwork, a model-based meta-learning technique, takes in a parameterized task embedding as input and predicts the weights of a PINN as output. Predicting the weights of a neural network, however, is a high-dimensional regression problem, and hypernetworks perform sub-optimally while predicting parameters for large base networks. To circumvent this issue, we use a low-rank adaptation (LoRA) formulation to decompose every layer of the base network into low-ranked tensors and use hypernetworks to predict the low-ranked tensors. Despite the reduced dimensionality of the resulting weight-regression problem, LoRA-based hypernetworks violate the underlying physics of the given task. We demonstrate that the generalization capabilities of LoRA-based hypernetworks drastically improve when trained with an additional physics-informed loss component (HyperPINN) to satisfy the governing differential equations. We observe that LoRA-based HyperPINN training allows us to learn fast solutions for parameterized PDEs like Burgers' equation and the Navier-Stokes equations (Kovasznay flow), while having an 8x reduction in prediction parameters on average, without compromising on accuracy when compared to all other baselines.
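+ A minimal sketch of the mechanism described above: a small hypernetwork maps a task (PDE-parameter) embedding to the two low-rank factors of a LoRA update for a frozen base layer. Layer sizes, the single-task-vector assumption, and the omission of the physics-informed loss are all simplifications.

import torch
import torch.nn as nn

class HyperLoRALinear(nn.Module):
    def __init__(self, d_in, d_out, rank, task_dim):
        super().__init__()
        self.base = nn.Linear(d_in, d_out)
        for p in self.base.parameters():
            p.requires_grad_(False)          # base weights stay fixed
        self.rank, self.d_in, self.d_out = rank, d_in, d_out
        self.hyper = nn.Linear(task_dim, rank * (d_in + d_out))

    def forward(self, x, task_emb):          # x: (B, d_in), task_emb: (task_dim,)
        ab = self.hyper(task_emb)
        A = ab[: self.rank * self.d_out].view(self.d_out, self.rank)
        B = ab[self.rank * self.d_out:].view(self.rank, self.d_in)
        return self.base(x) + x @ (A @ B).t()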
+
+ comment: 8 pages, 4 figures, 3 Tables +
+
+
+
+
+ + ☆ A hybrid Decoder-DeepONet operator regression framework for unaligned + observation data + + +
+ Deep neural operators (DNOs) have been utilized to approximate nonlinear mappings between function spaces. However, DNOs face the challenge of increased dimensionality and computational cost associated with unaligned observation data. In this study, we propose a hybrid Decoder-DeepONet operator regression framework to handle unaligned data effectively. Additionally, we introduce a Multi-Decoder-DeepONet, which utilizes an average field of the training data as input augmentation. The consistency of the frameworks with operator approximation theory is established on the basis of the universal approximation theorem. Two numerical experiments, the Darcy problem and the flow field around an airfoil, are conducted to validate the efficiency and accuracy of the proposed methods. The results illustrate the advantages of Decoder-DeepONet and Multi-Decoder-DeepONet in handling unaligned observation data and showcase their potential in improving prediction accuracy.
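+ For orientation, the vanilla DeepONet that the Decoder-DeepONet framework builds on combines a branch net over sensor values with a trunk net over query coordinates; the decoder stage and the mean-field input augmentation of Multi-Decoder-DeepONet are not reproduced in this sketch.

import torch
import torch.nn as nn

class DeepONet(nn.Module):
    def __init__(self, n_sensors, coord_dim=2, p=64):
        super().__init__()
        self.branch = nn.Sequential(nn.Linear(n_sensors, 128), nn.Tanh(), nn.Linear(128, p))
        self.trunk = nn.Sequential(nn.Linear(coord_dim, 128), nn.Tanh(), nn.Linear(128, p))

    def forward(self, u_sensors, y):          # u_sensors: (B, n_sensors), y: (B, coord_dim)
        # Operator output at location y: inner product of branch and trunk features.
        return (self.branch(u_sensors) * self.trunk(y)).sum(dim=-1, keepdim=True)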
+
+ comment: 35 pages, 10 figures, 11 tables +
+
+
+
+
+ + ☆ Multi-Task Pseudo-Label Learning for Non-Intrusive Speech Quality + Assessment Model + + +
+ This study introduces multi-task pseudo-label (MPL) learning for a +non-intrusive speech quality assessment model. MPL consists of two stages which +are obtaining pseudo-label scores from a pretrained model and performing +multi-task learning. The 3QUEST metrics, namely Speech-MOS (S-MOS), Noise-MOS +(N-MOS), and General-MOS (G-MOS) are selected as the primary ground-truth +labels. Additionally, the pretrained MOSA-Net model is utilized to estimate +three pseudo-labels: perceptual evaluation of speech quality (PESQ), short-time +objective intelligibility (STOI), and speech distortion index (SDI). Multi-task +learning stage of MPL is then employed to train the MTQ-Net model (multi-target +speech quality assessment network). The model is optimized by incorporating +Loss supervision (derived from the difference between the estimated score and +the real ground-truth labels) and Loss semi-supervision (derived from the +difference between the estimated score and pseudo-labels), where Huber loss is +employed to calculate the loss function. Experimental results first demonstrate +the advantages of MPL compared to training the model from scratch and using +knowledge transfer mechanisms. Secondly, the benefits of Huber Loss in +improving the prediction model of MTQ-Net are verified. Finally, the MTQ-Net +with the MPL approach exhibits higher overall prediction capabilities when +compared to other SSL-based speech assessment models. + +
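+ The two-part objective described above (supervision from the real 3QUEST labels plus semi-supervision from MOSA-Net pseudo-labels, both under a Huber loss) can be sketched as below; the relative weight between the two terms is an assumption, as it is not stated in the abstract.

import torch.nn.functional as F

def mpl_loss(pred_mos, true_mos, pred_pseudo, pseudo_targets, alpha=1.0):
    supervised = F.huber_loss(pred_mos, true_mos)                 # vs. S-MOS / N-MOS / G-MOS
    semi_supervised = F.huber_loss(pred_pseudo, pseudo_targets)   # vs. PESQ / STOI / SDI
    return supervised + alpha * semi_supervised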
+
+
+
+
+ + ☆ Distribution shift mitigation at test time with performance guarantees + + +
+ Due to inappropriate sample selection and limited training data, a distribution shift often exists between the training and test sets. This shift can adversely affect the test performance of Graph Neural Networks (GNNs). Existing approaches mitigate this issue by either enhancing the robustness of GNNs to distribution shift or reducing the shift itself. However, both approaches necessitate retraining the model, which becomes infeasible when the model structure and parameters are inaccessible. To address this challenge, we propose FR-GNN, a general framework for GNNs to conduct feature reconstruction. FR-GNN constructs a mapping relationship between the output and input of a well-trained GNN to obtain class-representative embeddings and then uses these embeddings to reconstruct the features of labeled nodes. These reconstructed features are then incorporated into the message passing mechanism of GNNs to influence the predictions of unlabeled nodes at test time. Notably, the reconstructed node features can be directly utilized for testing the well-trained model, effectively reducing the distribution shift and leading to improved test performance. This remarkable achievement is attained without any modifications to the model structure or parameters. We provide theoretical guarantees for the effectiveness of our framework. Furthermore, we conduct comprehensive experiments on various public datasets. The experimental results demonstrate the superior performance of FR-GNN in comparison to mainstream methods.
+
+
+
+
+ + ☆ Capacity Bounds for Hyperbolic Neural Network Representations of Latent + Tree Structures + + +
+ We study the representation capacity of deep hyperbolic neural networks (HNNs) with a ReLU activation function. We establish the first proof that HNNs can $\varepsilon$-isometrically embed any finite weighted tree into a hyperbolic space of dimension $d$ at least equal to $2$ with prescribed sectional curvature $\kappa<0$, for any $\varepsilon> 1$ (with $\varepsilon=1$ being optimal). We establish rigorous upper bounds for the network complexity of an HNN implementing the embedding. We find that the network complexity of an HNN implementing the graph representation is independent of the representation fidelity/distortion. We contrast this result against our lower bounds on the distortion which any ReLU multi-layer perceptron (MLP) must exert when embedding a tree with $L>2^d$ leaves into a $d$-dimensional Euclidean space, which we show is at least $\Omega(L^{1/d})$, independently of the depth, width, and (possibly discontinuous) activation function defining the MLP.
+
+ comment: 22 Pages + References, 1 Table, 4 Figures +
+
+
+
+
+ + ☆ Active and Passive Causal Inference Learning + + +
+ This paper serves as a starting point for machine learning researchers, +engineers and students who are interested in but not yet familiar with causal +inference. We start by laying out an important set of assumptions that are +collectively needed for causal identification, such as exchangeability, +positivity, consistency and the absence of interference. From these +assumptions, we build out a set of important causal inference techniques, which +we do so by categorizing them into two buckets; active and passive approaches. +We describe and discuss randomized controlled trials and bandit-based +approaches from the active category. We then describe classical approaches, +such as matching and inverse probability weighting, in the passive category, +followed by more recent deep learning based algorithms. By finishing the paper +with some of the missing aspects of causal inference from this paper, such as +collider biases, we expect this paper to provide readers with a diverse set of +starting points for further reading and research in causal inference and +discovery. + +
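+ One of the passive approaches named above, inverse probability weighting, is compact enough to sketch end-to-end: fit a propensity model, then reweight observed outcomes by the probability of receiving the treatment actually received (variable names and the logistic propensity model are illustrative choices).

import numpy as np
from sklearn.linear_model import LogisticRegression

def ipw_ate(X, treatment, outcome):
    # Estimate the average treatment effect under exchangeability and positivity.
    propensity = LogisticRegression(max_iter=1000).fit(X, treatment).predict_proba(X)[:, 1]
    propensity = np.clip(propensity, 1e-3, 1 - 1e-3)   # avoid extreme weights
    treated = np.mean(treatment * outcome / propensity)
    control = np.mean((1 - treatment) * outcome / (1 - propensity))
    return treated - control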
+
+
+
+
+ + ☆ Generalized Sum Pooling for Metric Learning ICCV + + +
+ A common architectural choice for deep metric learning is a convolutional +neural network followed by global average pooling (GAP). Albeit simple, GAP is +a highly effective way to aggregate information. One possible explanation for +the effectiveness of GAP is considering each feature vector as representing a +different semantic entity and GAP as a convex combination of them. Following +this perspective, we generalize GAP and propose a learnable generalized sum +pooling method (GSP). GSP improves GAP with two distinct abilities: i) the +ability to choose a subset of semantic entities, effectively learning to ignore +nuisance information, and ii) learning the weights corresponding to the +importance of each entity. Formally, we propose an entropy-smoothed optimal +transport problem and show that it is a strict generalization of GAP, i.e., a +specific realization of the problem gives back GAP. We show that this +optimization problem enjoys analytical gradients enabling us to use it as a +direct learnable replacement for GAP. We further propose a zero-shot loss to +ease the learning of GSP. We show the effectiveness of our method with +extensive evaluations on 4 popular metric learning benchmarks. Code is +available at: GSP-DML Framework + +
+
+ comment: Accepted as a conference paper at International Conference on + Computer Vision (ICCV) 2023 +
+
+
+
+
+ + ☆ DMCVR: Morphology-Guided Diffusion Model for 3D Cardiac Volume + Reconstruction MICCAI 2023 + + +
+ Accurate 3D cardiac reconstruction from cine magnetic resonance imaging +(cMRI) is crucial for improved cardiovascular disease diagnosis and +understanding of the heart's motion. However, current cardiac MRI-based +reconstruction technology used in clinical settings is 2D with limited +through-plane resolution, resulting in low-quality reconstructed cardiac +volumes. To better reconstruct 3D cardiac volumes from sparse 2D image stacks, +we propose a morphology-guided diffusion model for 3D cardiac volume +reconstruction, DMCVR, that synthesizes high-resolution 2D images and +corresponding 3D reconstructed volumes. Our method outperforms previous +approaches by conditioning the cardiac morphology on the generative model, +eliminating the time-consuming iterative optimization process of the latent +code, and improving generation quality. The learned latent spaces provide +global semantics, local cardiac morphology and details of each 2D cMRI slice +with highly interpretable value to reconstruct 3D cardiac shape. Our +experiments show that DMCVR is highly effective in several aspects, such as 2D +generation and 3D reconstruction performance. With DMCVR, we can produce +high-resolution 3D cardiac MRI reconstructions, surpassing current techniques. +Our proposed framework has great potential for improving the accuracy of +cardiac disease diagnosis and treatment planning. Code can be accessed at +https://github.com/hexiaoxiao-cs/DMCVR. + +
+
+ comment: Accepted in MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Segmenting Known Objects and Unseen Unknowns without Prior Knowledge ICCV 2023 + + +
+ Panoptic segmentation methods assign a known class to each pixel given in +input. Even for state-of-the-art approaches, this inevitably enforces decisions +that systematically lead to wrong predictions for objects outside the training +categories. However, robustness against out-of-distribution samples and corner +cases is crucial in safety-critical settings to avoid dangerous consequences. +Since real-world datasets cannot contain enough data points to adequately +sample the long tail of the underlying distribution, models must be able to +deal with unseen and unknown scenarios as well. Previous methods targeted this +by re-identifying already-seen unlabeled objects. In this work, we propose the +necessary step to extend segmentation with a new setting which we term holistic +segmentation. Holistic segmentation aims to identify and separate objects of +unseen, unknown categories into instances without any prior knowledge about +them while performing panoptic segmentation of known classes. We tackle this +new problem with U3HS, which finds unknowns as highly uncertain regions and +clusters their corresponding instance-aware embeddings into individual objects. +By doing so, for the first time in panoptic segmentation with unknown objects, +our U3HS is trained without unknown categories, reducing assumptions and +leaving the settings as unconstrained as in real-life scenarios. Extensive +experiments on public data from MS COCO, Cityscapes, and Lost&Found demonstrate +the effectiveness of U3HS for this new, challenging, and assumptions-free +setting called holistic segmentation. Project page: +https://holisticseg.github.io. + +
+
+ comment: ICCV 2023. Project page: https://holisticseg.github.io +
+
+
+
+
+ + ♻ ☆ End-to-End Feasible Optimization Proxies for Large-Scale Economic + Dispatch + + +
+ The paper proposes a novel End-to-End Learning and Repair (E2ELR) +architecture for training optimization proxies for economic dispatch problems. +E2ELR combines deep neural networks with closed-form, differentiable repair +layers, thereby integrating learning and feasibility in an end-to-end fashion. +E2ELR is also trained with self-supervised learning, removing the need for +labeled data and the solving of numerous optimization problems offline. E2ELR +is evaluated on industry-size power grids with tens of thousands of buses using +an economic dispatch that co-optimizes energy and reserves. The results +demonstrate that the self-supervised E2ELR achieves state-of-the-art +performance, with optimality gaps that outperform other baselines by at least +an order of magnitude. + +
+
+
+
+
+ + ♻ ☆ Human-Like Intuitive Behavior and Reasoning Biases Emerged in Language + Models -- and Disappeared in GPT-4 + + +
+ Large language models (LLMs) are currently at the forefront of intertwining +AI systems with human communication and everyday life. Therefore, it is of +great importance to evaluate their emerging abilities. In this study, we show +that LLMs, most notably GPT-3, exhibit behavior that strikingly resembles +human-like intuition -- and the cognitive errors that come with it. However, +LLMs with higher cognitive capabilities, in particular ChatGPT and GPT-4, +learned to avoid succumbing to these errors and perform in a hyperrational +manner. For our experiments, we probe LLMs with the Cognitive Reflection Test +(CRT) as well as semantic illusions that were originally designed to +investigate intuitive decision-making in humans. Moreover, we probe how sturdy +the inclination for intuitive-like decision-making is. Our study demonstrates +that investigating LLMs with methods from psychology has the potential to +reveal otherwise unknown emergent traits. + +
+
+ comment: Overlap with arXiv:2212.05206 +
+
+
+
+
+ + ♻ ☆ PC-Droid: Faster diffusion and improved quality for particle cloud + generation + + +
+ Building on the success of PC-JeDi we introduce PC-Droid, a substantially +improved diffusion model for the generation of jet particle clouds. By +leveraging a new diffusion formulation, studying more recent integration +solvers, and training on all jet types simultaneously, we are able to achieve +state-of-the-art performance for all types of jets across all evaluation +metrics. We study the trade-off between generation speed and quality by +comparing two attention based architectures, as well as the potential of +consistency distillation to reduce the number of diffusion steps. Both the +faster architecture and consistency models demonstrate performance surpassing +many competing models, with generation time up to two orders of magnitude +faster than PC-JeDi and three orders of magnitude faster than Delphes. + +
+
+ comment: 21 pages, 8 tables, 13 figures +
+
+
+
+
+ + ♻ ☆ A Tractable Online Learning Algorithm for the Multinomial Logit + Contextual Bandit + + +
+ In this paper, we consider the contextual variant of the MNL-Bandit problem. +More specifically, we consider a dynamic set optimization problem, where a +decision-maker offers a subset (assortment) of products to a consumer and +observes the response in every round. Consumers purchase products to maximize +their utility. We assume that a set of attributes describe the products, and +the mean utility of a product is linear in the values of these attributes. We +model consumer choice behavior using the widely used Multinomial Logit (MNL) +model and consider the decision maker problem of dynamically learning the model +parameters while optimizing cumulative revenue over the selling horizon $T$. +Though this problem has attracted considerable attention in recent times, many +existing methods often involve solving an intractable non-convex optimization +problem. Their theoretical performance guarantees depend on a problem-dependent +parameter which could be prohibitively large. In particular, existing +algorithms for this problem have regret bounded by $O(\sqrt{\kappa d T})$, +where $\kappa$ is a problem-dependent constant that can have an exponential +dependency on the number of attributes. In this paper, we propose an optimistic +algorithm and show that the regret is bounded by $O(\sqrt{dT} + \kappa)$, +significantly improving the performance over existing methods. Further, we +propose a convex relaxation of the optimization step, which allows for +tractable decision-making while retaining the favourable regret guarantee. + +
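For readers unfamiliar with the choice model, the snippet below computes MNL purchase probabilities and expected revenue for an offered assortment under attribute-linear utilities; it illustrates only the model, not the optimistic algorithm or its confidence sets.

```python
import numpy as np

def mnl_assortment_revenue(theta, features, prices, assortment):
    """theta: (d,) utility parameters; features: (N, d) product attributes;
    prices: (N,) revenues; assortment: index array of offered products.
    The no-purchase option has utility 0, hence the 1 in the denominator."""
    u = features[assortment] @ theta                 # mean utilities, linear in attributes
    w = np.exp(u)
    probs = w / (1.0 + w.sum())                      # MNL purchase probabilities
    return probs, float(probs @ prices[assortment])  # expected revenue of the assortment
```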
+
+ comment: There is a technical issue on how Theorem 9 is used, an update is + underway +
+
+
+
+
+ + ♻ ☆ Blockchain-Enabled Federated Learning: A Reference Architecture Design, + Implementation, and Verification + + +
+ This paper presents an innovative reference architecture for
+blockchain-enabled federated learning (BCFL), a state-of-the-art approach that
+amalgamates the strengths of federated learning and blockchain technology. This
+results in a decentralized, collaborative machine learning system that respects
+data privacy and user-controlled identity. Our architecture strategically
+employs a decentralized identifier (DID)-based authentication system, allowing
+participants to authenticate and then gain access to the federated learning
+platform securely using their self-sovereign DIDs, which are recorded on the
+blockchain. Ensuring robust security and efficient decentralization through the
+execution of smart contracts is a key aspect of our approach. Moreover, our
+BCFL reference architecture provides significant extensibility, accommodating
+the integration of various additional elements, as per specific requirements
+and use cases, thereby rendering it an adaptable solution for a wide range of
+BCFL applications. The pivotal contribution of this study is the successful
+implementation and validation of a realistic BCFL reference architecture,
+marking a significant milestone in the field. We intend to make the source code
+publicly accessible shortly, fostering further advancements and adaptations
+within the community. This research not only bridges a crucial gap in the
+current literature but also lays a solid foundation for future explorations in
+the realm of BCFL.
+
+
+ comment: 14 pages, 15 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Remote Bio-Sensing: Open Source Benchmark Framework for Fair Evaluation + of rPPG + + +
+ rPPG (Remote photoplethysmography) is a technology that measures and analyzes +BVP (Blood Volume Pulse) by using the light absorption characteristics of +hemoglobin captured through a camera. Analyzing the measured BVP can derive +various physiological signals such as heart rate, stress level, and blood +pressure, which can be applied to various applications such as telemedicine, +remote patient monitoring, and early prediction of cardiovascular disease. rPPG +is rapidly evolving and attracting great attention from both academia and +industry by providing great usability and convenience as it can measure +biosignals using a camera-equipped device without medical or wearable devices. +Despite extensive efforts and advances in this field, serious challenges +remain, including issues related to skin color, camera characteristics, ambient +lighting, and other sources of noise and artifacts, which degrade accuracy +performance. We argue that fair and evaluable benchmarking is urgently required +to overcome these challenges and make meaningful progress from both academic +and commercial perspectives. In most existing work, models are trained, tested, +and validated only on limited datasets. Even worse, some studies lack available +code or reproducibility, making it difficult to fairly evaluate and compare +performance. Therefore, the purpose of this study is to provide a benchmarking +framework to evaluate various rPPG techniques across a wide range of datasets +for fair evaluation and comparison, including both conventional non-deep neural +network (non-DNN) and deep neural network (DNN) methods. GitHub URL: +https://github.com/remotebiosensing/rppg + +
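As context for what such a benchmark evaluates, here is the classic non-DNN "green channel" baseline in a few lines: average the green channel over a skin region, then read the heart rate off the dominant frequency. Array shapes and the frequency band are assumptions for the sketch, not part of the framework above.

```python
import numpy as np

def estimate_heart_rate(roi_frames, fps, lo=0.7, hi=4.0):
    """roi_frames: (T, H, W, 3) RGB crop of a skin region; fps: camera frame rate.
    Returns the dominant pulse frequency in the 0.7-4 Hz band, in beats per minute."""
    signal = roi_frames[..., 1].reshape(len(roi_frames), -1).mean(axis=1)  # green-channel trace
    signal = signal - signal.mean()                                        # remove the DC component
    spectrum = np.abs(np.fft.rfft(signal))
    freqs = np.fft.rfftfreq(len(signal), d=1.0 / fps)
    band = (freqs >= lo) & (freqs <= hi)
    return 60.0 * freqs[band][np.argmax(spectrum[band])]
```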
+
+ comment: 20 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Hard No-Box Adversarial Attack on Skeleton-Based Human Action + Recognition with Skeleton-Motion-Informed Gradient ICCV 2023 + + +
+ Recently, methods for skeleton-based human activity recognition have been +shown to be vulnerable to adversarial attacks. However, these attack methods +require either the full knowledge of the victim (i.e. white-box attacks), +access to training data (i.e. transfer-based attacks) or frequent model queries +(i.e. black-box attacks). All their requirements are highly restrictive, +raising the question of how detrimental the vulnerability is. In this paper, we +show that the vulnerability indeed exists. To this end, we consider a new +attack task: the attacker has no access to the victim model or the training +data or labels, where we coin the term hard no-box attack. Specifically, we +first learn a motion manifold where we define an adversarial loss to compute a +new gradient for the attack, named skeleton-motion-informed (SMI) gradient. Our +gradient contains information of the motion dynamics, which is different from +existing gradient-based attack methods that compute the loss gradient assuming +each dimension in the data is independent. The SMI gradient can augment many +gradient-based attack methods, leading to a new family of no-box attack +methods. Extensive evaluation and comparison show that our method imposes a +real threat to existing classifiers. They also show that the SMI gradient +improves the transferability and imperceptibility of adversarial samples in +both no-box and transfer-based black-box settings. + +
+
+ comment: Camera-ready version for ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Quantum Convolutional Neural Networks with Interaction Layers for + Classification of Classical Data + + +
+ Quantum Machine Learning (QML) has come into the limelight due to the +exceptional computational abilities of quantum computers. With the promises of +near error-free quantum computers in the not-so-distant future, it is important +that the effect of multi-qubit interactions on quantum neural networks is +studied extensively. This paper introduces a Quantum Convolutional Network with +novel Interaction layers exploiting three-qubit interactions increasing the +network's expressibility and entangling capability, for classifying both image +and one-dimensional data. The proposed approach is tested on three publicly +available datasets namely MNIST, Fashion MNIST, and Iris datasets, to perform +binary and multiclass classifications and is found to supersede the performance +of the existing state-of-the-art methods. + +
+
+ comment: 20 pages, 14 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Window-Based Early-Exit Cascades for Uncertainty Estimation: When Deep + Ensembles are More Efficient than Single Models ICCV 2023 + + +
+ Deep Ensembles are a simple, reliable, and effective method of improving both +the predictive performance and uncertainty estimates of deep learning +approaches. However, they are widely criticised as being computationally +expensive, due to the need to deploy multiple independent models. Recent work +has challenged this view, showing that for predictive accuracy, ensembles can +be more computationally efficient (at inference) than scaling single models +within an architecture family. This is achieved by cascading ensemble members +via an early-exit approach. In this work, we investigate extending these +efficiency gains to tasks related to uncertainty estimation. As many such +tasks, e.g. selective classification, are binary classification, our key novel +insight is to only pass samples within a window close to the binary decision +boundary to later cascade stages. Experiments on ImageNet-scale data across a +number of network architectures and uncertainty tasks show that the proposed +window-based early-exit approach is able to achieve a superior +uncertainty-computation trade-off compared to scaling single models. For +example, a cascaded EfficientNet-B2 ensemble is able to achieve similar +coverage at 5% risk as a single EfficientNet-B4 with <30% the number of MACs. +We also find that cascades/ensembles give more reliable improvements on OOD +data vs scaling models up. Code for this work is available at: +https://github.com/Guoxoug/window-early-exit. + +
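The windowing idea is simple enough to show directly (assumed interface, not the paper's code): only samples whose first-stage score lands inside a window around the binary decision threshold are escalated to the larger model.

```python
import numpy as np

def window_cascade(p_small, p_large, threshold=0.5, window=0.1):
    """p_small: (N,) positive-class probabilities from the cheap first stage;
    p_large: (N,) probabilities from the expensive stage (in a real deployment these
    would be computed only for escalated samples). Returns final scores and compute saved."""
    escalate = np.abs(p_small - threshold) < window      # close to the decision boundary
    p_final = np.where(escalate, p_large, p_small)
    return p_final, 1.0 - escalate.mean()                # fraction never sent downstream
```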
+
+ comment: Accepted to ICCV 2023 (camera-ready version, 9 pages) +
+
+
+
+
+ + ♻ ☆ Learning to Generate Training Datasets for Robust Semantic Segmentation + + +
+ Semantic segmentation techniques have shown significant progress in recent +years, but their robustness to real-world perturbations and data samples not +seen during training remains a challenge, particularly in safety-critical +applications. In this paper, we propose a novel approach to improve the +robustness of semantic segmentation techniques by leveraging the synergy +between label-to-image generators and image-to-label segmentation models. +Specifically, we design and train Robusta, a novel robust conditional +generative adversarial network to generate realistic and plausible perturbed or +outlier images that can be used to train reliable segmentation models. We +conduct in-depth studies of the proposed generative model, assess the +performance and robustness of the downstream segmentation network, and +demonstrate that our approach can significantly enhance the robustness of +semantic segmentation techniques in the face of real-world perturbations, +distribution shifts, and out-of-distribution samples. Our results suggest that +this approach could be valuable in safety-critical applications, where the +reliability of semantic segmentation techniques is of utmost importance and +comes with a limited computational budget in inference. We will release our +code shortly. + +
+
+
+
+
+ + ♻ ☆ DOMINO: Domain-invariant Hyperdimensional Classification for + Multi-Sensor Time Series Data + + +
+ With the rapid evolution of the Internet of Things, many real-world +applications utilize heterogeneously connected sensors to capture time-series +information. Edge-based machine learning (ML) methodologies are often employed +to analyze locally collected data. However, a fundamental issue across +data-driven ML approaches is distribution shift. It occurs when a model is +deployed on a data distribution different from what it was trained on, and can +substantially degrade model performance. Additionally, increasingly +sophisticated deep neural networks (DNNs) have been proposed to capture spatial +and temporal dependencies in multi-sensor time series data, requiring intensive +computational resources beyond the capacity of today's edge devices. While +brain-inspired hyperdimensional computing (HDC) has been introduced as a +lightweight solution for edge-based learning, existing HDCs are also vulnerable +to the distribution shift challenge. In this paper, we propose DOMINO, a novel +HDC learning framework addressing the distribution shift problem in noisy +multi-sensor time-series data. DOMINO leverages efficient and parallel matrix +operations on high-dimensional space to dynamically identify and filter out +domain-variant dimensions. Our evaluation on a wide range of multi-sensor time +series classification tasks shows that DOMINO achieves on average 2.04% higher +accuracy than state-of-the-art (SOTA) DNN-based domain generalization +techniques, and delivers 16.34x faster training and 2.89x faster inference. +More importantly, DOMINO performs notably better when learning from partially +labeled and highly imbalanced data, providing 10.93x higher robustness against +hardware noises than SOTA DNNs. + +
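A stripped-down sketch of the dimension-filtering idea (illustrative shapes and statistics, not DOMINO's actual procedure): rank hypervector dimensions by how much the per-domain class prototypes disagree, and keep only the most stable ones.

```python
import numpy as np

def filter_domain_variant_dims(class_hvs, keep_ratio=0.9):
    """class_hvs: (num_domains, num_classes, D) class hypervectors learned per domain.
    Dimensions whose prototypes vary most across domains are masked out as
    domain-variant; the remainder forms domain-invariant class prototypes."""
    variance = class_hvs.var(axis=0).mean(axis=0)        # per-dimension cross-domain variance
    k = int(keep_ratio * class_hvs.shape[-1])
    keep = np.argsort(variance)[:k]                      # most stable dimensions
    mask = np.zeros(class_hvs.shape[-1], dtype=bool)
    mask[keep] = True
    prototypes = class_hvs.mean(axis=0) * mask           # (num_classes, D), variant dims zeroed
    return mask, prototypes
```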
+
+
+
+
+ + ♻ ☆ RoCourseNet: Distributionally Robust Training of a Prediction Aware + Recourse Model + + +
+ Counterfactual (CF) explanations for machine learning (ML) models are +preferred by end-users, as they explain the predictions of ML models by +providing a recourse (or contrastive) case to individuals who are adversely +impacted by predicted outcomes. Existing CF explanation methods generate +recourses under the assumption that the underlying target ML model remains +stationary over time. However, due to commonly occurring distributional shifts +in training data, ML models constantly get updated in practice, which might +render previously generated recourses invalid and diminish end-users trust in +our algorithmic framework. To address this problem, we propose RoCourseNet, a +training framework that jointly optimizes predictions and recourses that are +robust to future data shifts. This work contains four key contributions: (1) We +formulate the robust recourse generation problem as a tri-level optimization +problem which consists of two sub-problems: (i) a bi-level problem that finds +the worst-case adversarial shift in the training data, and (ii) an outer +minimization problem to generate robust recourses against this worst-case +shift. (2) We leverage adversarial training to solve this tri-level +optimization problem by: (i) proposing a novel virtual data shift (VDS) +algorithm to find worst-case shifted ML models via explicitly considering the +worst-case data shift in the training dataset, and (ii) a block-wise coordinate +descent procedure to optimize for prediction and corresponding robust +recourses. (3) We evaluate RoCourseNet's performance on three real-world +datasets, and show that RoCourseNet consistently achieves more than 96% robust +validity and outperforms state-of-the-art baselines by at least 10% in +generating robust CF explanations. (4) Finally, we generalize the RoCourseNet +framework to accommodate any parametric post-hoc methods for improving robust +validity. + +
+
+
+
+
+ + ♻ ☆ Foundation Models in Smart Agriculture: Basics, Opportunities, and + Challenges + + +
+ The past decade has witnessed the rapid development of ML and DL +methodologies in agricultural systems, showcased by great successes in variety +of agricultural applications. However, these conventional ML/DL models have +certain limitations: They heavily rely on large, costly-to-acquire labeled +datasets for training, require specialized expertise for development and +maintenance, and are mostly tailored for specific tasks, thus lacking +generalizability. Recently, foundation models have demonstrated remarkable +successes in language and vision tasks across various domains. These models are +trained on a vast amount of data from multiple domains and modalities. Once +trained, they can accomplish versatile tasks with just minor fine-tuning and +minimal task-specific labeled data. Despite their proven effectiveness and huge +potential, there has been little exploration of applying FMs to agriculture +fields. Therefore, this study aims to explore the potential of FMs in the field +of smart agriculture. In particular, we present conceptual tools and technical +background to facilitate the understanding of the problem space and uncover new +research directions in this field. To this end, we first review recent FMs in +the general computer science domain and categorize them into four categories: +language FMs, vision FMs, multimodal FMs, and reinforcement learning FMs. +Subsequently, we outline the process of developing agriculture FMs and discuss +their potential applications in smart agriculture. We also discuss the unique +challenges associated with developing AFMs, including model training, +validation, and deployment. Through this study, we contribute to the +advancement of AI in agriculture by introducing AFMs as a promising paradigm +that can significantly mitigate the reliance on extensive labeled datasets and +enhance the efficiency, effectiveness, and generalization of agricultural AI +systems. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Generalized Bandit Regret Minimizer Framework in Imperfect Information + Extensive-Form Game + + +
+ Regret minimization methods are a powerful tool for learning approximate Nash +equilibrium (NE) in two-player zero-sum imperfect information extensive-form +games (IIEGs). We consider the problem in the interactive bandit-feedback +setting where we don't know the dynamics of the IIEG. In general, only the +interactive trajectory and the reached terminal node value $v(z^t)$ are +revealed. To learn NE, the regret minimizer is required to estimate the +full-feedback loss gradient $\ell^t$ by $v(z^t)$ and minimize the regret. In +this paper, we propose a generalized framework for this learning setting. It +presents a theoretical framework for the design and the modular analysis of the +bandit regret minimization methods. We demonstrate that the most recent bandit +regret minimization methods can be analyzed as a particular case of our +framework. Following this framework, we describe a novel method SIX-OMD to +learn approximate NE. It is model-free and extremely improves the best existing +convergence rate from the order of $O(\sqrt{X B/T}+\sqrt{Y C/T})$ to $O(\sqrt{ +M_{\mathcal{X}}/T} +\sqrt{ M_{\mathcal{Y}}/T})$. Moreover, SIX-OMD is +computationally efficient as it needs to perform the current strategy and +average strategy updates only along the sampled trajectory. + +
+
+ comment: The proof of this paper includes many errors, especially for SIX-OMD, + the regret bound of this algorithm is not right since this regret is lower + than the lowest theoretical regret bound obtained by information theory +
+
+
+
+
+ + ♻ ☆ On the Limitations of Model Stealing with Uncertainty Quantification + Models + + +
+ Model stealing aims at inferring a victim model's functionality at a fraction +of the original training cost. While the goal is clear, in practice the model's +architecture, weight dimension, and original training data can not be +determined exactly, leading to mutual uncertainty during stealing. In this +work, we explicitly tackle this uncertainty by generating multiple possible +networks and combining their predictions to improve the quality of the stolen +model. For this, we compare five popular uncertainty quantification models in a +model stealing task. Surprisingly, our results indicate that the considered +models only lead to marginal improvements in terms of label agreement (i.e., +fidelity) to the stolen model. To find the cause of this, we inspect the +diversity of the model's prediction by looking at the prediction variance as a +function of training iterations. We realize that during training, the models +tend to have similar predictions, indicating that the network diversity we +wanted to leverage using uncertainty quantification models is not (high) enough +for improvements on the model stealing task. + +
+
+ comment: 6 pages, 1 figure, 2 table, paper submitted to European Symposium on + Artificial Neural Networks, Computational Intelligence and Machine Learning +
+
+
+
+
+ + ♻ ☆ Local Function Complexity for Active Learning via Mixture of Gaussian + Processes + + +
+ Inhomogeneities in real-world data, e.g., due to changes in the observation +noise level or variations in the structural complexity of the source function, +pose a unique set of challenges for statistical inference. Accounting for them +can greatly improve predictive power when physical resources or computation +time is limited. In this paper, we draw on recent theoretical results on the +estimation of local function complexity (LFC), derived from the domain of local +polynomial smoothing (LPS), to establish a notion of local structural +complexity, which is used to develop a model-agnostic active learning (AL) +framework. Due to its reliance on pointwise estimates, the LPS model class is +not robust and scalable concerning large input space dimensions that typically +come along with real-world problems. Here, we derive and estimate the Gaussian +process regression (GPR)-based analog of the LPS-based LFC and use it as a +substitute in the above framework to make it robust and scalable. We assess the +effectiveness of our LFC estimate in an AL application on a prototypical +low-dimensional synthetic dataset, before taking on the challenging real-world +task of reconstructing a quantum chemical force field for a small organic +molecule and demonstrating state-of-the-art performance with a significantly +reduced training demand. + +
+
+ comment: 27 pages (+16 pages of references and appendices), 19 figures +
+
+
+
+
+ + ♻ ☆ Can Unstructured Pruning Reduce the Depth in Deep Neural Networks? + + +
+ Pruning is a widely used technique for reducing the size of deep neural +networks while maintaining their performance. However, such a technique, +despite being able to massively compress deep models, is hardly able to remove +entire layers from a model (even when structured): is this an addressable task? +In this study, we introduce EGP, an innovative Entropy Guided Pruning algorithm +aimed at reducing the size of deep neural networks while preserving their +performance. The key focus of EGP is to prioritize pruning connections in +layers with low entropy, ultimately leading to their complete removal. Through +extensive experiments conducted on popular models like ResNet-18 and Swin-T, +our findings demonstrate that EGP effectively compresses deep neural networks +while maintaining competitive performance levels. Our results not only shed +light on the underlying mechanism behind the advantages of unstructured +pruning, but also pave the way for further investigations into the intricate +relationship between entropy, pruning techniques, and deep learning +performance. The EGP algorithm and its insights hold great promise for +advancing the field of network compression and optimization. The source code +for EGP is released open-source. + +
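To make the stated intuition concrete, the sketch below scores layers by the entropy of their weight-magnitude histogram and prunes low-entropy layers more aggressively. Both the entropy estimate and the pruning schedule are stand-ins; the paper's EGP algorithm defines them differently.

```python
import numpy as np

def entropy_guided_prune(weights, base_frac=0.5):
    """weights: dict {layer_name: ndarray}. Returns per-layer keep masks and entropies.
    Low-entropy layers receive a larger pruning fraction, pushing them toward removal."""
    entropies = {}
    for name, w in weights.items():
        hist, _ = np.histogram(np.abs(w), bins=32)
        p = hist / hist.sum()
        entropies[name] = -np.sum(p[p > 0] * np.log(p[p > 0]))
    order = sorted(entropies, key=entropies.get)                     # lowest entropy first
    masks = {}
    for rank, name in enumerate(order):
        frac = min(base_frac * (1.5 - rank / max(len(order) - 1, 1)), 0.99)
        thresh = np.quantile(np.abs(weights[name]), frac)
        masks[name] = np.abs(weights[name]) > thresh                 # True = keep connection
    return masks, entropies
```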
+
+
+
+
+ + ♻ ☆ Robust Evaluation of Diffusion-Based Adversarial Purification ICCV 2023 + + +
+ We question the current evaluation practice on diffusion-based purification +methods. Diffusion-based purification methods aim to remove adversarial effects +from an input data point at test time. The approach gains increasing attention +as an alternative to adversarial training due to the disentangling between +training and testing. Well-known white-box attacks are often employed to +measure the robustness of the purification. However, it is unknown whether +these attacks are the most effective for the diffusion-based purification since +the attacks are often tailored for adversarial training. We analyze the current +practices and provide a new guideline for measuring the robustness of +purification methods against adversarial attacks. Based on our analysis, we +further propose a new purification strategy improving robustness compared to +the current diffusion-based purification methods. + +
+
+ comment: Accepted by ICCV 2023, Oral presentation +
+
+
+
+
+ + ♻ ☆ NASimEmu: Network Attack Simulator & Emulator for Training Agents + Generalizing to Novel Scenarios + + +
+ Current frameworks for training offensive penetration testing agents with +deep reinforcement learning struggle to produce agents that perform well in +real-world scenarios, due to the reality gap in simulation-based frameworks and +the lack of scalability in emulation-based frameworks. Additionally, existing +frameworks often use an unrealistic metric that measures the agents' +performance on the training data. NASimEmu, a new framework introduced in this +paper, addresses these issues by providing both a simulator and an emulator +with a shared interface. This approach allows agents to be trained in +simulation and deployed in the emulator, thus verifying the realism of the used +abstraction. Our framework promotes the development of general agents that can +transfer to novel scenarios unseen during their training. For the simulation +part, we adopt an existing simulator NASim and enhance its realism. The +emulator is implemented with industry-level tools, such as Vagrant, VirtualBox, +and Metasploit. Experiments demonstrate that a simulation-trained agent can be +deployed in emulation, and we show how to use the framework to train a general +agent that transfers into novel, structurally different scenarios. NASimEmu is +available as open-source. + +
+
+ comment: NASimEmu is available at https://github.com/jaromiru/NASimEmu and the + baseline agents at https://github.com/jaromiru/NASimEmu-agents +
+
+
+
+
+ + ♻ ☆ REAP: A Large-Scale Realistic Adversarial Patch Benchmark ICCV 2023 + + +
+ Machine learning models are known to be susceptible to adversarial +perturbation. One famous attack is the adversarial patch, a sticker with a +particularly crafted pattern that makes the model incorrectly predict the +object it is placed on. This attack presents a critical threat to +cyber-physical systems that rely on cameras such as autonomous cars. Despite +the significance of the problem, conducting research in this setting has been +difficult; evaluating attacks and defenses in the real world is exceptionally +costly while synthetic data are unrealistic. In this work, we propose the REAP +(REalistic Adversarial Patch) benchmark, a digital benchmark that allows the +user to evaluate patch attacks on real images, and under real-world conditions. +Built on top of the Mapillary Vistas dataset, our benchmark contains over +14,000 traffic signs. Each sign is augmented with a pair of geometric and +lighting transformations, which can be used to apply a digitally generated +patch realistically onto the sign. Using our benchmark, we perform the first +large-scale assessments of adversarial patch attacks under realistic +conditions. Our experiments suggest that adversarial patch attacks may present +a smaller threat than previously believed and that the success rate of an +attack on simpler digital simulations is not predictive of its actual +effectiveness in practice. We release our benchmark publicly at +https://github.com/wagner-group/reap-benchmark. + +
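The per-sign transform pair can be pictured with a small OpenCV sketch: a homography maps the patch onto the sign's corners and an affine relighting adjusts its brightness. In REAP these transforms are estimated from the annotated dataset; the corner coordinates and lighting coefficients below are placeholders.

```python
import cv2
import numpy as np

def apply_patch(image, patch, sign_corners, alpha=0.8, beta=-10.0):
    """image: (H, W, 3) scene; patch: (h, w, 3) digital patch;
    sign_corners: (4, 2) float32 pixel corners of the sign, clockwise from top-left."""
    h, w = patch.shape[:2]
    src = np.float32([[0, 0], [w, 0], [w, h], [0, h]])
    H = cv2.getPerspectiveTransform(src, np.float32(sign_corners))    # geometric transform
    size = (image.shape[1], image.shape[0])
    warped = cv2.warpPerspective(patch, H, size)
    mask = cv2.warpPerspective(np.full((h, w), 255, np.uint8), H, size) > 0
    relit = np.clip(alpha * warped.astype(np.float32) + beta, 0, 255).astype(np.uint8)
    out = image.copy()
    out[mask] = relit[mask]                                           # composite patch onto the sign
    return out
```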
+
+ comment: ICCV 2023. Code and benchmark can be found at + https://github.com/wagner-group/reap-benchmark +
+
+
+
+
+ + ♻ ☆ Reconstruction, forecasting, and stability of chaotic dynamics from + partial data + + +
+ The forecasting and computation of the stability of chaotic systems from +partial observations are tasks for which traditional equation-based methods may +not be suitable. In this computational paper, we propose data-driven methods to +(i) infer the dynamics of unobserved (hidden) chaotic variables (full-state +reconstruction); (ii) time forecast the evolution of the full state; and (iii) +infer the stability properties of the full state. The tasks are performed with +long short-term memory (LSTM) networks, which are trained with observations +(data) limited to only part of the state: (i) the low-to-high resolution LSTM +(LH-LSTM), which takes partial observations as training input, and requires +access to the full system state when computing the loss; and (ii) the +physics-informed LSTM (PI-LSTM), which is designed to combine partial +observations with the integral formulation of the dynamical system's evolution +equations. First, we derive the Jacobian of the LSTMs. Second, we analyse a +chaotic partial differential equation, the Kuramoto-Sivashinsky (KS), and the +Lorenz-96 system. We show that the proposed networks can forecast the hidden +variables, both time-accurately and statistically. The Lyapunov exponents and +covariant Lyapunov vectors, which characterize the stability of the chaotic +attractors, are correctly inferred from partial observations. Third, the +PI-LSTM outperforms the LH-LSTM by successfully reconstructing the hidden +chaotic dynamics when the input dimension is smaller or similar to the +Kaplan-Yorke dimension of the attractor. This work opens new opportunities for +reconstructing the full state, inferring hidden variables, and computing the +stability of chaotic systems from partial data. + +
+
+
+
+
+ + ♻ ☆ Streamlined Lensed Quasar Identification in Multiband Images via + Ensemble Networks + + +
+ Quasars experiencing strong lensing offer unique viewpoints on subjects +related to the cosmic expansion rate, the dark matter profile within the +foreground deflectors, and the quasar host galaxies. Unfortunately, identifying +them in astronomical images is challenging since they are overwhelmed by the +abundance of non-lenses. To address this, we have developed a novel approach by +ensembling cutting-edge convolutional networks (CNNs) -- for instance, ResNet, +Inception, NASNet, MobileNet, EfficientNet, and RegNet -- along with vision +transformers (ViTs) trained on realistic galaxy-quasar lens simulations based +on the Hyper Suprime-Cam (HSC) multiband images. While the individual model +exhibits remarkable performance when evaluated against the test dataset, +achieving an area under the receiver operating characteristic curve of $>$97.3% +and a median false positive rate of 3.6%, it struggles to generalize in real +data, indicated by numerous spurious sources picked by each classifier. A +significant improvement is achieved by averaging these CNNs and ViTs, resulting +in the impurities being downsized by factors up to 50. Subsequently, combining +the HSC images with the UKIRT, VISTA, and unWISE data, we retrieve +approximately 60 million sources as parent samples and reduce this to 892,609 +after employing a photometry preselection to discover $z>1.5$ lensed quasars +with Einstein radii of $\theta_\mathrm{E}<5$ arcsec. Afterward, the ensemble +classifier indicates 3080 sources with a high probability of being lenses, for +which we visually inspect, yielding 210 prevailing candidates awaiting +spectroscopic confirmation. These outcomes suggest that automated deep learning +pipelines hold great potential in effectively detecting strong lenses in vast +datasets with minimal manual visual inspection involved. + +
+
+ comment: Accepted for publication in the Astronomy & Astrophysics journal. 28 + pages, 11 figures, and 3 tables. We welcome comments from the reader +
+
+
+
+
+ + ♻ ☆ Relation-aware graph structure embedding with co-contrastive learning + for drug-drug interaction prediction + + +
+ Relation-aware graph structure embedding is promising for predicting +multi-relational drug-drug interactions (DDIs). Typically, most existing +methods begin by constructing a multi-relational DDI graph and then learning +relation-aware graph structure embeddings (RaGSEs) of drugs from the DDI graph. +Nevertheless, most existing approaches are usually limited in learning RaGSEs +of new drugs, leading to serious over-fitting when the test DDIs involve such +drugs. To alleviate this issue, we propose a novel DDI prediction method based +on relation-aware graph structure embedding with co-contrastive learning, +RaGSECo. The proposed RaGSECo constructs two heterogeneous drug graphs: a +multi-relational DDI graph and a multi-attribute drug-drug similarity (DDS) +graph. The two graphs are used respectively for learning and propagating the +RaGSEs of drugs, aiming to ensure all drugs, including new ones, can possess +effective RaGSEs. Additionally, we present a novel co-contrastive learning +module to learn drug-pairs (DPs) representations. This mechanism learns DP +representations from two distinct views (interaction and similarity views) and +encourages these views to supervise each other collaboratively to obtain more +discriminative DP representations. We evaluate the effectiveness of our RaGSECo +on three different tasks using two real datasets. The experimental results +demonstrate that RaGSECo outperforms existing state-of-the-art prediction +methods. + +
+
+ comment: 14 pages, 23 figures
+
+
+
+
+
+ + ♻ ☆ Machine learning methods for the search for L&T brown dwarfs in the data + of modern sky surveys + + +
+ According to various estimates, brown dwarfs (BD) should account for up to 25 +percent of all objects in the Galaxy. However, few of them are discovered and +well-studied, both individually and as a population. Homogeneous and complete +samples of brown dwarfs are needed for these kinds of studies. Due to their +weakness, spectral studies of brown dwarfs are rather laborious. For this +reason, creating a significant reliable sample of brown dwarfs, confirmed by +spectroscopic observations, seems unattainable at the moment. Numerous attempts +have been made to search for and create a set of brown dwarfs using their +colours as a decision rule applied to a vast amount of survey data. In this +work, we use machine learning methods such as Random Forest Classifier, +XGBoost, SVM Classifier and TabNet on PanStarrs DR1, 2MASS and WISE data to +distinguish L and T brown dwarfs from objects of other spectral and luminosity +classes. The explanation of the models is discussed. We also compare our models +with classical decision rules, proving their efficiency and relevance. + +
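A minimal version of the colour-based classification pipeline, assuming photometric magnitudes and binary labels have already been cross-matched; the feature construction and hyperparameters are illustrative rather than the paper's.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

def train_lt_dwarf_classifier(mags, labels):
    """mags: (N, B) magnitudes in adjacent bands; labels: 1 for L/T dwarfs, 0 otherwise.
    Colours (differences of neighbouring bands) are used as features."""
    colours = mags[:, :-1] - mags[:, 1:]
    X_tr, X_te, y_tr, y_te = train_test_split(
        colours, labels, test_size=0.25, stratify=labels, random_state=0)
    clf = RandomForestClassifier(n_estimators=300, class_weight="balanced",
                                 random_state=0).fit(X_tr, y_tr)
    print(classification_report(y_te, clf.predict(X_te)))
    return clf
```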
+
+ comment: 12 pages, 10 figures, Accepted for publication in Astronomy and + Computing +
+
+
+
+
+ + ♻ ☆ PTransIPs: Identification of phosphorylation sites based on protein + pretrained language model and Transformer + + +
+ Phosphorylation is central to numerous fundamental cellular processes,
+influencing the onset and progression of a variety of diseases. The correct
+identification of these phosphorylation sites is of great importance to unravel
+the intricate molecular mechanisms within cells and during viral infections,
+potentially leading to the discovery of new therapeutic targets. In this study,
+we introduce PTransIPs, a novel deep learning model for the identification of
+phosphorylation sites. PTransIPs treats amino acids within protein sequences as
+words, extracting unique encodings based on their type and sequential position.
+The model also incorporates embeddings from large pretrained protein models as
+additional data inputs. PTransIPs is further trained on a model combining a
+convolutional neural network with residual connections and a Transformer
+equipped with multi-head attention mechanisms. Finally, the model outputs
+classification results through a fully connected layer. The results of
+independent testing reveal that PTransIPs outperforms existing
+state-of-the-art (SOTA) methods, achieving AUROCs of 0.9232 and 0.9660 for
+identifying phosphorylated S/T and Y sites, respectively. In addition, ablation
+studies show that pretrained model embeddings contribute to the performance of
+PTransIPs. Furthermore, PTransIPs has interpretable amino acid preferences and a
+transparent training process, and generalizes to other bioactivity
+classification tasks. To facilitate usage, our code and data are publicly
+accessible at https://github.com/StatXzy7/PTransIPs.
+
+
+
+
+
+ + ♻ ☆ Comprehensive Training and Evaluation on Deep Reinforcement Learning for + Automated Driving in Various Simulated Driving Maneuvers SC 2023 + + +
+ Developing and testing automated driving models in the real world might be +challenging and even dangerous, while simulation can help with this, especially +for challenging maneuvers. Deep reinforcement learning (DRL) has the potential +to tackle complex decision-making and controlling tasks through learning and +interacting with the environment, thus it is suitable for developing automated +driving while not being explored in detail yet. This study carried out a +comprehensive study by implementing, evaluating, and comparing the two DRL +algorithms, Deep Q-networks (DQN) and Trust Region Policy Optimization (TRPO), +for training automated driving on the highway-env simulation platform. +Effective and customized reward functions were developed and the implemented +algorithms were evaluated in terms of onlane accuracy (how well the car drives +on the road within the lane), efficiency (how fast the car drives), safety (how +likely the car is to crash into obstacles), and comfort (how much the car makes +jerks, e.g., suddenly accelerates or brakes). Results show that the TRPO-based +models with modified reward functions delivered the best performance in most +cases. Furthermore, to train a uniform driving model that can tackle various +driving maneuvers besides the specific ones, this study expanded the +highway-env and developed an extra customized training environment, namely, +ComplexRoads, integrating various driving maneuvers and multiple road scenarios +together. Models trained on the designed ComplexRoads environment can adapt +well to other driving maneuvers with promising overall performance. Lastly, +several functionalities were added to the highway-env to implement this work. +The codes are open on GitHub at https://github.com/alaineman/drlcarsim-paper. + +
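A shaped reward of the kind described can be written down in a few lines; the weights and speed range below are placeholders, not the tuned values used in the study.

```python
def driving_reward(on_lane, speed, crashed, jerk,
                   v_min=20.0, v_max=30.0,
                   w_lane=1.0, w_speed=0.5, w_comfort=0.1, crash_penalty=10.0):
    """Combines the four evaluation criteria: on-lane accuracy, efficiency (speed),
    safety (collisions), and comfort (jerk). Returns the per-step reward."""
    r_lane = w_lane if on_lane else 0.0
    r_speed = w_speed * min(max((speed - v_min) / (v_max - v_min), 0.0), 1.0)
    r_comfort = -w_comfort * abs(jerk)
    r_safety = -crash_penalty if crashed else 0.0
    return r_lane + r_speed + r_comfort + r_safety
```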
+
+ comment: 6 pages, 3 figures, accepted by the 26th IEEE International + Conference on Intelligent Transportation Systems (ITSC 2023) +
+
+
+
+
+ + ♻ ☆ A Counterfactual Safety Margin Perspective on the Scoring of Autonomous + Vehicles' Riskiness + + +
+ Autonomous Vehicles (AVs) have the potential to provide numerous societal +benefits, such as decreased road accidents and increased overall transportation +efficiency. However, quantifying the risk associated with AVs is challenging +due to the lack of historical data and the rapidly evolving technology. This +paper presents a data-driven framework for comparing the risk of different AVs' +behaviors in various operational design domains (ODDs), based on counterfactual +simulations of "misbehaving" road users. We introduce the concept of +counterfactual safety margin, which represents the minimum deviation from +normal behavior that could lead to a collision. This concept helps to find the +most critical scenarios but also to assess the frequency and severity of risk +of AVs. We show that the proposed methodology is applicable even when the AV's +behavioral policy is unknown -- through worst- and best-case analyses -- making +the method useful also to external third-party risk assessors. Our experimental +results demonstrate the correlation between the safety margin, the driving +policy quality, and the ODD shedding light on the relative risk associated with +different AV providers. This work contributes to AV safety assessment and aids +in addressing legislative and insurance concerns surrounding this emerging +technology. + +
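The margin itself can be estimated with a simple search once a counterfactual simulator is available. The sketch below bisects over a scalar deviation magnitude and assumes collisions are monotone in that magnitude; `simulate` is a hypothetical user-supplied rollout, not part of the paper's framework.

```python
def counterfactual_safety_margin(simulate, max_dev=10.0, tol=0.1):
    """simulate(d) -> bool: True if a counterfactual rollout with road users deviating
    by magnitude d from nominal behaviour ends in a collision. Returns an estimate of
    the smallest deviation that causes a collision (infinity if none is found)."""
    if not simulate(max_dev):
        return float("inf")              # no collision even at the largest deviation tried
    lo, hi = 0.0, max_dev
    while hi - lo > tol:
        mid = 0.5 * (lo + hi)
        if simulate(mid):
            hi = mid                     # collision: the margin is at most mid
        else:
            lo = mid                     # safe: the margin is above mid
    return hi
```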
+
+ comment: updated affiliations +
+
+
+
+
+ + ♻ ☆ GHN-Q: Parameter Prediction for Unseen Quantized Convolutional + Architectures via Graph Hypernetworks + + +
+ Deep convolutional neural network (CNN) training via iterative optimization +has had incredible success in finding optimal parameters. However, modern CNN +architectures often contain millions of parameters. Thus, any given model for a +single architecture resides in a massive parameter space. Models with similar +loss could have drastically different characteristics such as adversarial +robustness, generalizability, and quantization robustness. For deep learning on +the edge, quantization robustness is often crucial. Finding a model that is +quantization-robust can sometimes require significant efforts. Recent works +using Graph Hypernetworks (GHN) have shown remarkable performance predicting +high-performant parameters of varying CNN architectures. Inspired by these +successes, we wonder if the graph representations of GHN-2 can be leveraged to +predict quantization-robust parameters as well, which we call GHN-Q. We conduct +the first-ever study exploring the use of graph hypernetworks for predicting +parameters of unseen quantized CNN architectures. We focus on a reduced CNN +search space and find that GHN-Q can in fact predict quantization-robust +parameters for various 8-bit quantized CNNs. Decent quantized accuracies are +observed even with 4-bit quantization despite GHN-Q not being trained on it. +Quantized finetuning of GHN-Q at lower bitwidths may bring further improvements +and is currently being explored. + +
+
+ comment: Updated Figure 1 and added additional results in Table 1. Initial + extended abstract version accepted at Edge Intelligence Workshop 2022 for + poster presentation +
+
+
+
+
+ + ♻ ☆ Dimension Independent Mixup for Hard Negative Sample in Collaborative + Filtering + + +
+ Collaborative filtering (CF) is a widely employed technique that predicts +user preferences based on past interactions. Negative sampling plays a vital +role in training CF-based models with implicit feedback. In this paper, we +propose a novel perspective based on the sampling area to revisit existing +sampling methods. We point out that current sampling methods mainly focus on +Point-wise or Line-wise sampling, lacking flexibility and leaving a significant +portion of the hard sampling area un-explored. To address this limitation, we +propose Dimension Independent Mixup for Hard Negative Sampling (DINS), which is +the first Area-wise sampling method for training CF-based models. DINS +comprises three modules: Hard Boundary Definition, Dimension Independent Mixup, +and Multi-hop Pooling. Experiments with real-world datasets on both matrix +factorization and graph-based models demonstrate that DINS outperforms other +negative sampling methods, establishing its effectiveness and superiority. Our +work contributes a new perspective, introduces Area-wise sampling, and presents +DINS as a novel approach that achieves state-of-the-art performance for +negative sampling. Our implementations are available in PyTorch. + +
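A compressed sketch of the dimension-independent mixing idea (simplified; the hard-boundary definition and multi-hop pooling modules are omitted): the hardest sampled negative is mixed with the positive item using an independent coefficient per embedding dimension, yielding an area-wise synthetic hard negative.

```python
import torch

def dimension_independent_mixup(pos_emb, neg_embs):
    """pos_emb: (B, D) positive item embeddings; neg_embs: (B, M, D) sampled negatives.
    Returns (B, D) synthetic hard negatives."""
    scores = torch.einsum("bd,bmd->bm", pos_emb, neg_embs)                    # hardness proxy
    hardest = neg_embs[torch.arange(neg_embs.size(0)), scores.argmax(dim=1)]  # (B, D)
    alpha = torch.rand_like(hardest)                                          # one coefficient per dimension
    return alpha * hardest + (1.0 - alpha) * pos_emb
```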
+
+
+
+
+ + ♻ ☆ Robust Single-view Cone-beam X-ray Pose Estimation with Neural Tuned + Tomography (NeTT) and Masked Neural Radiance Fields (mNeRF) + + +
+ Many tasks performed in image-guided, mini-invasive, medical procedures can +be cast as pose estimation problems, where an X-ray projection is utilized to +reach a target in 3D space. Expanding on recent advances in the differentiable +rendering of optically reflective materials, we introduce new methods for pose +estimation of radiolucent objects using X-ray projections, and we demonstrate +the critical role of optimal view synthesis in performing this task. We first +develop an algorithm (DiffDRR) that efficiently computes Digitally +Reconstructed Radiographs (DRRs) and leverages automatic differentiation within +TensorFlow. Pose estimation is performed by iterative gradient descent using a +loss function that quantifies the similarity of the DRR synthesized from a +randomly initialized pose and the true fluoroscopic image at the target pose. +We propose two novel methods for high-fidelity view synthesis, Neural Tuned +Tomography (NeTT) and masked Neural Radiance Fields (mNeRF). Both methods rely +on classic Cone-Beam Computerized Tomography (CBCT); NeTT directly optimizes +the CBCT densities, while the non-zero values of mNeRF are constrained by a 3D +mask of the anatomic region segmented from CBCT. We demonstrate that both NeTT +and mNeRF distinctly improve pose estimation within our framework. By defining +a successful pose estimate to be a 3D angle error of less than 3 deg, we find +that NeTT and mNeRF can achieve similar results, both with overall success +rates more than 93%. However, the computational cost of NeTT is significantly +lower than mNeRF in both training and pose estimation. Furthermore, we show +that a NeTT trained for a single subject can generalize to synthesize +high-fidelity DRRs and ensure robust pose estimations for all other subjects. +Therefore, we suggest that NeTT is an attractive option for robust pose +estimation using fluoroscopic projections. + +
+
+
+
+
+ + ♻ ☆ Mirror Diffusion Models + + +
+ Diffusion models have successfully been applied to generative tasks in +various continuous domains. However, applying diffusion to discrete categorical +data remains a non-trivial task. Moreover, generation in continuous domains +often requires clipping in practice, which motivates the need for a theoretical +framework for adapting diffusion to constrained domains. Inspired by the mirror +Langevin algorithm for the constrained sampling problem, in this theoretical +report we propose Mirror Diffusion Models (MDMs). We demonstrate MDMs in the +context of simplex diffusion and propose natural extensions to popular domains +such as image and text generation. + +
+
+
+
+
+ + ♻ ☆ FeDXL: Provable Federated Learning for Deep X-Risk Optimization + + +
+ In this paper, we tackle a novel federated learning (FL) problem for +optimizing a family of X-risks, to which no existing FL algorithms are +applicable. In particular, the objective has the form of $\mathbb E_{z\sim S_1} +f(\mathbb E_{z'\sim S_2} \ell(w; z, z'))$, where two sets of data $S_1, S_2$ +are distributed over multiple machines, $\ell(\cdot)$ is a pairwise loss that +only depends on the prediction outputs of the input data pairs $(z, z')$, and +$f(\cdot)$ is possibly a non-linear non-convex function. This problem has +important applications in machine learning, e.g., AUROC maximization with a +pairwise loss, and partial AUROC maximization with a compositional loss. The +challenges for designing an FL algorithm for X-risks lie in the +non-decomposability of the objective over multiple machines and the +interdependency between different machines. To this end, we propose an +active-passive decomposition framework that decouples the gradient's components +with two types, namely active parts and passive parts, where the active parts +depend on local data that are computed with the local model and the passive +parts depend on other machines that are communicated/computed based on +historical models and samples. Under this framework, we develop two provable FL +algorithms (FeDXL) for handling linear and nonlinear $f$, respectively, based +on federated averaging and merging. We develop a novel theoretical analysis to +combat the latency of the passive parts and the interdependency between the +local model parameters and the involved data for computing local gradient +estimators. We establish both iteration and communication complexities and show +that using the historical samples and models for computing the passive parts do +not degrade the complexities. We conduct empirical studies of FeDXL for deep +AUROC and partial AUROC maximization, and demonstrate their performance +compared with several baselines. + +
+
+ comment: International Conference on Machine Learning, 2023 +
+
+
+
+
+ + ♻ ☆ Balancing Exploration and Exploitation: Disentangled $β$-CVAE in De + Novo Drug Design + + +
+ Deep generative models have recently emerged as a promising de novo drug
+design method. In this respect, deep generative conditional variational
+autoencoder (CVAE) models are a powerful approach for generating novel
+molecules with desired drug-like properties. However, molecular graph-based
+models with disentanglement and multivariate explicit latent conditioning have
+not been fully elucidated. To address this, we proposed a molecular-graph
+$\beta$-CVAE model for de novo drug design. Here, we empirically tuned the
+value of disentanglement and assessed its ability to generate molecules with
+optimised univariate or multivariate properties. In particular, we optimised
+the octanol-water partition coefficient (ClogP), molar refractivity (CMR),
+quantitative estimate of drug-likeness (QED), and synthetic accessibility score
+(SAS). Results suggest that a lower $\beta$ value increases the uniqueness of
+generated molecules (exploration). Univariate optimisation results showed our
+model generated molecular property averages of ClogP = 41.07% $\pm$ 0.01% and
+CMR 66.76% $\pm$ 0.01% by the Ghose filter. Multivariate property optimisation
+results showed that our model generated an average of 30.07% $\pm$ 0.01%
+molecules for both desired properties. Furthermore, our model improved the QED
+and SAS (exploitation) of molecules generated. Together, these results suggest
+that the $\beta$-CVAE could balance exploration and exploitation through
+disentanglement and is a promising model for de novo drug design, thus
+providing a basis for future studies.
+
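The disentanglement knob referred to above is the usual $\beta$ weight on the KL term of the CVAE objective; a generic sketch of that loss is shown below (the reconstruction term and tensor shapes are placeholders for a molecular-graph decoder).

```python
import torch
import torch.nn.functional as F

def beta_cvae_loss(x_recon, x, mu, logvar, beta=0.5):
    """Reconstruction term plus beta-weighted KL divergence to the standard normal prior.
    Lower beta loosens the latent constraint, trading exploration against exploitation."""
    recon = F.binary_cross_entropy(x_recon, x, reduction="sum")       # placeholder decoder loss
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())      # KL(q(z|x,c) || N(0, I))
    return recon + beta * kl
```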
+
+
+
+
+ + ♻ ☆ DeepAccident: A Motion and Accident Prediction Benchmark for V2X + Autonomous Driving + + +
+ Safety is the primary priority of autonomous driving. Nevertheless, no +published dataset currently supports the direct and explainable safety +evaluation for autonomous driving. In this work, we propose DeepAccident, a +large-scale dataset generated via a realistic simulator containing diverse +accident scenarios that frequently occur in real-world driving. The proposed +DeepAccident dataset includes 57K annotated frames and 285K annotated samples, +approximately 7 times more than the large-scale nuScenes dataset with 40k +annotated samples. In addition, we propose a new task, end-to-end motion and +accident prediction, which can be used to directly evaluate the accident +prediction ability for different autonomous driving algorithms. Furthermore, +for each scenario, we set four vehicles along with one infrastructure to record +data, thus providing diverse viewpoints for accident scenarios and enabling V2X +(vehicle-to-everything) research on perception and prediction tasks. Finally, +we present a baseline V2X model named V2XFormer that demonstrates superior +performance for motion and accident prediction and 3D object detection compared +to the single-vehicle model. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Community Detection for Stochastic Block Models ICML 2022 + + +
+ The goal of community detection over graphs is to recover underlying +labels/attributes of users (e.g., political affiliation) given the connectivity +between users (represented by adjacency matrix of a graph). There has been +significant recent progress on understanding the fundamental limits of +community detection when the graph is generated from a stochastic block model +(SBM). Specifically, sharp information theoretic limits and efficient +algorithms have been obtained for SBMs as a function of $p$ and $q$, which +represent the intra-community and inter-community connection probabilities. In +this paper, we study the community detection problem while preserving the +privacy of the individual connections (edges) between the vertices. Focusing on +the notion of $(\epsilon, \delta)$-edge differential privacy (DP), we seek to +understand the fundamental tradeoffs between $(p, q)$, DP budget $(\epsilon, +\delta)$, and computational efficiency for exact recovery of the community +labels. + To this end, we present and analyze the associated information-theoretic +tradeoffs for three broad classes of differentially private community recovery +mechanisms: a) stability based mechanism; b) sampling based mechanisms; and c) +graph perturbation mechanisms. Our main findings are that stability and +sampling based mechanisms lead to a superior tradeoff between $(p,q)$ and the +privacy budget $(\epsilon, \delta)$; however this comes at the expense of +higher computational complexity. On the other hand, albeit low complexity, +graph perturbation mechanisms require the privacy budget $\epsilon$ to scale as +$\Omega(\log(n))$ for exact recovery. To the best of our knowledge, this is the +first work to study the impact of privacy constraints on the fundamental limits +for community detection. + +
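Of the three mechanism classes analysed, graph perturbation is the easiest to write down: randomized response flips each potential edge independently, which satisfies $\epsilon$-edge DP. The sketch below shows only that baseline; the stability- and sampling-based mechanisms from the paper are more involved.

```python
import numpy as np

def randomized_response_graph(adj, epsilon, rng=np.random.default_rng()):
    """adj: (n, n) symmetric 0/1 adjacency matrix. Each potential edge is flipped
    independently with probability 1 / (1 + e^epsilon), giving epsilon-edge DP."""
    n = adj.shape[0]
    flip_prob = 1.0 / (1.0 + np.exp(epsilon))
    flips = np.triu(rng.random((n, n)) < flip_prob, k=1)        # upper-triangle flip decisions
    upper = np.triu(adj, k=1).astype(int) ^ flips.astype(int)   # XOR flips the selected edges
    return upper + upper.T                                      # resymmetrize, zero diagonal
```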
+
+ comment: ICML 2022. https://proceedings.mlr.press/v162/mohamed22a.html +
+
+
+
+
+ + ♻ ☆ Goal Representations for Instruction Following: A Semi-Supervised + Language Interface to Control + + +
+ Our goal is for robots to follow natural language instructions like "put the +towel next to the microwave." But getting large amounts of labeled data, i.e. +data that contains demonstrations of tasks labeled with the language +instruction, is prohibitive. In contrast, obtaining policies that respond to +image goals is much easier, because any autonomous trial or demonstration can +be labeled in hindsight with its final state as the goal. In this work, we +contribute a method that taps into joint image- and goal- conditioned policies +with language using only a small amount of language data. Prior work has made +progress on this using vision-language models or by jointly training +language-goal-conditioned policies, but so far neither method has scaled +effectively to real-world robot tasks without significant human annotation. Our +method achieves robust performance in the real world by learning an embedding +from the labeled data that aligns language not to the goal image, but rather to +the desired change between the start and goal images that the instruction +corresponds to. We then train a policy on this embedding: the policy benefits +from all the unlabeled data, but the aligned embedding provides an interface +for language to steer the policy. We show instruction following across a +variety of manipulation tasks in different scenes, with generalization to +language instructions outside of the labeled data. Videos and code for our +approach can be found on our website: https://rail-berkeley.github.io/grif/ . + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Secure and Privacy-Preserving Automated Machine Learning Operations into + End-to-End Integrated IoT-Edge-Artificial Intelligence-Blockchain Monitoring + System for Diabetes Mellitus Prediction + + +
+ Diabetes Mellitus, one of the leading causes of death worldwide, has no cure +to date and can lead to severe health complications, such as retinopathy, limb +amputation, cardiovascular diseases, and neuronal disease, if left untreated. +Consequently, it becomes crucial to take precautionary measures to +avoid/predict the occurrence of diabetes. Machine learning approaches have been +proposed and evaluated in the literature for diabetes prediction. This paper +proposes an IoT-edge-Artificial Intelligence (AI)-blockchain system for +diabetes prediction based on risk factors. The proposed system is underpinned +by the blockchain to obtain a cohesive view of the risk factors data from +patients across different hospitals and to ensure security and privacy of the +user's data. Furthermore, we provide a comparative analysis of different +medical sensors, devices, and methods to measure and collect the risk factors +values in the system. Numerical experiments and comparative analysis were +carried out between our proposed system, using the most accurate random forest +(RF) model, and the two most used state-of-the-art machine learning approaches, +Logistic Regression (LR) and Support Vector Machine (SVM), using three +real-life diabetes datasets. The results show that the proposed system using RF +predicts diabetes with 4.57% more accuracy on average compared to LR and SVM, +with 2.87 times more execution time. Data balancing without feature selection +does not show significant improvement. The performance is improved by 1.14% and +0.02% after feature selection for PIMA Indian and Sylhet datasets respectively, +while it reduces by 0.89% for MIMIC III. + +
+
+
+
+
+
+
+
+ + Multimedia 15 + +
+
+
+ + ☆ Audiovisual Moments in Time: A Large-Scale Annotated Dataset of + Audiovisual Actions + + +
+ We present Audiovisual Moments in Time (AVMIT), a large-scale dataset of audiovisual action events. In an extensive annotation task, 11 participants labelled a subset of 3-second audiovisual videos from the Moments in Time dataset (MIT). For each trial, participants assessed whether the labelled audiovisual action event was present and whether it was the most prominent feature of the video. The dataset includes the annotation of 57,177 audiovisual videos, each independently evaluated by 3 of 11 trained participants. From this initial collection, we created a curated test set of 16 distinct action classes, with 60 videos each (960 videos). We also offer 2 sets of pre-computed audiovisual feature embeddings, using VGGish/YamNet for audio data and VGG16/EfficientNetB0 for visual data, thereby lowering the barrier to entry for audiovisual DNN research. We explored the advantages of AVMIT annotations and feature embeddings to improve performance on audiovisual event recognition. A series of 6 Recurrent Neural Networks (RNNs) were trained on either AVMIT-filtered audiovisual events or modality-agnostic events from MIT, and then tested on our audiovisual test set. In all RNNs, top-1 accuracy was increased by 2.71-5.94% by training exclusively on audiovisual events, even outweighing a three-fold increase in training data. We anticipate that the newly annotated AVMIT dataset will serve as a valuable resource for research and comparative experiments involving computational models and human participants, specifically when addressing research questions where audiovisual correspondence is of critical importance.
+
+
+
+
+ + ☆ PoSynDA: Multi-Hypothesis Pose Synthesis Domain Adaptation for Robust 3D + Human Pose Estimation + + +
+ The current 3D human pose estimators face challenges in adapting to new datasets due to the scarcity of 2D-3D pose pairs in target domain training sets. We present the Multi-Hypothesis Pose Synthesis Domain Adaptation (PoSynDA) framework to overcome this issue without extensive target domain annotation. Utilizing a diffusion-centric structure, PoSynDA simulates the 3D pose distribution in the target domain, filling the data diversity gap. By incorporating a multi-hypothesis network, it creates diverse pose hypotheses and aligns them with the target domain. Target-specific source augmentation obtains the target domain distribution data from the source domain by decoupling the scale and position parameters. The teacher-student paradigm and low-rank adaptation further refine the process. PoSynDA demonstrates competitive performance on benchmarks such as Human3.6M, MPI-INF-3DHP, and 3DPW, even comparable with the target-trained MixSTE model (Zhang et al., 2022). This work paves the way for the practical application of 3D human pose estimation. The code is available at https://github.com/hbing-l/PoSynDA.
+
+ comment: Accepted to ACM Multimedia 2023; 10 pages, 4 figures, 8 tables; the + code is at https://github.com/hbing-l/PoSynDA +
+
+
+
+
+ + ☆ Language-Guided Diffusion Model for Visual Grounding + + +
+ Visual grounding (VG) tasks involve explicit cross-modal alignment, as semantically corresponding image regions must be located for the language phrases provided. Existing approaches complete such visual-text reasoning in a single-step manner. Their performance relies heavily on large numbers of anchors and on over-designed multi-modal fusion modules built from human priors, leading to complicated frameworks that may be difficult to train and that overfit to specific scenarios. Even worse, such once-for-all reasoning mechanisms are incapable of refining boxes continuously to enhance query-region matching. In contrast, in this paper, we formulate an iterative reasoning process by denoising diffusion modeling. Specifically, we propose a language-guided diffusion framework for visual grounding, LG-DVG, which trains the model to progressively reason queried object boxes by denoising a set of noisy boxes with the language guide. To achieve this, LG-DVG gradually perturbs query-aligned ground truth boxes to noisy ones and reverses this process step by step, conditional on query semantics. Extensive experiments for our proposed framework on five widely used datasets validate the superior performance of solving visual grounding, a cross-modal alignment task, in a generative way. The source code is available at https://github.com/iQua/vgbase/tree/DiffusionVG.
+
+ comment: 20 pages, 16 figures +
+
+
+
+
+ + ☆ Multi-scale Target-Aware Framework for Constrained Image Splicing + Detection and Localization + + +
+ Constrained image splicing detection and localization (CISDL) is a +fundamental task of multimedia forensics, which detects splicing operation +between two suspected images and localizes the spliced region on both images. +Recent works regard it as a deep matching problem and have made significant +progress. However, existing frameworks typically perform feature extraction and +correlation matching as separate processes, which may hinder the model's +ability to learn discriminative features for matching and can be susceptible to +interference from ambiguous background pixels. In this work, we propose a +multi-scale target-aware framework to couple feature extraction and correlation +matching in a unified pipeline. In contrast to previous methods, we design a +target-aware attention mechanism that jointly learns features and performs +correlation matching between the probe and donor images. Our approach can +effectively promote the collaborative learning of related patches, and perform +mutual promotion of feature learning and correlation matching. Additionally, in +order to handle scale transformations, we introduce a multi-scale projection +method, which can be readily integrated into our target-aware framework that +enables the attention process to be conducted between tokens containing +information of varying scales. Our experiments demonstrate that our model, +which uses a unified pipeline, outperforms state-of-the-art methods on several +benchmark datasets and is robust against scale transformations. + +
+
+
+
+
+ + ☆ RLIPv2: Fast Scaling of Relational Language-Image Pre-training ICCV 2023 + + +
+ Relational Language-Image Pre-training (RLIP) aims to align vision +representations with relational texts, thereby advancing the capability of +relational reasoning in computer vision tasks. However, hindered by the slow +convergence of RLIPv1 architecture and the limited availability of existing +scene graph data, scaling RLIPv1 is challenging. In this paper, we propose +RLIPv2, a fast converging model that enables the scaling of relational +pre-training to large-scale pseudo-labelled scene graph data. To enable fast +scaling, RLIPv2 introduces Asymmetric Language-Image Fusion (ALIF), a mechanism +that facilitates earlier and deeper gated cross-modal fusion with sparsified +language encoding layers. ALIF leads to comparable or better performance than +RLIPv1 in a fraction of the time for pre-training and fine-tuning. To obtain +scene graph data at scale, we extend object detection datasets with free-form +relation labels by introducing a captioner (e.g., BLIP) and a designed Relation +Tagger. The Relation Tagger assigns BLIP-generated relation texts to region +pairs, thus enabling larger-scale relational pre-training. Through extensive +experiments conducted on Human-Object Interaction Detection and Scene Graph +Generation, RLIPv2 shows state-of-the-art performance on three benchmarks under +fully-finetuning, few-shot and zero-shot settings. Notably, the largest RLIPv2 +achieves 23.29mAP on HICO-DET without any fine-tuning, yields 32.22mAP with +just 1% data and yields 45.09mAP with 100% data. Code and models are publicly +available at https://github.com/JacobYuan7/RLIPv2. + +
+
+ comment: Accepted to ICCV 2023. Code and models: + https://github.com/JacobYuan7/RLIPv2 +
+
+
+
+
+ + ☆ LSCD: A Large-Scale Screen Content Dataset for Video Compression + + +
+ Multimedia compression allows us to watch videos, view pictures, and hear sounds within a limited bandwidth, which has helped the internet flourish. Over the past decades, multimedia compression has achieved great success using hand-crafted features and systems. With the development of artificial intelligence, a growing body of research applies neural networks to the video compression task in order to replace the complicated hand-engineered system. Beyond producing advanced algorithms, researchers have also extended compression to different content, such as User Generated Content (UGC). With the rapid development of mobile devices, screen content videos have become an important part of multimedia data. However, we find that the community lacks a large-scale dataset for screen content video compression, which impedes the fast development of the corresponding learning-based algorithms. In order to fill this gap and accelerate research on this special type of video, we propose the Large-scale Screen Content Dataset (LSCD), which contains 714 source sequences. We also provide an analysis of the proposed dataset to expose characteristics of screen content videos, which will help researchers better understand how to explore new algorithms. Besides collecting and post-processing the data to organize the dataset, we provide a benchmark containing the performance of both traditional codecs and learning-based methods.
+
+
+
+
+ + ☆ Audio-Visual Glance Network for Efficient Video Recognition ICCV 2023 + + +
+ Deep learning has made significant strides in video understanding tasks, but the computation required to classify lengthy and massive videos using clip-level video classifiers remains impractical and prohibitively expensive. To address this issue, we propose the Audio-Visual Glance Network (AVGN), which leverages the commonly available audio and visual modalities to efficiently process the spatio-temporally important parts of a video. AVGN first divides the video into snippets of image-audio clip pairs and employs lightweight unimodal encoders to extract global visual features and audio features. To identify the important temporal segments, we use an Audio-Visual Temporal Saliency Transformer (AV-TeST) that estimates the saliency scores of each frame. To further increase efficiency in the spatial dimension, AVGN processes only the important patches instead of the whole images. We use an Audio-Enhanced Spatial Patch Attention (AESPA) module to produce a set of enhanced coarse visual features, which are fed to a policy network that produces the coordinates of the important patches. This approach enables us to focus only on the most important spatio-temporal parts of the video, leading to more efficient video recognition. Moreover, we incorporate various training techniques and multi-modal feature fusion to enhance the robustness and effectiveness of our AVGN. By combining these strategies, AVGN sets new state-of-the-art performance on multiple video recognition benchmarks while achieving faster processing speed.
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Robust Audio Anti-Spoofing with Fusion-Reconstruction Learning on + Multi-Order Spectrograms + + +
+ Robust audio anti-spoofing has become increasingly challenging due to recent advances in deepfake techniques. While spectrograms have demonstrated their capability for anti-spoofing, the complementary information presented in multi-order spectral patterns has not been well explored, which limits effectiveness against varying spoofing attacks. Therefore, we propose a novel deep learning method with a spectral fusion-reconstruction strategy, namely S2pecNet, to utilise multi-order spectral patterns for robust audio anti-spoofing representations. Specifically, spectral patterns up to second order are fused in a coarse-to-fine manner, and two branches are designed for the fine-level fusion from the spectral and temporal contexts. A reconstruction from the fused representation back to the input spectrograms further reduces the potential loss of fused information. Our method achieved state-of-the-art performance with an EER of 0.77% on the widely used ASVspoof2019 LA Challenge dataset.
+
+
+
+
+ + ☆ V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by + Connecting Foundation Models + + +
+ Building artificial intelligence (AI) systems on top of a set of foundation +models (FMs) is becoming a new paradigm in AI research. Their representative +and generative abilities learnt from vast amounts of data can be easily adapted +and transferred to a wide range of downstream tasks without extra training from +scratch. However, leveraging FMs in cross-modal generation remains +under-researched when audio modality is involved. On the other hand, +automatically generating semantically-relevant sound from visual input is an +important problem in cross-modal generation studies. To solve this +vision-to-audio (V2A) generation problem, existing methods tend to design and +build complex systems from scratch using modestly sized datasets. In this +paper, we propose a lightweight solution to this problem by leveraging +foundation models, specifically CLIP, CLAP, and AudioLDM. We first investigate +the domain gap between the latent space of the visual CLIP and the auditory +CLAP models. Then we propose a simple yet effective mapper mechanism +(V2A-Mapper) to bridge the domain gap by translating the visual input between +CLIP and CLAP spaces. Conditioned on the translated CLAP embedding, pretrained +audio generative FM AudioLDM is adopted to produce high-fidelity and +visually-aligned sound. Compared to previous approaches, our method only +requires a quick training of the V2A-Mapper. We further analyze and conduct +extensive experiments on the choice of the V2A-Mapper and show that a +generative mapper is better at fidelity and variability (FD) while a regression +mapper is slightly better at relevance (CS). Both objective and subjective +evaluation on two V2A datasets demonstrate the superiority of our proposed +method compared to current state-of-the-art approaches - trained with 86% fewer +parameters but achieving 53% and 19% improvement in FD and CS, respectively. + +
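A minimal sketch of the mapper idea, assuming placeholder embedding sizes and random tensors in place of real CLIP/CLAP features (the class below is illustrative, not the released implementation): a small network translates a CLIP visual embedding into the CLAP space, and the mapped embedding would then condition a pretrained audio generator such as AudioLDM.

```python
import torch
import torch.nn as nn

class ToyV2AMapper(nn.Module):
    """Regression-style mapper from a CLIP image embedding to a CLAP audio
    embedding; dimensions are assumptions for the sketch."""
    def __init__(self, clip_dim=512, clap_dim=512, hidden_dim=1024):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(clip_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, clap_dim),
        )

    def forward(self, clip_emb):
        return self.net(clip_emb)

mapper = ToyV2AMapper()
clip_emb = torch.randn(8, 512)     # stand-in for CLIP features of video frames
clap_target = torch.randn(8, 512)  # stand-in for CLAP features of paired audio
loss = nn.functional.mse_loss(mapper(clip_emb), clap_target)
loss.backward()  # only the lightweight mapper is trained; the foundation models stay frozen
```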
+
+ comment: 13 pages, 10 figures. Code, demo, and samples: + https://v2a-mapper.github.io/ +
+
+
+
+
+ + ☆ Preference-conditioned Pixel-based AI Agent For Game Testing + + +
+ The game industry is challenged to cope with increasing growth in demand and +game complexity while maintaining acceptable quality standards for released +games. Classic approaches solely depending on human efforts for quality +assurance and game testing do not scale effectively in terms of time and cost. +Game-testing AI agents that learn by interaction with the environment have the +potential to mitigate these challenges with good scalability properties on time +and costs. However, most recent work in this direction depends on game state +information for the agent's state representation, which limits generalization +across different game scenarios. Moreover, game test engineers usually prefer +exploring a game in a specific style, such as exploring the golden path. +However, current game testing AI agents do not provide an explicit way to +satisfy such a preference. This paper addresses these limitations by proposing +an agent design that mainly depends on pixel-based state observations while +exploring the environment conditioned on a user's preference specified by +demonstration trajectories. In addition, we propose an imitation learning +method that couples self-supervised and supervised learning objectives to +enhance the quality of imitation behaviors. Our agent significantly outperforms +state-of-the-art pixel-based game testing agents over exploration coverage and +test execution quality when evaluated on a complex open-world environment +resembling many aspects of real AAA games. + +
+
+
+
+
+ + ♻ ☆ KeyPosS: Plug-and-Play Facial Landmark Detection through GPS-Inspired + True-Range Multilateration + + +
+ In the realm of facial analysis, accurate landmark detection is crucial for +various applications, ranging from face recognition and expression analysis to +animation. Conventional heatmap or coordinate regression-based techniques, +however, often face challenges in terms of computational burden and +quantization errors. To address these issues, we present the KeyPoint +Positioning System (KeyPosS) - a groundbreaking facial landmark detection +framework that stands out from existing methods. The framework utilizes a fully +convolutional network to predict a distance map, which computes the distance +between a Point of Interest (POI) and multiple anchor points. These anchor +points are ingeniously harnessed to triangulate the POI's position through the +True-range Multilateration algorithm. Notably, the plug-and-play nature of +KeyPosS enables seamless integration into any decoding stage, ensuring a +versatile and adaptable solution. We conducted a thorough evaluation of +KeyPosS's performance by benchmarking it against state-of-the-art models on +four different datasets. The results show that KeyPosS substantially +outperforms leading methods in low-resolution settings while requiring a +minimal time overhead. The code is available at +https://github.com/zhiqic/KeyPosS. + +
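To make the multilateration step concrete, here is a generic least-squares true-range multilateration sketch in 2-D (toy anchors and distances; in KeyPosS the distances would come from the predicted distance map, and this is not the paper's code):

```python
import numpy as np

def multilaterate(anchors, distances):
    """Estimate a 2-D point from its distances to known anchor points using
    the standard linearized least-squares formulation."""
    (x0, y0), d0 = anchors[0], distances[0]
    A, b = [], []
    for (xi, yi), di in zip(anchors[1:], distances[1:]):
        A.append([2.0 * (xi - x0), 2.0 * (yi - y0)])
        b.append(d0**2 - di**2 + xi**2 - x0**2 + yi**2 - y0**2)
    solution, *_ = np.linalg.lstsq(np.asarray(A), np.asarray(b), rcond=None)
    return solution

anchors = np.array([[0.0, 0.0], [10.0, 0.0], [0.0, 10.0], [10.0, 10.0]])
true_point = np.array([3.0, 4.0])
distances = np.linalg.norm(anchors - true_point, axis=1)  # here: exact distances
print(multilaterate(anchors, distances))  # ~[3. 4.]
```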
+
+ comment: Accepted to ACM Multimedia 2023; 10 pages, 7 figures, 6 tables; the + code is at https://github.com/zhiqic/KeyPosS +
+
+
+
+
+ + ♻ ☆ A Shift In Artistic Practices through Artificial Intelligence + + +
+ The explosion of content generated by Artificial Intelligence models has +initiated a cultural shift in arts, music, and media, where roles are changing, +values are shifting, and conventions are challenged. The readily available, +vast dataset of the internet has created an environment for AI models to be +trained on any content on the web. With AI models shared openly, and used by +many, globally, how does this new paradigm shift challenge the status quo in +artistic practices? What kind of changes will AI technology bring into music, +arts, and new media? + +
+
+ comment: Submitted to Leonardo Journal +
+
+
+
+
+ + ♻ ☆ Improved Nonlinear Transform Source-Channel Coding to Catalyze Semantic + Communications + + +
+ Recent deep learning methods have led to increased interest in solving high-efficiency end-to-end transmission problems. These methods, which we call nonlinear transform source-channel coding (NTSCC), extract the semantic latent features of the source signal and learn an entropy model to guide joint source-channel coding with variable rate to transmit the latent features over wireless channels. In this paper, we propose a comprehensive framework for improving NTSCC that achieves higher system coding gain, better model versatility, and a more flexible adaptation strategy aligned with semantic guidance. This new sophisticated NTSCC model is ready to support large-size data interaction in emerging XR, which catalyzes the application of semantic communications. Specifically, we propose three improvements. First, we introduce a contextual entropy model to better capture the spatial correlations among the semantic latent features, so that more accurate rate allocation and contextual joint source-channel coding can be developed to enable higher coding gain. On that basis, we further propose response network architectures to formulate a versatile NTSCC, i.e., a once-trained model that supports various rates and channel states, which benefits practical deployment. Following this, we propose an online latent feature editing method to enable more flexible coding rate control aligned with specific semantic guidance. By comprehensively applying these three improvements, we obtain a deployment-friendly semantic coded transmission system. Our improved NTSCC system has been experimentally verified to achieve considerable bandwidth savings versus the state-of-the-art engineered VTM + 5G LDPC coded transmission system, with lower processing latency.
+
+
+
+
+ + ♻ ☆ Audio-Visual Spatial Integration and Recursive Attention for Robust + Sound Source Localization ACM MM 2023 + + +
+ The objective of the sound source localization task is to enable machines to +detect the location of sound-making objects within a visual scene. While the +audio modality provides spatial cues to locate the sound source, existing +approaches only use audio as an auxiliary role to compare spatial regions of +the visual modality. Humans, on the other hand, utilize both audio and visual +modalities as spatial cues to locate sound sources. In this paper, we propose +an audio-visual spatial integration network that integrates spatial cues from +both modalities to mimic human behavior when detecting sound-making objects. +Additionally, we introduce a recursive attention network to mimic human +behavior of iterative focusing on objects, resulting in more accurate attention +regions. To effectively encode spatial information from both modalities, we +propose audio-visual pair matching loss and spatial region alignment loss. By +utilizing the spatial cues of audio-visual modalities and recursively focusing +objects, our method can perform more robust sound source localization. +Comprehensive experimental results on the Flickr SoundNet and VGG-Sound Source +datasets demonstrate the superiority of our proposed method over existing +approaches. Our code is available at: https://github.com/VisualAIKHU/SIRA-SSL + +
+
+ comment: Camera-Ready, ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure + for Vision-Language Retrieval + + +
+ Current vision-language retrieval aims to perform cross-modal instance search, in which the core idea is to learn consistent vision-language representations. Although the performance of cross-modal retrieval has greatly improved with the development of deep models, we unfortunately find that traditional hard consistency may destroy the original relationships among single-modal instances, leading to performance degradation for single-modal retrieval. To address this challenge, in this paper, we experimentally observe that the vision-language divergence may cause the existence of strong and weak modalities, and that hard cross-modal consistency cannot guarantee that strong-modal instances' relationships are unaffected by the weak modality, so the strong-modal instances' relationships are perturbed despite the learned consistent representations. To this end, we propose a novel Coordinated Vision-Language Retrieval method (dubbed CoVLR), which aims to study and alleviate the desynchrony problem between the cross-modal alignment and single-modal cluster-preserving tasks. CoVLR addresses this challenge by developing an effective meta-optimization based strategy, in which the cross-modal consistency objective and the intra-modal relation preserving objective act as the meta-train and meta-test tasks, thereby encouraging both tasks to be optimized in a coordinated way. Consequently, we can simultaneously ensure cross-modal consistency and intra-modal structure. Experiments on different datasets validate that CoVLR can improve single-modal retrieval accuracy whilst preserving cross-modal retrieval capacity compared with the baselines.
+
+ comment: I apologize for my operational mistake, which has resulted in the + absence of a revised version of the manuscript. Furthermore, I am concerned + that the submission process of this paper may potentially lead to conflicts. + Therefore, I kindly request the withdrawal of the manuscript +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 46 + +
+
+
+ + ☆ A Comparative Study of Text Embedding Models for Semantic Text + Similarity in Bug Reports + + +
+ Bug reports are an essential aspect of software development, and it is +crucial to identify and resolve them quickly to ensure the consistent +functioning of software systems. Retrieving similar bug reports from an +existing database can help reduce the time and effort required to resolve bugs. +In this paper, we compared the effectiveness of semantic textual similarity +methods for retrieving similar bug reports based on a similarity score. We +explored several embedding models such as TF-IDF (Baseline), FastText, Gensim, +BERT, and ADA. We used the Software Defects Data containing bug reports for +various software projects to evaluate the performance of these models. Our +experimental results showed that BERT generally outperformed the rest of the +models regarding recall, followed by ADA, Gensim, FastText, and TFIDF. Our +study provides insights into the effectiveness of different embedding methods +for retrieving similar bug reports and highlights the impact of selecting the +appropriate one for this task. Our code is available on GitHub. + +
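A small sketch of the retrieval-by-similarity setup described above, using the TF-IDF baseline (the toy corpus below is invented; swapping the vectorizer for BERT/ADA-style sentence embeddings is the comparison the paper makes):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy stand-in for a bug-report database.
reports = [
    "App crashes when opening the settings page",
    "Null pointer exception on login with an empty password",
    "Settings screen freezes after device rotation",
]
query = "Crash while navigating to settings"

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(reports + [query])
scores = cosine_similarity(matrix[-1], matrix[:-1]).ravel()
for score, report in sorted(zip(scores, reports), reverse=True):
    print(f"{score:.2f}  {report}")
```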
+
+ comment: 7 Pages +
+
+
+
+
+ + ☆ Is Argument Structure of Learner Chinese Understandable: A Corpus-Based + Analysis + + +
+ This paper presents a corpus-based analysis of argument structure errors in +learner Chinese. The data for analysis includes sentences produced by language +learners as well as their corrections by native speakers. We couple the data +with semantic role labeling annotations that are manually created by two senior +students whose majors are both Applied Linguistics. The annotation procedure is +guided by the Chinese PropBank specification, which is originally developed to +cover first language phenomena. Nevertheless, we find that it is quite +comprehensive for handling second language phenomena. The inter-annotator +agreement is rather high, suggesting the understandability of learner texts to +native speakers. Based on our annotations, we present a preliminary analysis of +competence errors related to argument structure. In particular, speech errors +related to word order, word selection, lack of proposition, and +argument-adjunct confounding are discussed. + +
+
+ comment: Proceedings of the 2018 International Conference on Bilingual + Learning and Teaching (ICBLT-2018) +
+
+
+
+
+ + ☆ ZhiJian: A Unifying and Rapidly Deployable Toolbox for Pre-trained Model + Reuse + + +
+ The rapid expansion of foundation pre-trained models and their fine-tuned +counterparts has significantly contributed to the advancement of machine +learning. Leveraging pre-trained models to extract knowledge and expedite +learning in real-world tasks, known as "Model Reuse", has become crucial in +various applications. Previous research focuses on reusing models within a +certain aspect, including reusing model weights, structures, and hypothesis +spaces. This paper introduces ZhiJian, a comprehensive and user-friendly +toolbox for model reuse, utilizing the PyTorch backend. ZhiJian presents a +novel paradigm that unifies diverse perspectives on model reuse, encompassing +target architecture construction with PTM, tuning target model with PTM, and +PTM-based inference. This empowers deep learning practitioners to explore +downstream tasks and identify the complementary advantages among different +methods. ZhiJian is readily accessible at +https://github.com/zhangyikaii/lamda-zhijian facilitating seamless utilization +of pre-trained models and streamlining the model reuse process for researchers +and developers. + +
+
+
+
+
+ + ☆ Characterizing Information Seeking Events in Health-Related Social + Discourse AAAI-2024 + + +
+ Social media sites have become a popular platform for individuals to seek and share health information. Despite the progress in natural language processing for social media mining, a gap remains in analyzing health-related texts on social discourse in the context of events. Event-driven analysis can offer insights into different facets of healthcare at an individual and collective level, including treatment options, misconceptions, knowledge gaps, etc. This paper presents a paradigm to characterize health-related information-seeking in social discourse through the lens of events. Events here are broad categories defined with domain experts that capture the trajectory of the treatment/medication. To illustrate the value of this approach, we analyze Reddit posts regarding medications for Opioid Use Disorder (OUD), a critical global health concern. To the best of our knowledge, this is the first attempt to define event categories for characterizing information-seeking in OUD social discourse. Guided by domain experts, we develop TREAT-ISE, a novel multilabel treatment information-seeking event dataset for analyzing online discourse in an event-based framework. This dataset contains Reddit posts on information-seeking events related to recovery from OUD, where each post is annotated based on the type of events. We also establish a strong performance benchmark (77.4% F1 score) for the task by employing several machine learning and deep learning classifiers. Finally, we thoroughly investigate the performance and errors of ChatGPT on this task, providing valuable insights into the LLM's capabilities and into ongoing characterization efforts.
+
+ comment: Under review AAAI-2024. 10 pages, 6 tables, 2 figures
+
+
+
+
+ + ☆ Semantic Consistency for Assuring Reliability of Large Language Models + + +
+ Large Language Models (LLMs) exhibit remarkable fluency and competence across +various natural language tasks. However, recent research has highlighted their +sensitivity to variations in input prompts. To deploy LLMs in a safe and +reliable manner, it is crucial for their outputs to be consistent when prompted +with expressions that carry the same meaning or intent. While some existing +work has explored how state-of-the-art LLMs address this issue, their +evaluations have been confined to assessing lexical equality of single- or +multi-word answers, overlooking the consistency of generative text sequences. +For a more comprehensive understanding of the consistency of LLMs in open-ended +text generation scenarios, we introduce a general measure of semantic +consistency, and formulate multiple versions of this metric to evaluate the +performance of various LLMs. Our proposal demonstrates significantly higher +consistency and stronger correlation with human evaluations of output +consistency than traditional metrics based on lexical consistency. Finally, we +propose a novel prompting strategy, called Ask-to-Choose (A2C), to enhance +semantic consistency. When evaluated for closed-book question answering based +on answer variations from the TruthfulQA benchmark, A2C increases accuracy +metrics for pretrained and finetuned LLMs by up to 47%, and semantic +consistency metrics for instruction-tuned models by up to 7-fold. + +
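One plausible instantiation of a semantic-consistency score (not necessarily the exact metric from the paper; the sentence-embedding model is an assumption) is the mean pairwise similarity of outputs the LLM produces for paraphrases of the same prompt:

```python
from itertools import combinations
from sentence_transformers import SentenceTransformer, util

def semantic_consistency(outputs, encoder):
    """Mean pairwise cosine similarity over outputs generated from
    paraphrased versions of the same prompt."""
    embeddings = encoder.encode(outputs, convert_to_tensor=True)
    pairs = list(combinations(range(len(outputs)), 2))
    sims = [util.cos_sim(embeddings[i], embeddings[j]).item() for i, j in pairs]
    return sum(sims) / len(sims)

encoder = SentenceTransformer("all-MiniLM-L6-v2")
outputs = [
    "The capital of France is Paris.",
    "Paris is France's capital city.",
    "France's capital is Paris.",
]
print(semantic_consistency(outputs, encoder))  # close to 1.0 for consistent answers
```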
+
+
+
+
+ + ☆ EgoSchema: A Diagnostic Benchmark for Very Long-form Video Language + Understanding + + +
+ We introduce EgoSchema, a very long-form video question-answering dataset, and benchmark to evaluate long video understanding capabilities of modern vision and language systems. Derived from Ego4D, EgoSchema consists of over 5000 human curated multiple choice question answer pairs, spanning over 250 hours of real video data, covering a very broad range of natural human activity and behavior. For each question, EgoSchema requires the correct answer to be selected between five given options based on a three-minute-long video clip. While some prior works have proposed video datasets with long clip lengths, we posit that merely the length of the video clip does not truly capture the temporal difficulty of the video task that is being considered. To remedy this, we introduce temporal certificate sets, a general notion for capturing the intrinsic temporal understanding length associated with a broad range of video understanding tasks & datasets. Based on this metric, we find EgoSchema to have intrinsic temporal lengths over 5.7x longer than the second closest dataset and 10x to 100x longer than any other video understanding dataset. Further, our evaluation of several current state-of-the-art video and language models shows them to be severely lacking in long-term video understanding capabilities. Even models with several billions of parameters achieve QA accuracy less than 33% (random is 20%) on the EgoSchema multi-choice question answering task, while humans achieve about 76% accuracy. We posit that EgoSchema, with its long intrinsic temporal structures and diverse complexity, would serve as a valuable evaluation probe for developing effective long-term video understanding systems in the future. Data and Zero-shot model evaluation code are open-sourced for both public and commercial use under the Ego4D license at http://egoschema.github.io
+
+ comment: https://egoschema.github.io/ +
+
+
+
+
+ + ☆ Linearity of Relation Decoding in Transformer Language Models + + +
+ Much of the knowledge encoded in transformer language models (LMs) may be +expressed in terms of relations: relations between words and their synonyms, +entities and their attributes, etc. We show that, for a subset of relations, +this computation is well-approximated by a single linear transformation on the +subject representation. Linear relation representations may be obtained by +constructing a first-order approximation to the LM from a single prompt, and +they exist for a variety of factual, commonsense, and linguistic relations. +However, we also identify many cases in which LM predictions capture relational +knowledge accurately, but this knowledge is not linearly encoded in their +representations. Our results thus reveal a simple, interpretable, but +heterogeneously deployed knowledge representation strategy in transformer LMs. + +
+
+
+
+
+ + ☆ MaScQA: A Question Answering Dataset for Investigating Materials Science + Knowledge of Large Language Models + + +
+ Information extraction and textual comprehension from materials literature +are vital for developing an exhaustive knowledge base that enables accelerated +materials discovery. Language models have demonstrated their capability to +answer domain-specific questions and retrieve information from knowledge bases. +However, there are no benchmark datasets in the materials domain that can +evaluate the understanding of the key concepts by these language models. In +this work, we curate a dataset of 650 challenging questions from the materials +domain that require the knowledge and skills of a materials student who has +cleared their undergraduate degree. We classify these questions based on their +structure and the materials science domain-based subcategories. Further, we +evaluate the performance of GPT-3.5 and GPT-4 models on solving these questions +via zero-shot and chain of thought prompting. It is observed that GPT-4 gives +the best performance (~62% accuracy) as compared to GPT-3.5. Interestingly, in +contrast to the general observation, no significant improvement in accuracy is +observed with the chain of thought prompting. To evaluate the limitations, we +performed an error analysis, which revealed conceptual errors (~64%) as the +major contributor compared to computational errors (~36%) towards the reduced +performance of LLMs. We hope that the dataset and analysis performed in this +work will promote further research in developing better materials science +domain-specific LLMs and strategies for information extraction. + +
+
+
+
+
+ + ☆ mCL-NER: Cross-Lingual Named Entity Recognition via Multi-view + Contrastive Learning + + +
+ Cross-lingual named entity recognition (CrossNER) faces challenges stemming from uneven performance due to the scarcity of multilingual corpora, especially for non-English data. While prior efforts mainly focus on data-driven transfer methods, a significant aspect that has not been fully explored is aligning both semantic and token-level representations across diverse languages. In this paper, we propose Multi-view Contrastive Learning for Cross-lingual Named Entity Recognition (mCL-NER). Specifically, we reframe the CrossNER task into a problem of recognizing relationships between pairs of tokens. This approach taps into the inherent contextual nuances of token-to-token connections within entities, allowing us to align representations across different languages. A multi-view contrastive learning framework is introduced to encompass semantic contrasts between source, codeswitched, and target sentences, as well as contrasts among token-to-token relations. By enforcing agreement within both semantic and relational spaces, we minimize the gap between source sentences and their counterparts of both codeswitched and target sentences. This alignment extends to the relationships between diverse tokens, enhancing the projection of entities across languages. We further augment CrossNER by combining self-training with labeled source data and unlabeled target data. Our experiments on the XTREME benchmark, spanning 40 languages, demonstrate the superiority of mCL-NER over prior data-driven and model-based approaches. It achieves a substantial increase of nearly +2.0 F1 scores across a broad spectrum and establishes itself as the new state-of-the-art performer.
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Enhancing API Documentation through BERTopic Modeling and Summarization + + +
+ As the amount of textual data in various fields, including software +development, continues to grow, there is a pressing demand for efficient and +effective extraction and presentation of meaningful insights. This paper +presents a unique approach to address this need, focusing on the complexities +of interpreting Application Programming Interface (API) documentation. While +official API documentation serves as a primary source of information for +developers, it can often be extensive and lacks user-friendliness. In light of +this, developers frequently resort to unofficial sources like Stack Overflow +and GitHub. Our novel approach employs the strengths of BERTopic for topic +modeling and Natural Language Processing (NLP) to automatically generate +summaries of API documentation, thereby creating a more efficient method for +developers to extract the information they need. The produced summaries and +topics are evaluated based on their performance, coherence, and +interoperability. + The findings of this research contribute to the field of API documentation +analysis by providing insights into recurring topics, identifying common +issues, and generating potential solutions. By improving the accessibility and +efficiency of API documentation comprehension, our work aims to enhance the +software development process and empower developers with practical tools for +navigating complex APIs. + +
+
+
+
+
+ + ☆ Contrasting Linguistic Patterns in Human and LLM-Generated Text + + +
+ We conduct a quantitative analysis contrasting human-written English news +text with comparable large language model (LLM) output from 4 LLMs from the +LLaMa family. Our analysis spans several measurable linguistic dimensions, +including morphological, syntactic, psychometric and sociolinguistic aspects. +The results reveal various measurable differences between human and +AI-generated texts. Among others, human texts exhibit more scattered sentence +length distributions, a distinct use of dependency and constituent types, +shorter constituents, and more aggressive emotions (fear, disgust) than +LLM-generated texts. LLM outputs use more numbers, symbols and auxiliaries +(suggesting objective language) than human texts, as well as more pronouns. The +sexist bias prevalent in human text is also expressed by LLMs. + +
+
+
+
+
+ + ☆ Don't lose the message while paraphrasing: A study on content preserving + style transfer + + +
+ Text style transfer techniques are gaining popularity in natural language processing, allowing paraphrasing of text in the required form: from toxic to neutral, from formal to informal, from old to modern English, etc. Solving the task requires not only generating neutral/informal/modern text, but also preserving the original content unchanged. This requirement becomes even more critical in applications such as style transfer of goal-oriented dialogues, where factual information must be kept to preserve the original message, e.g., ordering a certain type of pizza to a certain address at a certain time. The aspect of content preservation is critical for real-world applications of style transfer studies, but it has received little attention. To bridge this gap we perform a comparison of various style transfer models on the example of the formality transfer domain. To study the content preservation abilities of various style transfer methods, we create a parallel dataset of formal vs. informal task-oriented dialogues. The key difference between our dataset and existing ones like GYAFC [17] is the presence of goal-oriented dialogues with predefined semantic slots essential to be kept during paraphrasing, e.g. named entities. This additional annotation allowed us to conduct a precise comparative study of several state-of-the-art techniques for style transfer. Another result of our study is a modification of the unsupervised method LEWIS [19] which yields a substantial improvement over the original method and all evaluated baselines on the proposed task.
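A naive check of the content-preservation requirement discussed above might simply verify that every annotated semantic slot survives the paraphrase (a deliberately simple sketch with invented slot values; the paper's evaluation is more involved):

```python
def slots_preserved(slots, paraphrase):
    """Return True if every annotated slot value still appears in the
    style-transferred output (case-insensitive substring check)."""
    text = paraphrase.lower()
    return all(slot.lower() in text for slot in slots)

slots = ["pepperoni pizza", "21 Baker Street", "8 pm"]
informal = "hey can u get a pepperoni pizza to 21 baker street around 8 pm?"
print(slots_preserved(slots, informal))  # True: the goal-oriented content survived
```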
+
+ comment: Published at the NLDB 2023 conference +
+
+
+
+
+ + ☆ Reinforced Self-Training (ReST) for Language Modeling + + +
+ Reinforcement learning from human feedback (RLHF) can improve the quality of a large language model's (LLM) outputs by aligning them with human preferences. We propose a simple algorithm for aligning LLMs with human preferences inspired by growing batch reinforcement learning (RL), which we call Reinforced Self-Training (ReST). Given an initial LLM policy, ReST produces a dataset by generating samples from the policy, which are then used to improve the LLM policy using offline RL algorithms. ReST is more efficient than typical online RLHF methods because the training dataset is produced offline, which allows data reuse. While ReST is a general approach applicable to all generative learning settings, we focus on its application to machine translation. Our results show that ReST can substantially improve translation quality, as measured by automated metrics and human evaluation on machine translation benchmarks, in a compute- and sample-efficient manner.
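A schematic of the grow-then-improve loop described above (the `policy.generate`, `reward_fn`, and `finetune_offline` helpers are placeholders, not a real API):

```python
def rest_loop(policy, prompts, reward_fn, iterations=3, samples_per_prompt=8, threshold=0.7):
    """Schematic Reinforced Self-Training: sample from the current policy,
    keep high-reward samples, then improve the policy offline on them."""
    for _ in range(iterations):
        # Grow step: build a dataset from the policy's own samples.
        dataset = []
        for prompt in prompts:
            for output in policy.generate(prompt, num_samples=samples_per_prompt):
                score = reward_fn(prompt, output)
                if score >= threshold:
                    dataset.append((prompt, output, score))
        # Improve step: offline update (e.g. filtered behavioural cloning).
        policy = policy.finetune_offline(dataset)
    return policy
```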
+
+ comment: 23 pages, 16 figures +
+
+
+
+
+ + ☆ Evaluation of really good grammatical error correction + + +
+ Although rarely stated, in practice, Grammatical Error Correction (GEC) encompasses various models with distinct objectives, ranging from grammatical error detection to improving fluency. Traditional evaluation methods fail to capture the full range of system capabilities and objectives. Reference-based evaluations are limited in capturing the wide variety of possible corrections, suffer from biases introduced during reference creation, and are prone to favoring fixes of local errors over overall text improvement. The emergence of large language models (LLMs) has further highlighted the shortcomings of these evaluation strategies, emphasizing the need for a paradigm shift in evaluation methodology. In the current study, we perform a comprehensive evaluation of various GEC systems using a recently published dataset of Swedish learner texts. The evaluation is performed using established evaluation metrics as well as human judges. We find that GPT-3 in a few-shot setting by far outperforms previous grammatical error correction systems for Swedish, a language comprising only 0.11% of its training data. We also find that current evaluation methods contain undesirable biases that a human evaluation is able to reveal. We suggest using human post-editing of GEC system outputs to analyze the amount of change required to reach native-level human performance on the task, and we provide a dataset annotated with human post-edits and assessments of grammaticality, fluency, and meaning preservation of GEC system outputs.
+
+
+
+
+ + ☆ Beam Retrieval: General End-to-End Retrieval for Multi-Hop Question + Answering + + +
+ Multi-hop QA involves finding multiple relevant passages and step-by-step +reasoning to answer complex questions. While previous approaches have developed +retrieval modules for selecting relevant passages, they face challenges in +scenarios beyond two hops, owing to the limited performance of one-step methods +and the failure of two-step methods when selecting irrelevant passages in +earlier stages. In this work, we introduce Beam Retrieval, a general end-to-end +retrieval framework for multi-hop QA. This approach maintains multiple partial +hypotheses of relevant passages at each step, expanding the search space and +reducing the risk of missing relevant passages. Moreover, Beam Retrieval +jointly optimizes an encoder and two classification heads by minimizing the +combined loss across all hops. To establish a complete QA system, we +incorporate a supervised reader or a zero-shot GPT-3.5. Experimental results +demonstrate that Beam Retrieval achieves a nearly 50% improvement compared with +baselines on challenging MuSiQue-Ans, and it also surpasses all previous +retrievers on HotpotQA and 2WikiMultiHopQA. Providing high-quality context, +Beam Retrieval helps our supervised reader achieve new state-of-the-art +performance and substantially improves (up to 28.8 points) the QA performance +of zero-shot GPT-3.5. + +
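The core of the retrieval procedure can be sketched as a beam search over chains of passages (the scorer below is a dummy stand-in for the jointly trained encoder and classification heads, and the passages are toy placeholders):

```python
import heapq

def beam_retrieve(question, passages, score_fn, hops=2, beam_size=4):
    """Keep the top-`beam_size` partial passage chains at every hop,
    extending each hypothesis by one more passage."""
    beams = [(0.0, [])]  # (cumulative score, chain of passage indices)
    for _ in range(hops):
        candidates = []
        for score, chain in beams:
            for idx in range(len(passages)):
                if idx in chain:
                    continue
                new_chain = chain + [idx]
                candidates.append((score + score_fn(question, new_chain), new_chain))
        beams = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
    return beams

passages = ["p0", "p1", "p2", "p3", "p4"]
dummy_score = lambda question, chain: 1.0 / (1 + chain[-1])  # toy relevance score
print(beam_retrieve("who wrote ...?", passages, dummy_score, hops=2, beam_size=2))
```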
+
+ comment: Code is available at https://github.com/canghongjian/beam_retriever +
+
+
+
+
+ + ☆ CMB: A Comprehensive Medical Benchmark in Chinese + + +
+ Large Language Models (LLMs) offer the possibility of a major breakthrough in medicine. Establishing a standardized medical benchmark is a fundamental cornerstone for measuring progress. However, medical environments in different regions have their own local characteristics, e.g., the ubiquity and significance of traditional Chinese medicine within China. Therefore, merely translating English-based medical evaluations may result in contextual incongruities for a local region. To solve this issue, we propose a localized medical benchmark called CMB, a Comprehensive Medical Benchmark in Chinese, designed and rooted entirely within the native Chinese linguistic and cultural framework. While traditional Chinese medicine is integral to this evaluation, it does not constitute its entirety. Using this benchmark, we have evaluated several prominent large-scale LLMs, including ChatGPT, GPT-4, dedicated Chinese LLMs, and LLMs specialized in the medical domain. It is worth noting that our benchmark is not devised as a leaderboard competition but as an instrument for self-assessment of model advancements. We hope this benchmark can facilitate the widespread adoption and enhancement of medical LLMs within China. Details are available at https://cmedbenchmark.llmzoo.com/.
+
+
+
+
+ + ☆ Factuality Detection using Machine Translation -- a Use Case for German + Clinical Text + + +
+ Factuality can play an important role when automatically processing clinical +text, as it makes a difference if particular symptoms are explicitly not +present, possibly present, not mentioned, or affirmed. In most cases, a +sufficient number of examples is necessary to handle such phenomena in a +supervised machine learning setting. However, as clinical text might contain +sensitive information, data cannot be easily shared. In the context of +factuality detection, this work presents a simple solution using machine +translation to translate English data to German to train a transformer-based +factuality detection model. + +
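The translate-then-train idea can be sketched with an off-the-shelf MT model (the checkpoint name is a public Helsinki-NLP model, and the two examples are invented placeholders, not the paper's data):

```python
from transformers import pipeline

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-de")

# Toy English examples annotated with factuality labels.
english_examples = [
    ("The patient denies chest pain.", "negated"),
    ("Findings are suggestive of possible pneumonia.", "possible"),
]
german_examples = [
    (translator(text)[0]["translation_text"], label)
    for text, label in english_examples
]
# german_examples would then be used to fine-tune a German transformer
# classifier for factuality detection.
print(german_examples)
```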
+
+ comment: Accepted at KONVENS 2023 +
+
+
+
+
+ + ☆ Linguistically-Informed Neural Architectures for Lexical, Syntactic and + Semantic Tasks in Sanskrit + + +
+ The primary focus of this thesis is to make Sanskrit manuscripts more +accessible to the end-users through natural language technologies. The +morphological richness, compounding, free word orderliness, and low-resource +nature of Sanskrit pose significant challenges for developing deep learning +solutions. We identify four fundamental tasks, which are crucial for developing +a robust NLP technology for Sanskrit: word segmentation, dependency parsing, +compound type identification, and poetry analysis. The first task, Sanskrit +Word Segmentation (SWS), is a fundamental text processing task for any other +downstream applications. However, it is challenging due to the sandhi +phenomenon that modifies characters at word boundaries. Similarly, the existing +dependency parsing approaches struggle with morphologically rich and +low-resource languages like Sanskrit. Compound type identification is also +challenging for Sanskrit due to the context-sensitive semantic relation between +components. All these challenges result in sub-optimal performance in NLP +applications like question answering and machine translation. Finally, Sanskrit +poetry has not been extensively studied in computational linguistics. + While addressing these challenges, this thesis makes various contributions: +(1) The thesis proposes linguistically-informed neural architectures for these +tasks. (2) We showcase the interpretability and multilingual extension of the +proposed systems. (3) Our proposed systems report state-of-the-art performance. +(4) Finally, we present a neural toolkit named SanskritShala, a web-based +application that provides real-time analysis of input for various NLP tasks. +Overall, this thesis contributes to making Sanskrit manuscripts more accessible +by developing robust NLP technology and releasing various resources, datasets, +and web-based toolkit. + +
+
+ comment: Ph.D. dissertation +
+
+
+
+
+ + ☆ Chinese Spelling Correction as Rephrasing Language Model + + +
+ This paper studies Chinese Spelling Correction (CSC), which aims to detect and correct potential spelling errors in a given sentence. Current state-of-the-art methods regard CSC as a sequence tagging task and fine-tune BERT-based models on sentence pairs. However, we note a critical flaw in the process of tagging one character to another: the correction is excessively conditioned on the error. This is the opposite of the human mindset, where individuals rephrase the complete sentence based on its semantics rather than solely on previously memorized error patterns. Such a counter-intuitive learning process limits the generalizability and transferability of machine spelling correction. To address this, we propose Rephrasing Language Modeling (ReLM), where the model is trained to rephrase the entire sentence by infilling additional slots, instead of character-to-character tagging. This novel training paradigm achieves new state-of-the-art results across fine-tuned and zero-shot CSC benchmarks, outperforming previous counterparts by a large margin. Our method also learns transferable language representations when CSC is jointly trained with other tasks.
+
+
+
+
+ + ☆ Task Relation Distillation and Prototypical Pseudo Label for Incremental + Named Entity Recognition CIKM2023 + + +
+ Incremental Named Entity Recognition (INER) involves the sequential learning +of new entity types without accessing the training data of previously learned +types. However, INER faces the challenge of catastrophic forgetting specific +for incremental learning, further aggravated by background shift (i.e., old and +future entity types are labeled as the non-entity type in the current task). To +address these challenges, we propose a method called task Relation Distillation +and Prototypical pseudo label (RDP) for INER. Specifically, to tackle +catastrophic forgetting, we introduce a task relation distillation scheme that +serves two purposes: 1) ensuring inter-task semantic consistency across +different incremental learning tasks by minimizing inter-task relation +distillation loss, and 2) enhancing the model's prediction confidence by +minimizing intra-task self-entropy loss. Simultaneously, to mitigate background +shift, we develop a prototypical pseudo label strategy that distinguishes old +entity types from the current non-entity type using the old model. This +strategy generates high-quality pseudo labels by measuring the distances +between token embeddings and type-wise prototypes. We conducted extensive +experiments on ten INER settings of three benchmark datasets (i.e., CoNLL2003, +I2B2, and OntoNotes5). The results demonstrate that our method achieves +significant improvements over the previous state-of-the-art methods, with an +average increase of 6.08% in Micro F1 score and 7.71% in Macro F1 score. + +
+
+ comment: Accepted by CIKM2023 as a long paper with an oral presentation +
+
+
+
+
+ + ☆ Exploring Demonstration Ensembling for In-context Learning ICLR 2023 + + +
+ In-context learning (ICL) operates by showing language models (LMs) examples of input-output pairs for a given task, i.e., demonstrations. The standard approach for ICL is to prompt the LM with concatenated demonstrations followed by the test input. This approach suffers from some issues. First, concatenation offers almost no control over the contribution of each demo to the model prediction. This can be sub-optimal when some demonstrations are irrelevant to the test example. Second, due to the input length limit of some transformer models, it might be infeasible to fit many examples into the context, especially when dealing with long-input tasks. In this work, we explore Demonstration Ensembling (DENSE) as an alternative to simple concatenation. DENSE predicts outputs using subsets (i.e., buckets) of the demonstrations and then combines the output probabilities resulting from each subset to produce the final prediction. We study different ensembling methods using GPT-j and experiment on 12 language tasks. Our experiments show weighted max ensembling to outperform vanilla concatenation by as much as 2.4 average points. Code is available at https://github.com/mukhal/icl-ensembling.
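A sketch of the bucketed ensembling idea (here `label_prob_fn` is an assumed helper that queries the LM and returns a probability vector over candidate labels; the paper's weighting and max-ensembling variants are only approximated):

```python
import numpy as np

def dense_predict(test_input, demos, label_prob_fn, num_buckets=4, weighted=True):
    """Split demonstrations into buckets, get one in-context prediction per
    bucket, and combine the per-bucket label distributions."""
    buckets = np.array_split(np.array(demos, dtype=object), num_buckets)
    probs, weights = [], []
    for bucket in buckets:
        prompt = "\n".join(list(bucket) + [test_input])
        p = np.asarray(label_prob_fn(prompt))          # label distribution for this bucket
        probs.append(p)
        weights.append(p.max() if weighted else 1.0)   # confidence-style weight
    probs = np.stack(probs)
    weights = np.array(weights) / np.sum(weights)
    return int((weights[:, None] * probs).sum(axis=0).argmax())
```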
+
+ comment: Published at ME-FoMo workshop at ICLR 2023. Arxiv version includes + evaluation on 5 more tasks +
+
+
+
+
+ + ☆ Differential Privacy, Linguistic Fairness, and Training Data Influence: + Impossibility and Possibility Theorems for Multilingual Language Models ICML 2023 + + +
+ Language models such as mBERT, XLM-R, and BLOOM aim to achieve multilingual +generalization or compression to facilitate transfer to a large number of +(potentially unseen) languages. However, these models should ideally also be +private, linguistically fair, and transparent, by relating their predictions to +training data. Can these requirements be simultaneously satisfied? We show that +multilingual compression and linguistic fairness are compatible with +differential privacy, but that differential privacy is at odds with training +data influence sparsity, an objective for transparency. We further present a +series of experiments on two common NLP tasks and evaluate multilingual +compression and training data influence sparsity under different privacy +guarantees, exploring these trade-offs in more detail. Our results suggest that +we need to develop ways to jointly optimize for these objectives in order to +find practical trade-offs. + +
+
+ comment: ICML 2023 +
+
+
+
+
+ + ☆ Discrete Prompt Compression with Reinforcement Learning + + +
+ Instruction-tuned Language Models (LMs) are widely used by users to address various problems with task-specific prompts. Constraints associated with the context window length and computational costs encourage the development of compressed prompts. Existing methods rely heavily on training embeddings, which are designed to accommodate multiple token meanings. This presents challenges in terms of interpretability, a fixed number of embedding tokens, reusability across different LMs, and inapplicability when interacting with black-box APIs. This study proposes prompt compression with reinforcement learning (PCRL), a novel discrete prompt compression method that addresses these issues. PCRL employs a computationally efficient policy network that directly edits prompts. The PCRL training approach can be flexibly applied to various types of LMs, including both decoder-only and encoder-decoder architectures, and can be trained without gradient access to the LMs or labeled data. PCRL achieves an average reduction of 24.6% in token count across various instruction prompts while preserving performance. Further, we demonstrate that the learned policy can be transferred to larger LMs, and through various analyses, we aid the understanding of token importance within prompts.
+
+
+
+
+ + ☆ An Empirical Study of Catastrophic Forgetting in Large Language Models + During Continual Fine-tuning + + +
+ Catastrophic forgetting (CF) is a phenomenon that occurs in machine learning +when a model forgets previously learned information as it learns new +information. As large language models (LLMs) have shown excellent performance, +it is interesting to uncover whether CF exists in the continual fine-tuning of +LLMs. In this study, we empirically evaluate the forgetting phenomenon in LLMs' +knowledge, from the perspectives of domain knowledge, reasoning, and reading +comprehension. The experiments demonstrate that catastrophic forgetting is +generally observed in LLMs ranging from 1b to 7b. Furthermore, as the scale +increases, the severity of forgetting also intensifies. Comparing the +decoder-only model BLOOMZ with the encoder-decoder model mT0, BLOOMZ suffers +less forgetting and maintains more knowledge. We also observe that LLMs can +mitigate language bias (e.g. gender bias) during continual fine-tuning. +Moreover, we find that ALPACA can maintain more knowledge and capacity compared +with LLAMA during the continual fine-tuning, which implies that general +instruction tuning can help mitigate the forgetting phenomenon of LLMs in the +further fine-tuning process. + +
+
+
+
+
+ + ☆ PMET: Precise Model Editing in a Transformer + + +
+    Model editing techniques modify a minor proportion of knowledge in Large +Language Models (LLMs) at a relatively low cost, and have demonstrated +notable success. Existing methods assume Transformer Layer (TL) hidden states +are values of key-value memories of the Feed-Forward Network (FFN). They +usually optimize the TL hidden states to memorize target knowledge and use it +to update the weights of the FFN in LLMs. However, the information flow of TL +hidden states comes from three parts: Multi-Head Self-Attention (MHSA), FFN, +and residual connections. Existing methods neglect the fact that the TL hidden +states contain information not specifically required for FFN. Consequently, +the performance of model editing decreases. To achieve more precise model +editing, we analyze hidden states of MHSA and FFN, finding that MHSA encodes +certain general knowledge extraction patterns. This implies that MHSA weights +do not require updating when new knowledge is introduced. Based on the above +findings, we introduce PMET, which simultaneously optimizes Transformer +Component (TC, namely MHSA and FFN) hidden states, while only using the +optimized TC hidden states of FFN to precisely update FFN weights. Our +experiments demonstrate that PMET exhibits state-of-the-art performance on both +the \textsc{counterfact} and zsRE datasets. Our ablation experiments +substantiate the effectiveness of our enhancements, further reinforcing the +finding that the MHSA encodes certain general knowledge extraction patterns and +indicating its storage of a small amount of factual knowledge. Our code is +available at \url{https://github.com/xpq-tech/PMET.git}. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Enhancing Phrase Representation by Information Bottleneck Guided Text + Diffusion Process for Keyphrase Extraction + + +
+ Keyphrase extraction (KPE) is an important task in Natural Language +Processing for many scenarios, which aims to extract keyphrases that are +present in a given document. Many existing supervised methods treat KPE as +sequential labeling, span-level classification, or generative tasks. However, +these methods lack the ability to utilize keyphrase information, which may +result in biased results. In this study, we propose Diff-KPE, which leverages +the supervised Variational Information Bottleneck (VIB) to guide the text +diffusion process for generating enhanced keyphrase representations. Diff-KPE +first generates the desired keyphrase embeddings conditioned on the entire +document and then injects the generated keyphrase embeddings into each phrase +representation. A ranking network and VIB are then optimized together with rank +loss and classification loss, respectively. This design of Diff-KPE allows us +to rank each candidate phrase by utilizing both the information of keyphrases +and the document. Experiments show that Diff-KPE outperforms existing KPE +methods on a large open domain keyphrase extraction benchmark, OpenKP, and a +scientific domain dataset, KP20K. + +
+
+ comment: 10 pages, 2 figures +
+
+
+
+
+ + ☆ LLM-FuncMapper: Function Identification for Interpreting Complex Clauses + in Building Codes via LLM + + +
+    As a vital stage of automated rule checking (ARC), rule interpretation of +regulatory texts requires considerable effort. However, interpreting regulatory +clauses with implicit properties or complex computational logic is still +challenging due to the lack of domain knowledge and limited expressibility of +conventional logic representations. Thus, LLM-FuncMapper, an approach to +identifying predefined functions needed to interpret various regulatory clauses +based on the large language model (LLM), is proposed. First, by systematic +analysis of building codes, a series of atomic functions are defined to capture +shared computational logics of implicit properties and complex constraints, +creating a database of common blocks for interpreting regulatory clauses. Then, +a prompt template with the chain of thought is developed and further enhanced +with a classification-based tuning strategy, to enable common LLMs for +effective function identification. Finally, the proposed approach is validated +with statistical analysis, experiments, and proof of concept. Statistical +analysis reveals a long-tail distribution and high expressibility of the +developed function database, with which almost 100% of computer-processible +clauses can be interpreted and represented as computer-executable codes. +Experiments show that LLM-FuncMapper achieves promising results in identifying +relevant predefined functions for rule interpretation. Further proof of concept +in automated rule interpretation also demonstrates the possibility of +LLM-FuncMapper in interpreting complex regulatory clauses. To the best of our +knowledge, this study is the first attempt to introduce LLM for understanding +and interpreting complex regulatory clauses, which may shed light on further +adoption of LLM in the construction domain. + +
+
+
+
+
+ + ☆ Decoding Emotions: A comprehensive Multilingual Study of Speech Models + for Speech Emotion Recognition + + +
+ Recent advancements in transformer-based speech representation models have +greatly transformed speech processing. However, there has been limited research +conducted on evaluating these models for speech emotion recognition (SER) +across multiple languages and examining their internal representations. This +article addresses these gaps by presenting a comprehensive benchmark for SER +with eight speech representation models and six different languages. We +conducted probing experiments to gain insights into inner workings of these +models for SER. We find that using features from a single optimal layer of a +speech model reduces the error rate by 32\% on average across seven datasets +when compared to systems where features from all layers of speech models are +used. We also achieve state-of-the-art results for German and Persian +languages. Our probing results indicate that the middle layers of speech models +capture the most important emotional information for speech emotion +recognition. + +
+
+
+
+
+ + ☆ ChatGPT-HealthPrompt. Harnessing the Power of XAI in Prompt-Based + Healthcare Decision Support using ChatGPT + + +
+ This study presents an innovative approach to the application of large +language models (LLMs) in clinical decision-making, focusing on OpenAI's +ChatGPT. Our approach introduces the use of contextual prompts-strategically +designed to include task description, feature description, and crucially, +integration of domain knowledge-for high-quality binary classification tasks +even in data-scarce scenarios. The novelty of our work lies in the utilization +of domain knowledge, obtained from high-performing interpretable ML models, and +its seamless incorporation into prompt design. By viewing these ML models as +medical experts, we extract key insights on feature importance to aid in +decision-making processes. This interplay of domain knowledge and AI holds +significant promise in creating a more insightful diagnostic tool. + Additionally, our research explores the dynamics of zero-shot and few-shot +prompt learning based on LLMs. By comparing the performance of OpenAI's ChatGPT +with traditional supervised ML models in different data conditions, we aim to +provide insights into the effectiveness of prompt engineering strategies under +varied data availability. In essence, this paper bridges the gap between AI and +healthcare, proposing a novel methodology for LLMs application in clinical +decision support systems. It highlights the transformative potential of +effective prompt design, domain knowledge integration, and flexible learning +approaches in enhancing automated decision-making. + +
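One way to picture the described prompt construction is sketched below: an interpretable model supplies feature importances that are folded into the prompt as domain knowledge. The template, the choice of logistic regression, and the five-feature cutoff are illustrative assumptions, not the paper's exact design.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def build_knowledge_prompt(X_train, y_train, feature_names, patient_features, task_desc):
    """Fit an interpretable model, pull out its most influential features,
    and fold them into a binary-classification prompt for an LLM."""
    clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    importance = np.abs(clf.coef_[0])
    top = [feature_names[i] for i in importance.argsort()[::-1][:5]]
    knowledge = ("According to an interpretable model trained on this task, "
                 "the most important features are: " + ", ".join(top) + ".")
    features = "; ".join(f"{n}={v}" for n, v in zip(feature_names, patient_features))
    return f"{task_desc}\n{knowledge}\nPatient: {features}\nAnswer yes or no:"
```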
+
+
+
+
+ + ♻ ☆ Fighting Fire with Fire: Can ChatGPT Detect AI-generated Text? KDD + + +
+ Large language models (LLMs) such as ChatGPT are increasingly being used for +various use cases, including text content generation at scale. Although +detection methods for such AI-generated text exist already, we investigate +ChatGPT's performance as a detector on such AI-generated text, inspired by +works that use ChatGPT as a data labeler or annotator. We evaluate the +zero-shot performance of ChatGPT in the task of human-written vs. AI-generated +text detection, and perform experiments on publicly available datasets. We +empirically investigate if ChatGPT is symmetrically effective in detecting +AI-generated or human-written text. Our findings provide insight on how ChatGPT +and similar LLMs may be leveraged in automated detection pipelines by simply +focusing on solving a specific aspect of the problem and deriving the rest from +that solution. All code and data is available at +https://github.com/AmritaBh/ChatGPT-as-Detector. + +
+
+ comment: to appear in SIGKDD Explorations (December 2023) +
+
+
+
+
+ + ♻ ☆ PromptCap: Prompt-Guided Task-Aware Image Captioning ICCV 2023 + + +
+    Knowledge-based visual question answering (VQA) involves questions that +require world knowledge beyond the image to yield the correct answer. Large +language models (LMs) like GPT-3 are particularly helpful for this task because +of their strong knowledge retrieval and reasoning capabilities. To enable LMs to +understand images, prior work uses a captioning model to convert images into +text. However, when summarizing an image in a single caption sentence, which +visual entities to describe is often underspecified. Generic image captions +often miss visual details essential for the LM to answer visual questions +correctly. To address this challenge, we propose PromptCap (Prompt-guided image +Captioning), a captioning model designed to serve as a better connector between +images and black-box LMs. Different from generic captions, PromptCap takes a +natural-language prompt to control the visual entities to describe in the +generated caption. The prompt contains a question that the caption should aid +in answering. To avoid extra annotation, PromptCap is trained by examples +synthesized with GPT-3 and existing datasets. We demonstrate PromptCap's +effectiveness on an existing pipeline in which GPT-3 is prompted with image +captions to carry out VQA. PromptCap outperforms generic captions by a large +margin and achieves state-of-the-art accuracy on knowledge-based VQA tasks +(60.4% on OK-VQA and 59.6% on A-OKVQA). Zero-shot results on WebQA show that +PromptCap generalizes well to unseen domains. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Can Knowledge Graphs Simplify Text? CIKM 2023 + + +
+ Knowledge Graph (KG)-to-Text Generation has seen recent improvements in +generating fluent and informative sentences which describe a given KG. As KGs +are widespread across multiple domains and contain important entity-relation +information, and as text simplification aims to reduce the complexity of a text +while preserving the meaning of the original text, we propose KGSimple, a novel +approach to unsupervised text simplification which infuses KG-established +techniques in order to construct a simplified KG path and generate a concise +text which preserves the original input's meaning. Through an iterative and +sampling KG-first approach, our model is capable of simplifying text when +starting from a KG by learning to keep important information while harnessing +KG-to-text generation to output fluent and descriptive sentences. We evaluate +various settings of the KGSimple model on currently-available KG-to-text +datasets, demonstrating its effectiveness compared to unsupervised text +simplification models which start with a given complex text. Our code is +available on GitHub. + +
+
+ comment: Accepted as a Main Conference Long Paper at CIKM 2023 +
+
+
+
+
+ + ♻ ☆ GrammarGPT: Exploring Open-Source LLMs for Native Chinese Grammatical + Error Correction with Supervised Fine-Tuning + + +
+    Grammatical error correction aims to correct ungrammatical sentences +automatically. Recently, some work has demonstrated the excellent capabilities +of closed-source Large Language Models (LLMs, e.g., ChatGPT) in grammatical +error correction. However, the potential of open-source LLMs remains +unexplored. In this paper, we introduced GrammarGPT, an open-source LLM, to +preliminarily explore its potential for native Chinese grammatical error +correction. The core recipe of GrammarGPT is to leverage a hybrid dataset of +ChatGPT-generated and human-annotated data. For grammatical errors with clues, we +proposed a heuristic method to guide ChatGPT to generate ungrammatical +sentences by providing those clues. For grammatical errors without clues, we +collected ungrammatical sentences from publicly available websites and manually +corrected them. In addition, we employed an error-invariant augmentation method +to enhance the ability of the model to correct native Chinese grammatical +errors. We ultimately constructed about 1k parallel data and utilized these +data to fine-tune open-source LLMs (e.g., Phoenix, released by The Chinese +University of Hong Kong, Shenzhen) with instruction tuning. The experimental +results show that GrammarGPT outperforms the existing SOTA system +significantly. Although model parameters are 20x larger than the SOTA baseline, +the required amount of data for instruction tuning is 1200x smaller, +illustrating the potential of open-source LLMs on native CGEC. Our GrammarGPT +ranks $3^{rd}$ on NLPCC2023 SharedTask1, demonstrating our approach's +effectiveness. The code and data are available at +\url{https://github.com/FreedomIntelligence/GrammarGPT}. + +
+
+
+
+
+ + ♻ ☆ Can ChatGPT Detect Intent? Evaluating Large Language Models for Spoken + Language Understanding + + +
+ Recently, large pretrained language models have demonstrated strong language +understanding capabilities. This is particularly reflected in their zero-shot +and in-context learning abilities on downstream tasks through prompting. To +assess their impact on spoken language understanding (SLU), we evaluate several +such models like ChatGPT and OPT of different sizes on multiple benchmarks. We +verify the emergent ability unique to the largest models as they can reach +intent classification accuracy close to that of supervised models with zero or +few shots on various languages given oracle transcripts. By contrast, the +results for smaller models fitting a single GPU fall far behind. We note that +the error cases often arise from the annotation scheme of the dataset; +responses from ChatGPT are still reasonable. We show, however, that the model +is worse at slot filling, and its performance is sensitive to ASR errors, +suggesting serious challenges for the application of those textual models on +SLU. + +
+
+ comment: 6 pages, 2 figures; Accepted by Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ A Survey on Model Compression for Large Language Models + + +
+ Large Language Models (LLMs) have revolutionized natural language processing +tasks with remarkable success. However, their formidable size and computational +demands present significant challenges for practical deployment, especially in +resource-constrained environments. As these challenges become increasingly +pertinent, the field of model compression has emerged as a pivotal research +area to alleviate these limitations. This paper presents a comprehensive survey +that navigates the landscape of model compression techniques tailored +specifically for LLMs. Addressing the imperative need for efficient deployment, +we delve into various methodologies, encompassing quantization, pruning, +knowledge distillation, and more. Within each of these techniques, we highlight +recent advancements and innovative approaches that contribute to the evolving +landscape of LLM research. Furthermore, we explore benchmarking strategies and +evaluation metrics that are essential for assessing the effectiveness of +compressed LLMs. By providing insights into the latest developments and +practical implications, this survey serves as an invaluable resource for both +researchers and practitioners. As LLMs continue to evolve, this survey aims to +facilitate enhanced efficiency and real-world applicability, establishing a +foundation for future advancements in the field. + +
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the +visual and language modalities have been shown to be strong few-shot learners. +Soft prompt learning is the method of choice for few-shot downstream adaptation +aiming to bridge the modality gap caused by the distribution shift induced by +the new domain. While parameter-efficient, prompt learning still requires +access to the model weights and can be computationally infeasible for large +models with billions of parameters. To address these shortcomings, in this +work, we describe a black-box method for V-L few-shot adaptation that (a) +operates on pre-computed image and text features and hence works without access +to the model's weights, (b) it is orders of magnitude faster at training time, +(c) it is amenable to both supervised and unsupervised training, and (d) it can +be even used to align image and text features computed from uni-modal models. +To achieve this, we propose Linear Feature Alignment (LFA), a simple linear +approach for V-L re-alignment in the target domain. LFA is initialized from a +closed-form solution to a least-squares problem and then it is iteratively +updated by minimizing a re-ranking loss. Despite its simplicity, our approach +can even surpass soft-prompt learning methods as shown by extensive experiments +on 11 image and 2 video datasets. + +
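A minimal sketch of the closed-form initialisation mentioned above is shown below, assuming paired image/text feature matrices; the paper's iterative re-ranking refinement is omitted, and the exact least-squares formulation used by LFA may differ from this plain version.

```python
import numpy as np

def lfa_init(image_feats, text_feats):
    """Closed-form least-squares initialisation of a linear map W that sends
    image features towards their paired text features: W = argmin ||X W - T||^2."""
    X, T = image_feats, text_feats            # (n, d_img) and (n, d_txt), paired rows
    W, *_ = np.linalg.lstsq(X, T, rcond=None)
    return W                                  # (d_img, d_txt)

def classify(image_feat, class_text_feats, W):
    """Few-shot classification by cosine similarity in the aligned space."""
    z = image_feat @ W
    z = z / np.linalg.norm(z)
    C = class_text_feats / np.linalg.norm(class_text_feats, axis=1, keepdims=True)
    return int(np.argmax(C @ z))
```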
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Clinical Camel: An Open Expert-Level Medical Language Model with + Dialogue-Based Knowledge Encoding + + +
+ We present Clinical Camel, an open large language model (LLM) explicitly +tailored for clinical research. Fine-tuned from LLaMA-2 using QLoRA, Clinical +Camel achieves state-of-the-art performance across medical benchmarks among +openly available medical LLMs. Leveraging efficient single-GPU training, +Clinical Camel surpasses GPT-3.5 in five-shot evaluations on all assessed +benchmarks, including 64.3% on the USMLE Sample Exam (compared to 58.5% for +GPT-3.5), 77.9% on PubMedQA (compared to 60.2%), 60.7% on MedQA (compared to +53.6%), and 54.2% on MedMCQA (compared to 51.0%). In addition to these +benchmarks, Clinical Camel demonstrates its broader capabilities, such as +synthesizing plausible clinical notes. This work introduces dialogue-based +knowledge encoding, a novel method to synthesize conversational data from dense +medical texts. While benchmark results are encouraging, extensive and rigorous +human evaluation across diverse clinical scenarios is imperative to ascertain +safety before implementation. By openly sharing Clinical Camel, we hope to +foster transparent and collaborative research, working towards the safe +integration of LLMs within the healthcare domain. Significant challenges +concerning reliability, bias, and the potential for outdated knowledge persist. +Nonetheless, the transparency provided by an open approach reinforces the +scientific rigor essential for future clinical applications. + +
+
+ comment: for model weights, see https://huggingface.co/wanglab/ +
+
+
+
+
+ + ♻ ☆ Topological properties and organizing principles of semantic networks + + +
+ Interpreting natural language is an increasingly important task in computer +algorithms due to the growing availability of unstructured textual data. +Natural Language Processing (NLP) applications rely on semantic networks for +structured knowledge representation. The fundamental properties of semantic +networks must be taken into account when designing NLP algorithms, yet they +remain to be structurally investigated. We study the properties of semantic +networks from ConceptNet, defined by 7 semantic relations from 11 different +languages. We find that semantic networks have universal basic properties: they +are sparse, highly clustered, and many exhibit power-law degree distributions. +Our findings show that the majority of the considered networks are scale-free. +Some networks exhibit language-specific properties determined by grammatical +rules, for example networks from highly inflected languages, such as e.g. +Latin, German, French and Spanish, show peaks in the degree distribution that +deviate from a power law. We find that depending on the semantic relation type +and the language, the link formation in semantic networks is guided by +different principles. In some networks the connections are similarity-based, +while in others the connections are more complementarity-based. Finally, we +demonstrate how knowledge of similarity and complementarity in semantic +networks can improve NLP algorithms in missing link inference. + +
+
+
+
+
+ + ♻ ☆ Efficient Utilization of Large Pre-Trained Models for Low Resource ASR ICASSP + + +
+ Unsupervised representation learning has recently helped automatic speech +recognition (ASR) to tackle tasks with limited labeled data. Following this, +hardware limitations and applications give rise to the question how to take +advantage of large pre-trained models efficiently and reduce their complexity. +In this work, we study a challenging low resource conversational telephony +speech corpus from the medical domain in Vietnamese and German. We show the +benefits of using unsupervised techniques beyond simple fine-tuning of large +pre-trained models, discuss how to adapt them to a practical telephony task +including bandwidth transfer and investigate different data conditions for +pre-training and fine-tuning. We outperform the project baselines by 22% +relative using pretraining techniques. Further gains of 29% can be achieved by +refinements of architecture and training and 6% by adding 0.8 h of in-domain +adaptation data. + +
+
+ comment: Accepted at ICASSP SASB 2023 +
+
+
+
+
+ + ♻ ☆ Getting pwn'd by AI: Penetration Testing with Large Language Models + + +
+ The field of software security testing, more specifically penetration +testing, is an activity that requires high levels of expertise and involves +many manual testing and analysis steps. This paper explores the potential usage +of large-language models, such as GPT3.5, to augment penetration testers with +AI sparring partners. We explore the feasibility of supplementing penetration +testers with AI models for two distinct use cases: high-level task planning for +security testing assignments and low-level vulnerability hunting within a +vulnerable virtual machine. For the latter, we implemented a closed-feedback +loop between LLM-generated low-level actions with a vulnerable virtual machine +(connected through SSH) and allowed the LLM to analyze the machine state for +vulnerabilities and suggest concrete attack vectors which were automatically +executed within the virtual machine. We discuss promising initial results, +detail avenues for improvement, and close deliberating on the ethics of +providing AI-based sparring partners. + +
+
+
+
+
+ + ♻ ☆ Self-Edit: Fault-Aware Code Editor for Code Generation ACL2023 + + +
+ Large language models (LLMs) have demonstrated an impressive ability to +generate codes on competitive programming tasks. However, with limited sample +numbers, LLMs still suffer from poor accuracy. Inspired by the process of human +programming, we propose a generate-and-edit approach named Self-Edit that +utilizes execution results of the generated code from LLMs to improve the code +quality on the competitive programming task. We execute the generated code on +the example test case provided in the question and wrap execution results into +a supplementary comment. Utilizing this comment as guidance, our fault-aware +code editor is employed to correct errors in the generated code. We perform +extensive evaluations across two competitive programming datasets with nine +different LLMs. Compared to directly generating from LLMs, our approach can +improve the average of pass@1 by 89\% on APPS-dev, 31\% on APPS-test, and 48\% +on HumanEval over nine popular code generation LLMs with parameter sizes +ranging from 110M to 175B. Compared to other post-processing methods, our +method demonstrates superior accuracy and efficiency. + +
+
+ comment: Accepted by ACL2023 +
+
+
+
+
+ + ♻ ☆ Does mBERT understand Romansh? Evaluating word embeddings using word + alignment + + +
+ We test similarity-based word alignment models (SimAlign and awesome-align) +in combination with word embeddings from mBERT and XLM-R on parallel sentences +in German and Romansh. Since Romansh is an unseen language, we are dealing with +a zero-shot setting. Using embeddings from mBERT, both models reach an +alignment error rate of 0.22, which outperforms fast_align, a statistical +model, and is on par with similarity-based word alignment for seen languages. +We interpret these results as evidence that mBERT contains information that can +be meaningful and applicable to Romansh. + To evaluate performance, we also present a new trilingual corpus, which we +call the DERMIT (DE-RM-IT) corpus, containing press releases made by the Canton +of Grisons in German, Romansh and Italian in the past 25 years. The corpus +contains 4 547 parallel documents and approximately 100 000 sentence pairs in +each language combination. We additionally present a gold standard for +German-Romansh word alignment. The data is available at +https://github.com/eyldlv/DERMIT-Corpus. + +
+
+
+
+
+ + ♻ ☆ Steering Language Generation: Harnessing Contrastive Expert Guidance and + Negative Prompting for Coherent and Diverse Synthetic Data Generation + + +
+    Large Language Models (LLMs) hold immense potential to generate synthetic +data of high quality and utility, which has numerous applications from +downstream model training to practical data utilisation. However, contemporary +models, despite their impressive capacities, consistently struggle to produce +both coherent and diverse data. To address the coherency issue, we introduce +contrastive expert guidance, where the difference between the logit +distributions of fine-tuned and base language models is emphasised to ensure +domain adherence. In order to ensure diversity, we utilise existing real and +synthetic examples as negative prompts to the model. We refer to this dual-pronged +approach to logit reshaping as STEER: Semantic Text Enhancement via Embedding +Repositioning. STEER operates at inference-time and systematically guides the +LLMs to strike a balance between adherence to the data distribution (ensuring +semantic fidelity) and deviation from prior synthetic examples or existing real +datasets (ensuring diversity and authenticity). This delicate balancing act is +achieved by dynamically moving towards or away from chosen representations in +the latent space. STEER demonstrates improved performance over previous +synthetic data generation techniques, exhibiting better balance between data +diversity and coherency across three distinct tasks: hypothesis generation, +toxic and non-toxic comment generation, and commonsense reasoning task +generation. We demonstrate how STEER allows for fine-tuned control over the +diversity-coherency trade-off via its hyperparameters, highlighting its +versatility. + +
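The inference-time logit reshaping can be pictured roughly as below. The combination rule, the `alpha`/`beta` weights, and the handling of negative prompts are illustrative assumptions rather than STEER's exact formulation.

```python
import numpy as np

def steer_logits(logits_expert, logits_base, logits_negative=None, alpha=1.0, beta=0.5):
    """Reshape next-token logits: `logits_expert` come from the fine-tuned model,
    `logits_base` from the base model, and `logits_negative` (optional) from
    conditioning on previous synthetic / real examples used as negative prompts."""
    guided = logits_expert + alpha * (logits_expert - logits_base)  # emphasise domain adherence
    if logits_negative is not None:
        guided = guided - beta * logits_negative                    # push away from negatives
    return guided

def sample_token(guided_logits, temperature=1.0, rng=None):
    rng = rng or np.random.default_rng()
    p = np.exp((guided_logits - guided_logits.max()) / temperature)
    p /= p.sum()
    return int(rng.choice(len(p), p=p))
```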
+
+
+
+
+ + ♻ ☆ Forward-Backward Reasoning in Large Language Models for Verification + + +
+    Chain-of-Thought (CoT) prompting has shown promising performance in various +reasoning tasks. Recently, Self-Consistency \citep{wang2023selfconsistency} +proposes to sample a diverse set of reasoning chains which may lead to +different answers while the answer that receives the most votes is selected. In +this paper, we propose a novel method to use backward reasoning in verifying +candidate answers. We mask a token in the question by ${\bf x}$ and ask the LLM +to predict the masked token when a candidate answer is provided by \textit{a +simple template}, i.e., ``\textit{\textbf{If we know the answer of the above +question is \{a candidate answer\}, what is the value of unknown variable ${\bf +x}$?}}'' Intuitively, the LLM is expected to predict the masked token +successfully if the provided candidate answer is correct. We further propose +FOBAR to combine forward and backward reasoning for estimating the probability +of candidate answers. We conduct extensive experiments on six data sets and +three LLMs. Experimental results demonstrate that FOBAR achieves +state-of-the-art performance on various reasoning benchmarks. + +
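A sketch of the backward-verification step, using the template quoted above, is given below. `ask_lm` is an assumed black-box completion call, and the way forward and backward evidence are combined here is a simplification of FOBAR's actual probability estimate.

```python
from collections import Counter

BACKWARD_TEMPLATE = (
    "{question_with_x}\n"
    "If we know the answer of the above question is {candidate}, "
    "what is the value of unknown variable x?"
)

def fobar_select(ask_lm, question_with_x, masked_value, forward_answers, n_backward=8):
    """`forward_answers` are sampled forward (self-consistency) answers.
    Each candidate gets its forward vote share plus the fraction of backward
    runs that recover the value that was masked out of the question."""
    forward_votes = Counter(forward_answers)
    scores = {}
    for cand, votes in forward_votes.items():
        prompt = BACKWARD_TEMPLATE.format(question_with_x=question_with_x, candidate=cand)
        hits = sum(str(masked_value) in ask_lm(prompt) for _ in range(n_backward))
        scores[cand] = votes / len(forward_answers) + hits / n_backward
    return max(scores, key=scores.get)
```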
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Metacognitive Prompting Improves Understanding in Large Language Models + + +
+ In Large Language Models (LLMs), there have been consistent advancements in +task-specific performance, largely influenced by effective prompt design. While +recent research on prompting has enhanced the reasoning capabilities of LLMs, a +gap remains in further improving their understanding abilities. In this study, +we introduce Metacognitive Prompting (MP), a strategy inspired by human +introspective reasoning processes. Using MP, LLMs undergo a systematic series +of structured, self-aware evaluations, drawing on both their vast inherent +knowledge and new insights. Our experiments involve five prevalent LLMs: +Llama2, Vicuna, PaLM, GPT-3.5, and GPT-4, all of which span various general +natural language understanding (NLU) tasks from the GLUE and SuperGLUE +benchmarks. Results indicate that, although GPT-4 consistently excels in most +tasks, PaLM, when equipped with MP, approaches its performance level. +Furthermore, across models and datasets, MP consistently outperforms existing +prompting methods, including standard and chain-of-thought prompting. This +study underscores the potential to amplify the understanding abilities of LLMs +and highlights the benefits of mirroring human introspective reasoning in NLU +tasks. + +
+
+ comment: 9 pages, in submission +
+
+
+
+
+ + ♻ ☆ Few-Shot Table-to-Text Generation with Prompt Planning and Knowledge + Memorization + + +
+    Pre-trained language models (PLM) have achieved remarkable advancement in +table-to-text generation tasks. However, the lack of labeled domain-specific +knowledge and the topology gap between tabular data and text make it difficult +for PLMs to yield faithful text. Low-resource generation likewise faces unique +challenges in this domain. Inspired by how humans describe tabular data with +prior knowledge, we suggest a new framework: PromptMize, which targets +table-to-text generation under few-shot settings. The design of our framework +consists of two aspects: a prompt planner and a knowledge adapter. The prompt +planner aims to generate a prompt signal that provides instance guidance for +PLMs to bridge the topology gap between tabular data and text. Moreover, the +knowledge adapter memorizes domain-specific knowledge from the unlabelled +corpus to supply essential information during generation. Extensive experiments +and analyses are conducted on three open domain few-shot NLG datasets: +human, song, and book. Compared with previous state-of-the-art approaches, our +model achieves remarkable performance in generation quality as judged by human +and automatic evaluations. + +
+
+ comment: Accidental duplicate. Please see arXiv:2302.12468 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 1 + +
+
+
+ + ☆ GPU Accelerated Color Correction and Frame Warping for Real-time Video + Stitching + + +
+    Traditional image stitching focuses on a single panorama frame without +considering the spatial-temporal consistency in videos. The straightforward +image stitching approach will cause temporal flickering and color inconsistency +when it is applied to the video stitching task. Besides, inaccurate camera +parameters will cause artifacts in the image warping. In this paper, we propose +a real-time system to stitch multiple video sequences into a panoramic video, +which is based on GPU accelerated color correction and frame warping without +accurate camera parameters. We extend the traditional 2D-Matrix (2D-M) color +correction approach and present a spatio-temporal 3D-Matrix (3D-M) color +correction method for the overlap local regions with online color balancing +using a piecewise function on global frames. Furthermore, we use pairwise +homography matrices given by coarse camera calibration for global warping +followed by accurate local warping based on the optical flow. Experimental +results show that our system can generate high-quality panorama videos in real +time. + +
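A minimal OpenCV sketch of the global stage only is shown below: a pairwise homography from coarse point matches warps one frame into the reference frame, and a naive per-channel gain on the overlap stands in for color correction. The paper's 3D-M color correction and optical-flow-based local warping are not reproduced here.

```python
import cv2
import numpy as np

def warp_and_correct(ref, src, src_pts, ref_pts):
    """Warp `src` into `ref`'s frame with a RANSAC homography, then roughly
    match colors on the overlapping region with a per-channel gain."""
    H, _ = cv2.findHomography(np.float32(src_pts), np.float32(ref_pts), cv2.RANSAC)
    h, w = ref.shape[:2]
    warped = cv2.warpPerspective(src, H, (w, h))
    overlap = (warped.sum(axis=2) > 0) & (ref.sum(axis=2) > 0)
    if overlap.any():
        gain = ref[overlap].mean(axis=0) / (warped[overlap].mean(axis=0) + 1e-6)
        warped = np.clip(warped * gain, 0, 255).astype(ref.dtype)
    return warped
```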
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ A Model-Agnostic Framework for Recommendation via Interest-aware Item + Embeddings RecSys + + +
+    Item representation holds significant importance in recommendation systems, +which encompass domains such as news, retail, and videos. Retrieval and +ranking models utilise item representation to capture the user-item +relationship based on user behaviours. Existing representation learning +methods primarily focus on optimising item-based mechanisms, such as attention +and sequential modelling. However, these methods lack a modelling mechanism to +directly reflect user interests within the learned item representations. +Consequently, these methods may be less effective in capturing user interests +indirectly. To address this challenge, we propose a novel Interest-aware +Capsule network (IaCN) recommendation model, a model-agnostic framework that +directly learns interest-oriented item representations. IaCN serves as an +auxiliary task, enabling the joint learning of both item-based and +interest-based representations. This framework adopts existing recommendation +models without requiring substantial redesign. We evaluate the proposed +approach on benchmark datasets, exploring various scenarios involving different +deep neural networks, behaviour sequence lengths, and joint learning ratios of +interest-oriented item representations. Experimental results demonstrate +significant performance enhancements across diverse recommendation models, +validating the effectiveness of our approach. + +
+
+ comment: Accepted Paper under LBR track in the Seventeenth ACM Conference on + Recommender Systems (RecSys) 2023 +
+
+
+
+
+ + ☆ Identity-Aware Semi-Supervised Learning for Comic Character + Re-Identification + + +
+ Character re-identification, recognizing characters consistently across +different panels in comics, presents significant challenges due to limited +annotated data and complex variations in character appearances. To tackle this +issue, we introduce a robust semi-supervised framework that combines metric +learning with a novel 'Identity-Aware' self-supervision method by contrastive +learning of face and body pairs of characters. Our approach involves processing +both facial and bodily features within a unified network architecture, +facilitating the extraction of identity-aligned character embeddings that +capture individual identities while preserving the effectiveness of face and +body features. This integrated character representation enhances feature +extraction and improves character re-identification compared to +re-identification by face or body independently, offering a parameter-efficient +solution. By extensively validating our method using in-series and inter-series +evaluation metrics, we demonstrate its effectiveness in consistently +re-identifying comic characters. Compared to existing methods, our approach not +only addresses the challenge of character re-identification but also serves as +a foundation for downstream tasks since it can produce character embeddings +without restrictions of face and body availability, enriching the comprehension +of comic books. In our experiments, we leverage two newly curated datasets: the +'Comic Character Instances Dataset', comprising over a million character +instances and the 'Comic Sequence Identity Dataset', containing annotations of +identities within more than 3000 sets of four consecutive comic panels that we +collected. + +
+
+ comment: 18 pages, 9 Figures +
+
+
+
+
+ + ☆ Bridging High-Quality Audio and Video via Language for Sound Effects + Retrieval from Visual Queries SP + + +
+ Finding the right sound effects (SFX) to match moments in a video is a +difficult and time-consuming task, and relies heavily on the quality and +completeness of text metadata. Retrieving high-quality (HQ) SFX using a video +frame directly as the query is an attractive alternative, removing the reliance +on text metadata and providing a low barrier to entry for non-experts. Due to +the lack of HQ audio-visual training data, previous work on audio-visual +retrieval relies on YouTube (in-the-wild) videos of varied quality for +training, where the audio is often noisy and the video of amateur quality. As +such it is unclear whether these systems would generalize to the task of +matching HQ audio to production-quality video. To address this, we propose a +multimodal framework for recommending HQ SFX given a video frame by (1) +leveraging large language models and foundational vision-language models to +bridge HQ audio and video to create audio-visual pairs, resulting in a highly +scalable automatic audio-visual data curation pipeline; and (2) using +pre-trained audio and visual encoders to train a contrastive learning-based +retrieval system. We show that our system, trained using our automatic data +curation pipeline, significantly outperforms baselines trained on in-the-wild +data on the task of HQ SFX retrieval for video. Furthermore, while the +baselines fail to generalize to this task, our system generalizes well from +clean to in-the-wild data, outperforming the baselines on a dataset of YouTube +videos despite only being trained on the HQ audio-visual pairs. A user study +confirms that people prefer SFX retrieved by our system over the baseline 67% +of the time both for HQ and in-the-wild data. Finally, we present ablations to +determine the impact of model and data pipeline design choices on downstream +retrieval performance. Please visit our project website to listen to and view +our SFX retrieval results. + +
+
+ comment: WASPAA 2023. Project page: + https://juliawilkins.github.io/sound-effects-retrieval-from-video/. 4 pages, + 2 figures, 2 tables +
+
+
+
+
+ + ☆ Uplift Modeling: from Causal Inference to Personalization + + +
+ Uplift modeling is a collection of machine learning techniques for estimating +causal effects of a treatment at the individual or subgroup levels. Over the +last years, causality and uplift modeling have become key trends in +personalization at online e-commerce platforms, enabling the selection of the +best treatment for each user in order to maximize the target business metric. +Uplift modeling can be particularly useful for personalized promotional +campaigns, where the potential benefit caused by a promotion needs to be +weighed against the potential costs. In this tutorial we will cover basic +concepts of causality and introduce the audience to state-of-the-art techniques +in uplift modeling. We will discuss the advantages and the limitations of +different approaches and dive into the unique setup of constrained uplift +modeling. Finally, we will present real-life applications and discuss +challenges in implementing these models in production. + +
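As a concrete example of the techniques such a tutorial covers, here is a minimal two-model ("T-learner") uplift sketch with scikit-learn; it is a generic illustration under stated assumptions, not material from the tutorial itself.

```python
from sklearn.ensemble import GradientBoostingClassifier

def t_learner_uplift(X, treatment, outcome, X_new):
    """Two-model uplift estimation: fit separate outcome models for the treated
    and control groups, then score each new user by the difference in predicted
    outcome probability (the estimated individual treatment effect)."""
    m_treated = GradientBoostingClassifier().fit(X[treatment == 1], outcome[treatment == 1])
    m_control = GradientBoostingClassifier().fit(X[treatment == 0], outcome[treatment == 0])
    uplift = m_treated.predict_proba(X_new)[:, 1] - m_control.predict_proba(X_new)[:, 1]
    return uplift  # target the promotion at users with the largest estimated uplift
```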
+
+
+
+
+ + ☆ Towards Filling the Gap in Conversational Search: From Passage Retrieval + to Conversational Response Generation CIKM '23 + + +
+ Research on conversational search has so far mostly focused on query +rewriting and multi-stage passage retrieval. However, synthesizing the top +retrieved passages into a complete, relevant, and concise response is still an +open challenge. Having snippet-level annotations of relevant passages would +enable both (1) the training of response generation models that are able to +ground answers in actual statements and (2) the automatic evaluation of the +generated responses in terms of completeness. In this paper, we address the +problem of collecting high-quality snippet-level answer annotations for two of +the TREC Conversational Assistance track datasets. To ensure quality, we first +perform a preliminary annotation study, employing different task designs, +crowdsourcing platforms, and workers with different qualifications. Based on +the outcomes of this study, we refine our annotation protocol before proceeding +with the full-scale data collection. Overall, we gather annotations for 1.8k +question-paragraph pairs, each annotated by three independent crowd workers. +The process of collecting data at this magnitude also led to multiple insights +about the problem that can inform the design of future response-generation +methods. This is an extended version of the article published with the same +title in the Proceedings of CIKM'23. + +
+
+ comment: Extended version of the paper that appeared in the Proceedings of the + 32nd ACM International Conference on Information and Knowledge Management + (CIKM '23) +
+
+
+
+
+ + ☆ Capturing Popularity Trends: A Simplistic Non-Personalized Approach for + Enhanced Item Recommendation + + +
+ Recommender systems have been gaining increasing research attention over the +years. Most existing recommendation methods focus on capturing users' +personalized preferences through historical user-item interactions, which may +potentially violate user privacy. Additionally, these approaches often overlook +the significance of the temporal fluctuation in item popularity that can sway +users' decision-making. To bridge this gap, we propose Popularity-Aware +Recommender (PARE), which makes non-personalized recommendations by predicting +the items that will attain the highest popularity. PARE consists of four +modules, each focusing on a different aspect: popularity history, temporal +impact, periodic impact, and side information. Finally, an attention layer is +leveraged to fuse the outputs of four modules. To our knowledge, this is the +first work to explicitly model item popularity in recommendation systems. +Extensive experiments show that PARE performs on par or even better than +sophisticated state-of-the-art recommendation methods. Since PARE prioritizes +item popularity over personalized user preferences, it can enhance existing +recommendation methods as a complementary component. Our experiments +demonstrate that integrating PARE with existing recommendation methods +significantly surpasses the performance of standalone models, highlighting +PARE's potential as a complement to existing recommendation methods. +Furthermore, the simplicity of PARE makes it immensely practical for industrial +applications and a valuable baseline for future research. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Real-Time Construction Algorithm of Co-Occurrence Network Based on + Inverted Index + + +
+    Co-occurrence networks are an important method in the field of natural +language processing and text mining for discovering semantic relationships +within texts. However, the traditional traversal algorithm for constructing +co-occurrence networks has high time complexity and space complexity when +dealing with large-scale text data. In this paper, we propose an optimized +algorithm based on inverted indexing and breadth-first search to improve the +efficiency of co-occurrence network construction and reduce memory consumption. +Firstly, the traditional traversal algorithm is analyzed, and its performance +issues in constructing co-occurrence networks are identified. Then, the +detailed implementation process of the optimized algorithm is presented. +Subsequently, the CSL large-scale Chinese scientific literature dataset is used +for experimental validation, comparing the performance of the traditional +traversal algorithm and the optimized algorithm in terms of running time and +memory usage. Finally, using non-parametric test methods, the optimized +algorithm is proven to have significantly better performance than the +traditional traversal algorithm. The research in this paper provides an +effective method for the rapid construction of co-occurrence networks, +contributing to the further development of the field of Information Organization. + +
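The core idea can be sketched as follows: posting lists from an inverted index provide the co-occurrence weights, and candidate term pairs are generated per document rather than by traversing all possible pairs. The breadth-first-search details of the proposed algorithm are not reproduced here.

```python
from collections import defaultdict
from itertools import combinations

def build_cooccurrence(documents):
    """Build a term co-occurrence network using an inverted index."""
    inverted = defaultdict(set)               # term -> ids of documents containing it
    for doc_id, tokens in enumerate(documents):
        for tok in set(tokens):
            inverted[tok].add(doc_id)

    edges = {}                                # (term_a, term_b) -> co-occurrence count
    for tokens in documents:
        # candidate pairs come only from terms that share a document,
        # so pairs that never co-occur are never examined
        for a, b in combinations(sorted(set(tokens)), 2):
            if (a, b) not in edges:
                edges[(a, b)] = len(inverted[a] & inverted[b])
    return inverted, edges

docs = [["graph", "index", "text"], ["graph", "text"], ["index", "search"]]
inv, net = build_cooccurrence(docs)
print(net[("graph", "text")])   # 2
```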
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ BERT4CTR: An Efficient Framework to Combine Pre-trained Language Model + with Non-textual Features for CTR Prediction + + +
+    Although deep pre-trained language models have shown promising benefit in a +large set of industrial scenarios, including Click-Through-Rate (CTR) +prediction, how to integrate pre-trained language models that handle only +textual signals into a prediction pipeline with non-textual features is +challenging. +  Up to now, two directions have been explored to integrate multi-modal inputs +in fine-tuning of pre-trained language models. One consists of fusing the +outcome of language models and non-textual features through an aggregation +layer, resulting in an ensemble framework, where the cross-information between +textual and non-textual inputs is only learned in the aggregation layer. The +second one consists of splitting non-textual features into fine-grained +fragments and transforming the fragments to new tokens combined with textual +ones, so that they can be fed directly to transformer layers in language +models. However, this approach increases the complexity of the learning and +inference because of the numerous additional tokens. +  To address these limitations, we propose in this work a novel framework +BERT4CTR, with the Uni-Attention mechanism that can benefit from the +interactions between non-textual and textual features while maintaining low +time-costs in training and inference through a dimensionality reduction. +Comprehensive experiments on both public and commercial data demonstrate that +BERT4CTR can significantly outperform the state-of-the-art frameworks to handle +multi-modal inputs and be applicable to CTR prediction. + +
+
+
+
+
+ + ♻ ☆ Contrastive Counterfactual Learning for Causality-aware Interpretable + Recommender Systems + + +
+ The field of generating recommendations within the framework of causal +inference has seen a recent surge, with recommendations being likened to +treatments. This approach enhances insights into the influence of +recommendations on user behavior and helps in identifying the underlying +factors. Existing research has often leveraged propensity scores to mitigate +bias, albeit at the risk of introducing additional variance. Others have +explored the use of unbiased data from randomized controlled trials, although +this comes with assumptions that may prove challenging in practice. In this +paper, we first present the causality-aware interpretation of recommendations +and reveal how the underlying exposure mechanism can bias the maximum +likelihood estimation (MLE) of observational feedback. Recognizing that +confounders may be elusive, we propose a contrastive self-supervised learning +to minimize exposure bias, employing inverse propensity scores and expanding +the positive sample set. Building on this foundation, we present a novel +contrastive counterfactual learning method (CCL) that incorporates three unique +positive sampling strategies grounded in estimated exposure probability or +random counterfactual samples. Through extensive experiments on two real-world +datasets, we demonstrate that our CCL outperforms the state-of-the-art methods. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Vague Preference Policy Learning for Multi-round Conversational + Recommendation + + +
+    Conversational recommendation systems (CRS) effectively address information +asymmetry by dynamically eliciting user preferences through multi-turn +interactions. Existing CRS widely assumes that users have clear preferences. +Under this assumption, the agent will completely trust the user feedback and +treat the accepted or rejected signals as strong indicators to filter items and +reduce the candidate space, which may lead to the problem of over-filtering. +However, in reality, users' preferences are often vague and volatile, with +uncertainty about their desires and changing decisions during interactions. +  To address this issue, we introduce a novel scenario called Vague Preference +Multi-round Conversational Recommendation (VPMCR), which considers users' vague +and volatile preferences in CRS. VPMCR employs a soft estimation mechanism to +assign a non-zero confidence score for all candidate items to be displayed, +naturally avoiding the over-filtering problem. In the VPMCR setting, we +introduce a solution called Adaptive Vague Preference Policy Learning (AVPPL), +which consists of two main components: Uncertainty-aware Soft Estimation (USE) +and Uncertainty-aware Policy Learning (UPL). USE estimates the uncertainty of +users' vague feedback and captures their dynamic preferences using a +choice-based preferences extraction module and a time-aware decaying strategy. +UPL leverages the preference distribution estimated by USE to guide the +conversation and adapt to changes in users' preferences to make recommendations +or ask for attributes. +  Our extensive experiments demonstrate the effectiveness of our method in the +VPMCR scenario, highlighting its potential for practical applications and +improving the overall performance and applicability of CRS in real-world +settings, particularly for users with vague or dynamic preferences. + +
+
+
+
+
+ + ♻ ☆ CDR: Conservative Doubly Robust Learning for Debiased Recommendation + + +
+    In recommendation systems (RS), user behavior data is observational rather +than experimental, resulting in widespread bias in the data. Consequently, +tackling bias has emerged as a major challenge in the field of recommendation +systems. Recently, Doubly Robust Learning (DR) has gained significant attention +due to its remarkable performance and robust properties. However, our +experimental findings indicate that existing DR methods are severely impacted +by the presence of so-called Poisonous Imputation, where the imputation +significantly deviates from the truth and becomes counterproductive. +  To address this issue, this work proposes a Conservative Doubly Robust strategy +(CDR), which filters imputations by scrutinizing their mean and variance. +Theoretical analyses show that CDR offers reduced variance and improved tail +bounds. In addition, our experimental investigations illustrate that CDR +significantly enhances performance and can indeed reduce the frequency of +poisonous imputation. + +
+
+
+
+
+
+
+
+ + Machine Learning 38 + +
+
+
+ + ☆ A Model-Agnostic Framework for Recommendation via Interest-aware Item + Embeddings RecSys + + +
+    Item representation holds significant importance in recommendation systems, +which encompass domains such as news, retail, and videos. Retrieval and +ranking models utilise item representation to capture the user-item +relationship based on user behaviours. Existing representation learning +methods primarily focus on optimising item-based mechanisms, such as attention +and sequential modelling. However, these methods lack a modelling mechanism to +directly reflect user interests within the learned item representations. +Consequently, these methods may be less effective in capturing user interests +indirectly. To address this challenge, we propose a novel Interest-aware +Capsule network (IaCN) recommendation model, a model-agnostic framework that +directly learns interest-oriented item representations. IaCN serves as an +auxiliary task, enabling the joint learning of both item-based and +interest-based representations. This framework adopts existing recommendation +models without requiring substantial redesign. We evaluate the proposed +approach on benchmark datasets, exploring various scenarios involving different +deep neural networks, behaviour sequence lengths, and joint learning ratios of +interest-oriented item representations. Experimental results demonstrate +significant performance enhancements across diverse recommendation models, +validating the effectiveness of our approach. + +
+
+ comment: Accepted Paper under LBR track in the Seventeenth ACM Conference on + Recommender Systems (RecSys) 2023 +
+
+
+
+
+ + ☆ TinyProp -- Adaptive Sparse Backpropagation for Efficient TinyML + On-device Learning + + +
+    Training deep neural networks using backpropagation is very memory and +computationally intensive. This makes it difficult to run on-device learning or +fine-tune neural networks on tiny, embedded devices such as low-power +micro-controller units (MCUs). Sparse backpropagation algorithms try to reduce +the computational load of on-device learning by training only a subset of the +weights and biases. Existing approaches use a static number of weights to +train. A poor choice of this so-called backpropagation ratio either limits the +computational gain or can lead to severe accuracy losses. In this paper we +present TinyProp, the first sparse backpropagation method that dynamically +adapts the back-propagation ratio during on-device training for each training +step. TinyProp induces a small calculation overhead to sort the elements of the +gradient, which does not significantly impact the computational gains. TinyProp +works particularly well on fine-tuning trained networks on MCUs, which is a +typical use case for embedded applications. On three typical +datasets, MNIST, DCASE2020 and CIFAR10, we are 5 times faster compared to +non-sparse training with an accuracy loss of on average 1%. On average, +TinyProp is 2.9 times faster than existing, static sparse backpropagation +algorithms and the accuracy loss is reduced on average by 6% compared to a +typical static setting of the back-propagation ratio. + +
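A toy sketch of per-step sparse updates is shown below; how TinyProp actually derives the ratio for each training step is not reproduced here, so `adapt_ratio` is only a stand-in heuristic.

```python
import numpy as np

def sparse_update(weights, grad, bp_ratio, lr=0.01):
    """Apply only the largest-magnitude fraction `bp_ratio` of the gradient,
    leaving the remaining weights untouched for this step."""
    k = max(1, int(bp_ratio * grad.size))
    flat = np.abs(grad).ravel()
    threshold = np.partition(flat, -k)[-k]      # k-th largest gradient magnitude
    mask = np.abs(grad) >= threshold
    return weights - lr * grad * mask

def adapt_ratio(loss, base_ratio=0.1, scale=0.05):
    """Toy adaptation rule: train a larger share of weights when the loss is high."""
    return float(np.clip(base_ratio + scale * loss, 0.01, 1.0))
```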
+
+ comment: 7 Pages, AIPE Conference 2023 +
+
+
+
+
+ + ☆ Polynomial Bounds for Learning Noisy Optical Physical Unclonable + Functions and Connections to Learning With Errors + + +
+    It is shown that a class of optical physical unclonable functions (PUFs) can +be learned to arbitrary precision with arbitrarily high probability, even in +the presence of noise, given access to polynomially many challenge-response +pairs and polynomially bounded computational power, under mild assumptions +about the distributions of the noise and challenge vectors. This extends the +results of R\"uhrmair et al. (2013), who showed a subset of this class of PUFs +to be learnable in polynomial time in the absence of noise, under the +assumption that the optics of the PUF were either linear or had negligible +nonlinear effects. We derive polynomial bounds for the required number of +samples and the computational complexity of a linear regression algorithm, +based on size parameters of the PUF, the distributions of the challenge and +noise vectors, and the probability and accuracy of the regression algorithm, +with a similar analysis to one done by Bootle et al. (2018), who demonstrated a +learning attack on a poorly implemented version of the Learning With Errors +problem. + +
+
+ comment: 10 pages, 2 figures, submitted to IEEE Transactions on Information + Forensics and Security +
+
+
+
+
+ + ☆ Half-Hop: A graph upsampling approach for slowing down message passing ICML 2023 + + +
+ Message passing neural networks have shown a lot of success on +graph-structured data. However, there are many instances where message passing +can lead to over-smoothing or fail when neighboring nodes belong to different +classes. In this work, we introduce a simple yet general framework for +improving learning in message passing neural networks. Our approach essentially +upsamples edges in the original graph by adding "slow nodes" at each edge that +can mediate communication between a source and a target node. Our method only +modifies the input graph, making it plug-and-play and easy to use with existing +models. To understand the benefits of slowing down message passing, we provide +theoretical and empirical analyses. We report results on several supervised and +self-supervised benchmarks, and show improvements across the board, notably in +heterophilic conditions where adjacent nodes are more likely to have different +labels. Finally, we show how our approach can be used to generate augmentations +for self-supervised learning, where slow nodes are randomly introduced into +different edges in the graph to generate multi-scale views with variable path +lengths. + +
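A minimal sketch of the edge-upsampling idea is shown below, assuming a NumPy `edge_index` of shape (2, E); the slow-node feature initialisation and the edge directions are illustrative choices, not necessarily the paper's.

```python
import numpy as np

def half_hop(x, edge_index, p=1.0, rng=np.random.default_rng(0)):
    """With probability `p`, replace edge (u, v) by u -> s and s -> v, where s is
    a new "slow node" whose feature is the mean of its endpoints."""
    n = x.shape[0]
    new_x, new_edges = [x], []
    for u, v in edge_index.T:
        if rng.random() < p:
            s = n + len(new_x) - 1            # index of the freshly created slow node
            new_x.append(0.5 * (x[u] + x[v])[None, :])
            new_edges += [(u, s), (s, v)]
        else:
            new_edges.append((u, v))
    return np.vstack(new_x), np.array(new_edges, dtype=int).T
```

Sampling `p` per edge (or per augmentation view) gives the multi-scale views with variable path lengths mentioned for self-supervised learning.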
+
+ comment: Published as a conference paper at ICML 2023 +
+
+
+
+
+ + ☆ A Comparative Study of Text Embedding Models for Semantic Text + Similarity in Bug Reports + + +
+ Bug reports are an essential aspect of software development, and it is +crucial to identify and resolve them quickly to ensure the consistent +functioning of software systems. Retrieving similar bug reports from an +existing database can help reduce the time and effort required to resolve bugs. +In this paper, we compared the effectiveness of semantic textual similarity +methods for retrieving similar bug reports based on a similarity score. We +explored several embedding models such as TF-IDF (Baseline), FastText, Gensim, +BERT, and ADA. We used the Software Defects Data containing bug reports for +various software projects to evaluate the performance of these models. Our +experimental results showed that BERT generally outperformed the rest of the +models regarding recall, followed by ADA, Gensim, FastText, and TFIDF. Our +study provides insights into the effectiveness of different embedding methods +for retrieving similar bug reports and highlights the impact of selecting the +appropriate one for this task. Our code is available on GitHub. + +
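+
+ A small sketch of the TF-IDF baseline described above: bug reports and a
+ query are embedded with scikit-learn's TfidfVectorizer and ranked by cosine
+ similarity. The example reports are made up for illustration.
+
+ ```python
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ reports = [
+     "App crashes when opening the settings page",
+     "Crash on launch after updating to version 2.1",
+     "Login button does nothing on Android 12",
+ ]
+ query = "Application crashes while navigating to settings"
+
+ vectorizer = TfidfVectorizer()
+ matrix = vectorizer.fit_transform(reports + [query])
+ scores = cosine_similarity(matrix[-1], matrix[:-1]).ravel()
+
+ for idx in scores.argsort()[::-1]:      # most similar report first
+     print(f"{scores[idx]:.3f}  {reports[idx]}")
+ ```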
+
+ comment: 7 Pages +
+
+
+
+
+ + ☆ Regularizing Adversarial Imitation Learning Using Causal Invariance ICML 2023 + + +
+ Imitation learning methods are used to infer a policy in a Markov decision
+ process from a dataset of expert demonstrations by minimizing a divergence
+ measure between the empirical state occupancy measures of the expert and the
+ policy. The guiding signal to the policy is provided by the discriminator
+ used as part of an adversarial optimization procedure. We observe that this
+ model is prone to absorbing spurious correlations present in the expert data.
+ To alleviate this issue, we propose to use causal invariance as a
+ regularization principle for adversarial training of these models. The
+ regularization objective is applicable in a straightforward manner to
+ existing adversarial imitation frameworks. We demonstrate the efficacy of the
+ regularized formulation in an illustrative two-dimensional setting as well as
+ a number of high-dimensional robot locomotion benchmark tasks.
+
+
+ comment: Published at the ICML 2023 Workshop on Spurious Correlations, + Invariance, and Stability +
+
+
+
+
+ + ☆ Distributed Extra-gradient with Optimal Complexity and Communication + Guarantees ICLR 2023 + + +
+ We consider monotone variational inequality (VI) problems in multi-GPU
+ settings where multiple processors/workers/clients have access to local
+ stochastic dual vectors. This setting includes a broad range of important
+ problems from distributed convex minimization to min-max problems and games.
+ Extra-gradient, which is a de facto algorithm for monotone VI problems, has
+ not been designed to be communication-efficient. To this end, we propose a
+ quantized generalized extra-gradient (Q-GenX), which is an unbiased and
+ adaptive compression method tailored to solve VIs. We provide an adaptive
+ step-size rule, which adapts to the respective noise profiles at hand and
+ achieves a fast rate of ${\mathcal O}(1/T)$ under relative noise and an
+ order-optimal ${\mathcal O}(1/\sqrt{T})$ under absolute noise, and we show
+ that distributed training accelerates convergence. Finally, we validate our
+ theoretical results by providing real-world experiments and training
+ generative adversarial networks on multiple GPUs.
+
+
+ comment: International Conference on Learning Representations (ICLR 2023) +
+
+
+
+
+ + ☆ RatGPT: Turning online LLMs into Proxies for Malware Attacks + + +
+ The evolution of Generative AI and the capabilities of the newly released
+ Large Language Models (LLMs) open new opportunities in software engineering.
+ However, they also lead to new challenges in cybersecurity. Recently,
+ researchers have shown the possibilities of using LLMs such as ChatGPT to
+ generate malicious content that can directly be exploited or guide
+ inexperienced hackers to weaponize tools and code. Those studies covered
+ scenarios that still require the attacker to be in the middle of the loop. In
+ this study, we leverage openly available plugins and use an LLM as a proxy
+ between the attacker and the victim. We deliver a proof-of-concept where
+ ChatGPT is used for the dissemination of malicious software while evading
+ detection, alongside establishing communication with a command and control
+ (C2) server to receive commands to interact with a victim's system. Finally,
+ we present the general approach as well as essential elements in order to
+ stay undetected and make the attack a success. This proof-of-concept
+ highlights significant cybersecurity issues with openly available plugins and
+ LLMs, which require the development of security guidelines, controls, and
+ mitigation strategies.
+
+
+
+
+
+ + ☆ Diversifying AI: Towards Creative Chess with AlphaZero + + +
+ In recent years, Artificial Intelligence (AI) systems have surpassed human +intelligence in a variety of computational tasks. However, AI systems, like +humans, make mistakes, have blind spots, hallucinate, and struggle to +generalize to new situations. This work explores whether AI can benefit from +creative decision-making mechanisms when pushed to the limits of its +computational rationality. In particular, we investigate whether a team of +diverse AI systems can outperform a single AI in challenging tasks by +generating more ideas as a group and then selecting the best ones. We study +this question in the game of chess, the so-called drosophila of AI. We build on +AlphaZero (AZ) and extend it to represent a league of agents via a +latent-conditioned architecture, which we call AZ_db. We train AZ_db to +generate a wider range of ideas using behavioral diversity techniques and +select the most promising ones with sub-additive planning. Our experiments +suggest that AZ_db plays chess in diverse ways, solves more puzzles as a group +and outperforms a more homogeneous team. Notably, AZ_db solves twice as many +challenging puzzles as AZ, including the challenging Penrose positions. When +playing chess from different openings, we notice that players in AZ_db +specialize in different openings, and that selecting a player for each opening +using sub-additive planning results in a 50 Elo improvement over AZ. Our +findings suggest that diversity bonuses emerge in teams of AI agents, just as +they do in teams of humans and that diversity is a valuable asset in solving +computationally hard problems. + +
+
+
+
+
+ + ☆ Forensic Data Analytics for Anomaly Detection in Evolving Networks + + +
+ In the prevailing convergence of traditional infrastructure-based deployment +(i.e., Telco and industry operational networks) towards evolving deployments +enabled by 5G and virtualization, there is a keen interest in elaborating +effective security controls to protect these deployments in-depth. By +considering key enabling technologies like 5G and virtualization, evolving +networks are democratized, facilitating the establishment of point presences +integrating different business models ranging from media, dynamic web content, +gaming, and a plethora of IoT use cases. Despite the increasing services +provided by evolving networks, many cybercrimes and attacks have been launched +in evolving networks to perform malicious activities. Due to the limitations of +traditional security artifacts (e.g., firewalls and intrusion detection +systems), the research on digital forensic data analytics has attracted more +attention. Digital forensic analytics enables people to derive detailed +information and comprehensive conclusions from different perspectives of +cybercrimes to assist in convicting criminals and preventing future crimes. +This chapter presents a digital analytics framework for network anomaly +detection, including multi-perspective feature engineering, unsupervised +anomaly detection, and comprehensive result correction procedures. Experiments +on real-world evolving network data show the effectiveness of the proposed +forensic data analytics solution. + +
+
+ comment: Electronic version of an article published as [Book Series: World
+ Scientific Series in Digital Forensics and Cybersecurity, Volume 2,
+ Innovations in Digital Forensics, 2023, Pages 99-137]
+ [DOI:10.1142/9789811273209_0004] © World Scientific Publishing Company
+ [https://doi.org/10.1142/9789811273209_0004]
+
+
+
+
+
+ + ☆ FedPerfix: Towards Partial Model Personalization of Vision Transformers + in Federated Learning ICCV + + +
+ Personalized Federated Learning (PFL) represents a promising solution for +decentralized learning in heterogeneous data environments. Partial model +personalization has been proposed to improve the efficiency of PFL by +selectively updating local model parameters instead of aggregating all of them. +However, previous work on partial model personalization has mainly focused on +Convolutional Neural Networks (CNNs), leaving a gap in understanding how it can +be applied to other popular models such as Vision Transformers (ViTs). In this +work, we investigate where and how to partially personalize a ViT model. +Specifically, we empirically evaluate the sensitivity to data distribution of +each type of layer. Based on the insights that the self-attention layer and the +classification head are the most sensitive parts of a ViT, we propose a novel +approach called FedPerfix, which leverages plugins to transfer information from +the aggregated model to the local client as a personalization. Finally, we +evaluate the proposed approach on CIFAR-100, OrganAMNIST, and Office-Home +datasets and demonstrate its effectiveness in improving the model's performance +compared to several advanced PFL methods. + +
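+
+ A schematic sketch of the partial-personalization split: parameters whose
+ names suggest self-attention blocks or the classification head stay on the
+ client, while everything else is aggregated with plain FedAvg. The name
+ patterns and the averaging rule are illustrative assumptions, not FedPerfix's
+ actual plugin mechanism.
+
+ ```python
+ import torch
+
+ PERSONAL_KEYS = ("attn", "head")   # self-attention blocks + classifier head
+
+ def split_params(state_dict):
+     """Split a client state_dict into personalized and shared parameters."""
+     personal = {k: v for k, v in state_dict.items()
+                 if any(key in k for key in PERSONAL_KEYS)}
+     shared = {k: v for k, v in state_dict.items() if k not in personal}
+     return personal, shared
+
+ def fedavg(shared_dicts):
+     """Average the shared parameters received from all clients."""
+     keys = shared_dicts[0].keys()
+     return {k: torch.stack([d[k].float() for d in shared_dicts]).mean(dim=0)
+             for k in keys}
+ ```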
+
+ comment: 2023 IEEE/CVF International Conference on Computer Vision (ICCV) +
+
+
+
+
+ + ☆ ZhiJian: A Unifying and Rapidly Deployable Toolbox for Pre-trained Model + Reuse + + +
+ The rapid expansion of foundation pre-trained models and their fine-tuned +counterparts has significantly contributed to the advancement of machine +learning. Leveraging pre-trained models to extract knowledge and expedite +learning in real-world tasks, known as "Model Reuse", has become crucial in +various applications. Previous research focuses on reusing models within a +certain aspect, including reusing model weights, structures, and hypothesis +spaces. This paper introduces ZhiJian, a comprehensive and user-friendly +toolbox for model reuse, utilizing the PyTorch backend. ZhiJian presents a +novel paradigm that unifies diverse perspectives on model reuse, encompassing +target architecture construction with PTM, tuning target model with PTM, and +PTM-based inference. This empowers deep learning practitioners to explore +downstream tasks and identify the complementary advantages among different +methods. ZhiJian is readily accessible at +https://github.com/zhangyikaii/lamda-zhijian facilitating seamless utilization +of pre-trained models and streamlining the model reuse process for researchers +and developers. + +
+
+
+
+
+ + ☆ Accurate machine learning force fields via experimental and simulation + data fusion + + +
+ Machine Learning (ML)-based force fields are attracting ever-increasing +interest due to their capacity to span spatiotemporal scales of classical +interatomic potentials at quantum-level accuracy. They can be trained based on +high-fidelity simulations or experiments, the former being the common case. +However, both approaches are impaired by scarce and erroneous data resulting in +models that either do not agree with well-known experimental observations or +are under-constrained and only reproduce some properties. Here we leverage both +Density Functional Theory (DFT) calculations and experimentally measured +mechanical properties and lattice parameters to train an ML potential of +titanium. We demonstrate that the fused data learning strategy can concurrently +satisfy all target objectives, thus resulting in a molecular model of higher +accuracy compared to the models trained with a single data source. The +inaccuracies of DFT functionals at target experimental properties were +corrected, while the investigated off-target properties remained largely +unperturbed. Our approach is applicable to any material and can serve as a +general strategy to obtain highly accurate ML potentials. + +
+
+
+
+
+ + ☆ RTB Formulation Using Point Process + + +
+ We propose a general stochastic framework for modelling repeated auctions in
+ the Real Time Bidding (RTB) ecosystem using point processes. The flexibility
+ of the framework allows a variety of auction scenarios, including
+ configuration of the information provided to the player, determination of
+ the auction winner and quantification of the utility gained from each
+ auction. We provide theoretical results on how this process formulation can
+ be approximated by a Poisson point process, which enables the analyst to
+ take advantage of its well-established properties. Under this framework, we
+ specify the player's optimal strategy under various scenarios. We also
+ emphasize that it is critical to consider the joint distribution of utility
+ and market condition instead of estimating the marginal distributions
+ independently.
+
+
+
+
+
+ + ☆ Multi-fidelity Fourier Neural Operator for Fast Modeling of Large-Scale + Geological Carbon Storage + + +
+ Deep learning-based surrogate models have been widely applied in geological
+ carbon storage (GCS) problems to accelerate the prediction of reservoir
+ pressure and CO2 plume migration. Large amounts of data from physics-based
+ numerical simulators are required to train a model to accurately predict the
+ complex physical behaviors associated with this process. In practice, the
+ available training data are always limited in large-scale 3D problems due to
+ the high computational cost. Therefore, we propose to use a multi-fidelity
+ Fourier Neural Operator to solve large-scale GCS problems with more
+ affordable multi-fidelity training datasets. The Fourier Neural Operator has
+ a desirable grid-invariant property, which simplifies the transfer learning
+ procedure between datasets with different discretizations. We first test the
+ model efficacy on a GCS reservoir model discretized into 110k grid cells.
+ The multi-fidelity model can predict with accuracy comparable to that of a
+ high-fidelity model trained with the same amount of high-fidelity data, at
+ 81% lower data generation cost. We further test the generalizability of the
+ multi-fidelity model on the same reservoir model with a finer discretization
+ of 1 million grid cells. This case was made more challenging by employing
+ high-fidelity and low-fidelity datasets generated by different geostatistical
+ models and reservoir simulators. We observe that the multi-fidelity FNO model
+ can predict pressure fields with reasonable accuracy even when the
+ high-fidelity data are extremely limited.
+
+
+
+
+
+ + ☆ Learning Lightweight Object Detectors via Multi-Teacher Progressive + Distillation ICML 2023 + + +
+ Resource-constrained perception systems such as edge computing and +vision-for-robotics require vision models to be both accurate and lightweight +in computation and memory usage. While knowledge distillation is a proven +strategy to enhance the performance of lightweight classification models, its +application to structured outputs like object detection and instance +segmentation remains a complicated task, due to the variability in outputs and +complex internal network modules involved in the distillation process. In this +paper, we propose a simple yet surprisingly effective sequential approach to +knowledge distillation that progressively transfers the knowledge of a set of +teacher detectors to a given lightweight student. To distill knowledge from a +highly accurate but complex teacher model, we construct a sequence of teachers +to help the student gradually adapt. Our progressive strategy can be easily +combined with existing detection distillation mechanisms to consistently +maximize student performance in various settings. To the best of our knowledge, +we are the first to successfully distill knowledge from Transformer-based +teacher detectors to convolution-based students, and unprecedentedly boost the +performance of ResNet-50 based RetinaNet from 36.5% to 42.0% AP and Mask R-CNN +from 38.2% to 42.5% AP on the MS COCO benchmark. + +
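+
+ A schematic of sequential multi-teacher distillation: the student is trained
+ against each teacher in turn, ordered from the weakest to the strongest. The
+ classification-style KD loss is a stand-in; the paper distills detectors,
+ whose distillation losses act on structured outputs.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def progressive_distill(student, teachers, loader, epochs_per_teacher=1,
+                         temperature=2.0, lr=1e-3):
+     """Distill a sequence of teachers (weakest to strongest) into the student."""
+     opt = torch.optim.Adam(student.parameters(), lr=lr)
+     for teacher in teachers:
+         teacher.eval()
+         for _ in range(epochs_per_teacher):
+             for x, _ in loader:
+                 with torch.no_grad():
+                     t_logits = teacher(x)
+                 s_logits = student(x)
+                 loss = F.kl_div(
+                     F.log_softmax(s_logits / temperature, dim=-1),
+                     F.softmax(t_logits / temperature, dim=-1),
+                     reduction="batchmean",
+                 ) * temperature ** 2
+                 opt.zero_grad()
+                 loss.backward()
+                 opt.step()
+     return student
+ ```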
+
+ comment: ICML 2023 +
+
+
+
+
+ + ☆ A comprehensive study of spike and slab shrinkage priors for + structurally sparse Bayesian neural networks + + +
+ Network complexity and computational efficiency have become increasingly +significant aspects of deep learning. Sparse deep learning addresses these +challenges by recovering a sparse representation of the underlying target +function by reducing heavily over-parameterized deep neural networks. +Specifically, deep neural architectures compressed via structured sparsity +(e.g. node sparsity) provide low latency inference, higher data throughput, and +reduced energy consumption. In this paper, we explore two well-established +shrinkage techniques, Lasso and Horseshoe, for model compression in Bayesian +neural networks. To this end, we propose structurally sparse Bayesian neural +networks which systematically prune excessive nodes with (i) Spike-and-Slab +Group Lasso (SS-GL), and (ii) Spike-and-Slab Group Horseshoe (SS-GHS) priors, +and develop computationally tractable variational inference including +continuous relaxation of Bernoulli variables. We establish the contraction +rates of the variational posterior of our proposed models as a function of the +network topology, layer-wise node cardinalities, and bounds on the network +weights. We empirically demonstrate the competitive performance of our models +compared to the baseline models in prediction accuracy, model compression, and +inference latency. + +
+
+
+
+
+ + ☆ Modeling Edge Features with Deep Bayesian Graph Networks + + +
+ We propose an extension of the Contextual Graph Markov Model, a deep and +probabilistic machine learning model for graphs, to model the distribution of +edge features. Our approach is architectural, as we introduce an additional +Bayesian network mapping edge features into discrete states to be used by the +original model. In doing so, we are also able to build richer graph +representations even in the absence of edge features, which is confirmed by the +performance improvements on standard graph classification benchmarks. Moreover, +we successfully test our proposal in a graph regression scenario where edge +features are of fundamental importance, and we show that the learned edge +representation provides substantial performance improvements against the +original model on three link prediction tasks. By keeping the computational +complexity linear in the number of edges, the proposed model is amenable to +large-scale graph processing. + +
+
+ comment: Releasing pre-print version to comply with TAILOR project + requirements +
+
+
+
+
+ + ☆ Embracing assay heterogeneity with neural processes for markedly + improved bioactivity predictions + + +
+ Predicting the bioactivity of a ligand is one of the hardest and most +important challenges in computer-aided drug discovery. Despite years of data +collection and curation efforts by research organizations worldwide, +bioactivity data remains sparse and heterogeneous, thus hampering efforts to +build predictive models that are accurate, transferable and robust. The +intrinsic variability of the experimental data is further compounded by data +aggregation practices that neglect heterogeneity to overcome sparsity. Here we +discuss the limitations of these practices and present a hierarchical +meta-learning framework that exploits the information synergy across disparate +assays by successfully accounting for assay heterogeneity. We show that the +model achieves a drastic improvement in affinity prediction across diverse +protein targets and assay types compared to conventional baselines. It can +quickly adapt to new target contexts using very few observations, thus enabling +large-scale virtual screening in early-phase drug discovery. + +
+
+
+
+
+ + ☆ MovePose: A High-performance Human Pose Estimation Algorithm on Mobile + and Edge Devices + + +
+ We present MovePose, an optimized lightweight convolutional neural network
+ designed specifically for real-time body pose estimation on CPU-based mobile
+ devices. Current solutions do not provide satisfactory accuracy and speed for
+ human posture estimation, and MovePose addresses this gap. It aims to
+ maintain real-time performance while improving the accuracy of human posture
+ estimation for mobile devices. The network produces 17 keypoints for each
+ individual at a rate exceeding 11 frames per second, making it suitable for
+ real-time applications such as fitness tracking, sign language
+ interpretation, and advanced mobile human posture estimation. Our MovePose
+ algorithm attained a Mean Average Precision (mAP) score of 67.7 on the COCO
+ \cite{cocodata} validation dataset. The MovePose algorithm ran efficiently at
+ 69+ frames per second (fps) on an Intel i9-10920x CPU and at 452+ fps on an
+ NVIDIA RTX3090 GPU. On an Android phone equipped with a Snapdragon 8 + 4G
+ processor, the fps reached above 11. To enhance accuracy, we incorporated
+ three techniques: deconvolution, large kernel convolution, and coordinate
+ classification methods. Compared to basic upsampling, deconvolution is
+ trainable, improves model capacity, and enhances the receptive field. Large
+ kernel convolution strengthens these properties at a decreased computational
+ cost. In summary, MovePose provides high accuracy and real-time performance,
+ making it a potential tool for a variety of applications, including those
+ focused on mobile-side human posture estimation. The code and models for this
+ algorithm will be made publicly accessible.
+
+
+
+
+
+ + ☆ Over-the-Air Computation Aided Federated Learning with the Aggregation + of Normalized Gradient + + +
+ Over-the-air computation is a communication-efficient solution for federated
+ learning (FL). In such a system, an iterative procedure is performed: the
+ local gradient of the private loss function is updated, amplified and then
+ transmitted by every mobile device; the server receives the aggregated
+ gradient all at once, generates and then broadcasts updated model parameters
+ to every mobile device. In terms of amplification factor selection, most
+ related works assume the local gradient always attains its maximal norm,
+ although it actually fluctuates over iterations, which may degrade
+ convergence performance. To circumvent this problem, we propose to normalize
+ the local gradient before amplifying it. When the loss function is smooth, we
+ prove that our proposed method converges to a stationary point at a
+ sub-linear rate. For smooth and strongly convex loss functions, we prove that
+ our proposed method achieves the minimal training loss at a linear rate for
+ any small positive tolerance. Moreover, a tradeoff between the convergence
+ rate and the tolerance is discovered. To speed up convergence, problems
+ optimizing the system parameters are also formulated for the above two cases.
+ Although these problems are non-convex, optimal solutions with polynomial
+ complexity are derived. Experimental results show that our proposed method
+ outperforms benchmark methods in terms of convergence performance.
+
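+
+ A toy NumPy sketch of the aggregation idea: each device transmits its
+ normalized local gradient, the channel sums the transmissions, and the server
+ averages the noisy sum before updating the model. Power control and fading
+ are simplified away, and the noise level is an arbitrary placeholder.
+
+ ```python
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ num_devices, dim = 8, 5
+ local_grads = [rng.normal(size=dim) for _ in range(num_devices)]
+
+ # each device normalizes its gradient before amplification/transmission
+ sent = [g / (np.linalg.norm(g) + 1e-12) for g in local_grads]
+
+ noise = rng.normal(scale=0.01, size=dim)      # additive channel noise
+ received = np.sum(sent, axis=0) + noise       # over-the-air summation
+ aggregated = received / num_devices           # server-side estimate
+
+ theta = np.zeros(dim)
+ theta -= 0.1 * aggregated                     # broadcast model update
+ ```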
+
+
+
+
+ + ♻ ☆ Robust Multi-Task Learning and Online Refinement for Spacecraft Pose + Estimation across Domain Gap + + +
+ This work presents Spacecraft Pose Network v2 (SPNv2), a Convolutional Neural +Network (CNN) for pose estimation of noncooperative spacecraft across domain +gap. SPNv2 is a multi-scale, multi-task CNN which consists of a shared +multi-scale feature encoder and multiple prediction heads that perform +different tasks on a shared feature output. These tasks are all related to +detection and pose estimation of a target spacecraft from an image, such as +prediction of pre-defined satellite keypoints, direct pose regression, and +binary segmentation of the satellite foreground. It is shown that by jointly +training on different yet related tasks with extensive data augmentations on +synthetic images only, the shared encoder learns features that are common +across image domains that have fundamentally different visual characteristics +compared to synthetic images. This work also introduces Online Domain +Refinement (ODR) which refines the parameters of the normalization layers of +SPNv2 on the target domain images online at deployment. Specifically, ODR +performs self-supervised entropy minimization of the predicted satellite +foreground, thereby improving the CNN's performance on the target domain images +without their pose labels and with minimal computational efforts. The GitHub +repository for SPNv2 is available at https://github.com/tpark94/spnv2. + +
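+
+ A minimal sketch of the online refinement idea: only the parameters of the
+ normalization layers are optimized, using the entropy of the predicted
+ foreground mask as a self-supervised loss on unlabeled target-domain images.
+ The assumption that the model returns per-pixel foreground logits is for
+ illustration only.
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ def norm_parameters(model):
+     """Collect the trainable parameters of all normalization layers."""
+     params = []
+     for m in model.modules():
+         if isinstance(m, (nn.BatchNorm2d, nn.LayerNorm, nn.GroupNorm)):
+             params += [p for p in m.parameters() if p.requires_grad]
+     return params
+
+ def refine_online(model, target_batches, lr=1e-4):
+     opt = torch.optim.Adam(norm_parameters(model), lr=lr)
+     model.train()
+     for x in target_batches:                  # unlabeled target-domain images
+         logits = model(x)                     # assumed per-pixel foreground logits
+         p = torch.sigmoid(logits)
+         entropy = -(p * torch.log(p + 1e-8)
+                     + (1 - p) * torch.log(1 - p + 1e-8)).mean()
+         opt.zero_grad()
+         entropy.backward()
+         opt.step()
+     return model
+ ```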
+
+ comment: Accepted to Advances in Space Research; fixed error on reporting + translation from heatmaps +
+
+
+
+
+ + ♻ ☆ Enhance Diffusion to Improve Robust Generalization KDD 2023 + + +
+ Deep neural networks are susceptible to human imperceptible adversarial +perturbations. One of the strongest defense mechanisms is \emph{Adversarial +Training} (AT). In this paper, we aim to address two predominant problems in +AT. First, there is still little consensus on how to set hyperparameters with a +performance guarantee for AT research, and customized settings impede a fair +comparison between different model designs in AT research. Second, the robustly +trained neural networks struggle to generalize well and suffer from tremendous +overfitting. This paper focuses on the primary AT framework - Projected +Gradient Descent Adversarial Training (PGD-AT). We approximate the dynamic of +PGD-AT by a continuous-time Stochastic Differential Equation (SDE), and show +that the diffusion term of this SDE determines the robust generalization. An +immediate implication of this theoretical finding is that robust generalization +is positively correlated with the ratio between learning rate and batch size. +We further propose a novel approach, \emph{Diffusion Enhanced Adversarial +Training} (DEAT), to manipulate the diffusion term to improve robust +generalization with virtually no extra computational burden. We theoretically +show that DEAT obtains a tighter generalization bound than PGD-AT. Our +empirical investigation is extensive and firmly attests that DEAT universally +outperforms PGD-AT by a significant margin. + +
+
+ comment: Accepted at KDD 2023 +
+
+
+
+
+ + ♻ ☆ Robust Framework for Explanation Evaluation in Time Series + Classification + + +
+ Time series classification is a task which deals with a prevalent data type +in domains such as human activity recognition, sports analytics and general +healthcare. This paper provides a framework to quantitatively evaluate and rank +explanation methods for time series classification. The recent interest in +explanation methods for time series has provided a great variety of explanation +techniques. Nevertheless, when the explanations disagree on a specific problem, +it remains unclear which of them to use. Comparing multiple explanations to +find the right answer is non-trivial. Two key challenges remain: how to +quantitatively and robustly evaluate the informativeness of a given explanation +method (i.e., relevance for the classification task), and how to compare +explanation methods side-by-side. We propose AMEE, a robust Model-Agnostic +Explanation Evaluation framework for quantifying and comparing multiple +saliency-based explanations for time series classification. Data perturbation +is added to the input time series guided by the saliency maps. The impact of +perturbation on classification accuracy is measured and used for explanation +evaluation. The results show that perturbing discriminative parts of the time +series leads to significant changes in classification accuracy. To be robust to +different types of perturbations and different types of classifiers, we +aggregate the accuracy loss across perturbations and classifiers. This allows +us to objectively quantify and rank different explanation methods. We provide a +quantitative and qualitative analysis for synthetic datasets, a variety of +time-series datasets, as well as a real-world dataset with known expert ground +truth. + +
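+
+ A simplified sketch of the evaluation idea: the most salient fraction of each
+ series is replaced by noise and the resulting accuracy drop is taken as the
+ explanation's score. The full framework aggregates this over several
+ perturbation types and classifiers, which is reduced here to a single
+ perturbation and a single classifier.
+
+ ```python
+ import numpy as np
+
+ def accuracy(clf, X, y):
+     return float((clf.predict(X) == y).mean())
+
+ def explanation_score(clf, X, y, saliency, frac=0.1, seed=0):
+     """Accuracy loss when the top-`frac` salient time steps are perturbed."""
+     rng = np.random.default_rng(seed)
+     Xp = X.copy()
+     k = max(1, int(frac * X.shape[1]))
+     for i in range(X.shape[0]):
+         idx = np.argsort(saliency[i])[-k:]          # most salient time steps
+         Xp[i, idx] = rng.normal(scale=X[i].std(), size=k)
+     return accuracy(clf, X, y) - accuracy(clf, Xp, y)
+ ```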
+
+ comment: Pre-print +
+
+
+
+
+ + ♻ ☆ Parallel and Distributed Graph Neural Networks: An In-Depth Concurrency + Analysis + + +
+ Graph neural networks (GNNs) are among the most powerful tools in deep +learning. They routinely solve complex problems on unstructured networks, such +as node classification, graph classification, or link prediction, with high +accuracy. However, both inference and training of GNNs are complex, and they +uniquely combine the features of irregular graph processing with dense and +regular computations. This complexity makes it very challenging to execute GNNs +efficiently on modern massively parallel architectures. To alleviate this, we +first design a taxonomy of parallelism in GNNs, considering data and model +parallelism, and different forms of pipelining. Then, we use this taxonomy to +investigate the amount of parallelism in numerous GNN models, GNN-driven +machine learning tasks, software frameworks, or hardware accelerators. We use +the work-depth model, and we also assess communication volume and +synchronization. We specifically focus on the sparsity/density of the +associated tensors, in order to understand how to effectively apply techniques +such as vectorization. We also formally analyze GNN pipelining, and we +generalize the established Message-Passing class of GNN models to cover +arbitrary pipeline depths, facilitating future optimizations. Finally, we +investigate different forms of asynchronicity, navigating the path for future +asynchronous parallel GNN pipelines. The outcomes of our analysis are +synthesized in a set of insights that help to maximize GNN performance, and a +comprehensive list of challenges and opportunities for further research into +efficient GNN computations. Our work will help to advance the design of future +GNNs. + +
+
+
+
+
+ + ♻ ☆ Why do networks have inhibitory/negative connections? ICCV2023 + + +
+ Why do brains have inhibitory connections? Why do deep networks have negative +weights? We propose an answer from the perspective of representation capacity. +We believe representing functions is the primary role of both (i) the brain in +natural intelligence, and (ii) deep networks in artificial intelligence. Our +answer to why there are inhibitory/negative weights is: to learn more +functions. We prove that, in the absence of negative weights, neural networks +with non-decreasing activation functions are not universal approximators. While +this may be an intuitive result to some, to the best of our knowledge, there is +no formal theory, in either machine learning or neuroscience, that demonstrates +why negative weights are crucial in the context of representation capacity. +Further, we provide insights on the geometric properties of the representation +space that non-negative deep networks cannot represent. We expect these +insights will yield a deeper understanding of more sophisticated inductive +priors imposed on the distribution of weights that lead to more efficient +biological and machine learning. + +
+
+ comment: ICCV2023 camera-ready +
+
+
+
+
+ + ♻ ☆ MedLens: Improve Mortality Prediction Via Medical Signs Selecting and + Regression + + +
+ Monitoring the health status of patients and predicting mortality in advance +is vital for providing patients with timely care and treatment. Massive medical +signs in electronic health records (EHR) are fitted into advanced machine +learning models to make predictions. However, the data-quality problem of +original clinical signs is less discussed in the literature. Based on an +in-depth measurement of the missing rate and correlation score across various +medical signs and a large amount of patient hospital admission records, we +discovered the comprehensive missing rate is extremely high, and a large number +of useless signs could hurt the performance of prediction models. Then we +concluded that only improving data-quality could improve the baseline accuracy +of different prediction algorithms. We designed MEDLENS, with an automatic +vital medical signs selection approach via statistics and a flexible +interpolation approach for high missing rate time series. After augmenting the +data-quality of original medical signs, MEDLENS applies ensemble classifiers to +boost the accuracy and reduce the computation overhead at the same time. It +achieves a very high accuracy performance of 0.96 AUC-ROC and 0.81 AUC-PR, +which exceeds the previous benchmark. + +
+
+
+
+
+ + ♻ ☆ High-dimensional limit theorems for SGD: Effective dynamics and critical + scaling + + +
+ We study the scaling limits of stochastic gradient descent (SGD) with +constant step-size in the high-dimensional regime. We prove limit theorems for +the trajectories of summary statistics (i.e., finite-dimensional functions) of +SGD as the dimension goes to infinity. Our approach allows one to choose the +summary statistics that are tracked, the initialization, and the step-size. It +yields both ballistic (ODE) and diffusive (SDE) limits, with the limit +depending dramatically on the former choices. We show a critical scaling regime +for the step-size, below which the effective ballistic dynamics matches +gradient flow for the population loss, but at which, a new correction term +appears which changes the phase diagram. About the fixed points of this +effective dynamics, the corresponding diffusive limits can be quite complex and +even degenerate. We demonstrate our approach on popular examples including +estimation for spiked matrix and tensor models and classification via two-layer +networks for binary and XOR-type Gaussian mixture models. These examples +exhibit surprising phenomena including multimodal timescales to convergence as +well as convergence to sub-optimal solutions with probability bounded away from +zero from random (e.g., Gaussian) initializations. At the same time, we +demonstrate the benefit of overparametrization by showing that the latter +probability goes to zero as the second layer width grows. + +
+
+ comment: 43 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Dealing With Non-stationarity in Decentralized Cooperative Multi-Agent + Deep Reinforcement Learning via Multi-Timescale Learning + + +
+ Decentralized cooperative multi-agent deep reinforcement learning (MARL) can
+ be a versatile learning framework, particularly in scenarios where
+ centralized training is either not possible or not practical. One of the
+ critical challenges in decentralized deep MARL is the non-stationarity of the
+ learning environment when multiple agents are learning concurrently. A
+ commonly used and efficient scheme for decentralized MARL is independent
+ learning in which agents concurrently update their policies independently of
+ each other. We first show that independent learning does not always converge,
+ while sequential learning where agents update their policies one after
+ another in a sequence is guaranteed to converge to an agent-by-agent optimal
+ solution. In sequential learning, when one agent updates its policy, all
+ other agents' policies are kept fixed, alleviating the challenge of
+ non-stationarity due to simultaneous updates in other agents' policies.
+ However, it can be slow because only one agent is learning at any time.
+ Therefore, it might also not always be practical. In this work, we propose a
+ decentralized cooperative MARL algorithm based on multi-timescale learning.
+ In multi-timescale learning, all agents learn simultaneously, but at
+ different learning rates. In our proposed method, when one agent updates its
+ policy, other agents are allowed to update their policies as well, but at a
+ slower rate. This speeds up sequential learning, while also minimizing
+ non-stationarity caused by other agents updating concurrently. Multi-timescale
+ learning outperforms state-of-the-art decentralized learning methods on a set
+ of challenging multi-agent cooperative tasks in the EPyMARL (Papoudakis et
+ al., 2020) benchmark. This can be seen as a first step towards more general
+ decentralized cooperative deep MARL methods based on multi-timescale
+ learning.
+
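+
+ A minimal sketch of the multi-timescale idea with per-agent optimizers: every
+ agent updates at every step, but the learning rates differ and the "fast"
+ role rotates among agents. The rotation schedule and the rate ratio are
+ illustrative assumptions, not the paper's exact schedule.
+
+ ```python
+ import torch
+
+ def make_optimizers(agents, slow_lr=1e-4):
+     """One optimizer per agent, all starting at the slow learning rate."""
+     return [torch.optim.Adam(a.parameters(), lr=slow_lr) for a in agents]
+
+ def set_timescales(optimizers, fast_agent, fast_lr=1e-3, slow_lr=1e-4):
+     """Give one agent the fast timescale and keep the others slow."""
+     for i, opt in enumerate(optimizers):
+         for group in opt.param_groups:
+             group["lr"] = fast_lr if i == fast_agent else slow_lr
+
+ # Skeleton of a training loop (losses come from the usual independent-learning
+ # update for each agent):
+ #
+ # optimizers = make_optimizers(agents)
+ # for step in range(total_steps):
+ #     set_timescales(optimizers, fast_agent=(step // rotate_every) % len(agents))
+ #     for agent, opt in zip(agents, optimizers):
+ #         opt.zero_grad(); agent_loss(agent).backward(); opt.step()
+ ```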
+
+
+
+
+ + ♻ ☆ Experimental Design for Causal Effect Identification ICML2022 + + +
+ Pearl's do calculus is a complete axiomatic approach to learn the +identifiable causal effects from observational data. When such an effect is not +identifiable, it is necessary to perform a collection of often costly +interventions in the system to learn the causal effect. In this work, we +consider the problem of designing the collection of interventions with the +minimum cost to identify the desired effect. First, we prove that this problem +is NP-hard, and subsequently propose an algorithm that can either find the +optimal solution or a logarithmic-factor approximation of it. This is done by +establishing a connection between our problem and the minimum hitting set +problem. Additionally, we propose several polynomial-time heuristic algorithms +to tackle the computational complexity of the problem. Although these +algorithms could potentially stumble on sub-optimal solutions, our simulations +show that they achieve small regrets on random graphs. + +
+
+ comment: 53 pages, 13 figures, extending the findings of our ICML2022 paper +
+
+
+
+
+ + ♻ ☆ Targeted Adversarial Attacks on Wind Power Forecasts + + +
+ In recent years, researchers proposed a variety of deep learning models for +wind power forecasting. These models predict the wind power generation of wind +farms or entire regions more accurately than traditional machine learning +algorithms or physical models. However, latest research has shown that deep +learning models can often be manipulated by adversarial attacks. Since wind +power forecasts are essential for the stability of modern power systems, it is +important to protect them from this threat. In this work, we investigate the +vulnerability of two different forecasting models to targeted, semi-targeted, +and untargeted adversarial attacks. We consider a Long Short-Term Memory (LSTM) +network for predicting the power generation of individual wind farms and a +Convolutional Neural Network (CNN) for forecasting the wind power generation +throughout Germany. Moreover, we propose the Total Adversarial Robustness Score +(TARS), an evaluation metric for quantifying the robustness of regression +models to targeted and semi-targeted adversarial attacks. It assesses the +impact of attacks on the model's performance, as well as the extent to which +the attacker's goal was achieved, by assigning a score between 0 (very +vulnerable) and 1 (very robust). In our experiments, the LSTM forecasting model +was fairly robust and achieved a TARS value of over 0.78 for all adversarial +attacks investigated. The CNN forecasting model only achieved TARS values below +0.10 when trained ordinarily, and was thus very vulnerable. Yet, its robustness +could be significantly improved by adversarial training, which always resulted +in a TARS above 0.46. + +
+
+ comment: 21 pages, including appendix, 12 figures +
+
+
+
+
+ + ♻ ☆ Optimal Prediction Using Expert Advice and Randomized Littlestone + Dimension + + +
+ A classical result in online learning characterizes the optimal mistake bound +achievable by deterministic learners using the Littlestone dimension +(Littlestone '88). We prove an analogous result for randomized learners: we +show that the optimal expected mistake bound in learning a class $\mathcal{H}$ +equals its randomized Littlestone dimension, which is the largest $d$ for which +there exists a tree shattered by $\mathcal{H}$ whose average depth is $2d$. We +further study optimal mistake bounds in the agnostic case, as a function of the +number of mistakes made by the best function in $\mathcal{H}$, denoted by $k$. +We show that the optimal randomized mistake bound for learning a class with +Littlestone dimension $d$ is $k + \Theta (\sqrt{k d} + d )$. This also implies +an optimal deterministic mistake bound of $2k + \Theta(d) + O(\sqrt{k d})$, +thus resolving an open question which was studied by Auer and Long ['99]. + As an application of our theory, we revisit the classical problem of +prediction using expert advice: about 30 years ago Cesa-Bianchi, Freund, +Haussler, Helmbold, Schapire and Warmuth studied prediction using expert +advice, provided that the best among the $n$ experts makes at most $k$ +mistakes, and asked what are the optimal mistake bounds. Cesa-Bianchi, Freund, +Helmbold, and Warmuth ['93, '96] provided a nearly optimal bound for +deterministic learners, and left the randomized case as an open problem. We +resolve this question by providing an optimal learning rule in the randomized +case, and showing that its expected mistake bound equals half of the +deterministic bound of Cesa-Bianchi et al. ['93,'96], up to negligible additive +terms. In contrast with previous works by Abernethy, Langford, and Warmuth +['06], and by Br\^anzei and Peres ['19], our result applies to all pairs $n,k$. + +
+
+
+
+
+ + ♻ ☆ Parametric Classification for Generalized Category Discovery: A Baseline + Study ICCV 2023 + + +
+ Generalized Category Discovery (GCD) aims to discover novel categories in +unlabelled datasets using knowledge learned from labelled samples. Previous +studies argued that parametric classifiers are prone to overfitting to seen +categories, and endorsed using a non-parametric classifier formed with +semi-supervised k-means. However, in this study, we investigate the failure of +parametric classifiers, verify the effectiveness of previous design choices +when high-quality supervision is available, and identify unreliable +pseudo-labels as a key problem. We demonstrate that two prediction biases +exist: the classifier tends to predict seen classes more often, and produces an +imbalanced distribution across seen and novel categories. Based on these +findings, we propose a simple yet effective parametric classification method +that benefits from entropy regularisation, achieves state-of-the-art +performance on multiple GCD benchmarks and shows strong robustness to unknown +class numbers. We hope the investigation and proposed simple framework can +serve as a strong baseline to facilitate future studies in this field. Our code +is available at: https://github.com/CVMI-Lab/SimGCD. + +
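+
+ A sketch of one common form of entropy regularisation for a parametric
+ classifier over seen and novel classes: the entropy of the mean prediction
+ across a batch is maximised so the classifier does not collapse onto the seen
+ classes. Whether this matches the paper's exact regulariser is an assumption.
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def entropy_regularised_loss(logits, targets, labelled_mask, weight=1.0):
+     """Supervised loss on labelled samples minus a marginal-entropy bonus."""
+     probs = F.softmax(logits, dim=-1)
+     mean_p = probs.mean(dim=0)
+     marginal_entropy = -(mean_p * torch.log(mean_p + 1e-8)).sum()
+     if labelled_mask.any():
+         supervised = F.cross_entropy(logits[labelled_mask], targets[labelled_mask])
+     else:
+         supervised = logits.new_zeros(())
+     # minimising the total maximises the entropy of the mean prediction
+     return supervised - weight * marginal_entropy
+ ```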
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ From Chaos Comes Order: Ordering Event Representations for Object + Recognition and Detection ICCV 2023 + + +
+ Today, state-of-the-art deep neural networks that process events first +convert them into dense, grid-like input representations before using an +off-the-shelf network. However, selecting the appropriate representation for +the task traditionally requires training a neural network for each +representation and selecting the best one based on the validation score, which +is very time-consuming. This work eliminates this bottleneck by selecting +representations based on the Gromov-Wasserstein Discrepancy (GWD) between raw +events and their representation. It is about 200 times faster to compute than +training a neural network and preserves the task performance ranking of event +representations across multiple representations, network backbones, datasets, +and tasks. Thus finding representations with high task scores is equivalent to +finding representations with a low GWD. We use this insight to, for the first +time, perform a hyperparameter search on a large family of event +representations, revealing new and powerful representations that exceed the +state-of-the-art. Our optimized representations outperform existing +representations by 1.7 mAP on the 1 Mpx dataset and 0.3 mAP on the Gen1 +dataset, two established object detection benchmarks, and reach a 3.8% higher +classification score on the mini N-ImageNet benchmark. Moreover, we outperform +state-of-the-art by 2.1 mAP on Gen1 and state-of-the-art feed-forward methods +by 6.0 mAP on the 1 Mpx datasets. This work opens a new unexplored field of +explicit representation optimization for event-based learning. + +
+
+ comment: 15 pages, 11 figures, 2 tables, ICCV 2023 Camera Ready paper +
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the +visual and language modalities have been shown to be strong few-shot learners. +Soft prompt learning is the method of choice for few-shot downstream adaptation +aiming to bridge the modality gap caused by the distribution shift induced by +the new domain. While parameter-efficient, prompt learning still requires +access to the model weights and can be computationally infeasible for large +models with billions of parameters. To address these shortcomings, in this +work, we describe a black-box method for V-L few-shot adaptation that (a) +operates on pre-computed image and text features and hence works without access +to the model's weights, (b) it is orders of magnitude faster at training time, +(c) it is amenable to both supervised and unsupervised training, and (d) it can +be even used to align image and text features computed from uni-modal models. +To achieve this, we propose Linear Feature Alignment (LFA), a simple linear +approach for V-L re-alignment in the target domain. LFA is initialized from a +closed-form solution to a least-squares problem and then it is iteratively +updated by minimizing a re-ranking loss. Despite its simplicity, our approach +can even surpass soft-prompt learning methods as shown by extensive experiments +on 11 image and 2 video datasets. + +
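+
+ A minimal sketch of the closed-form initialisation: a linear map that
+ re-aligns pre-computed image features with their paired text features is
+ obtained from a least-squares problem. The iterative re-ranking refinement is
+ omitted, and the feature dimensions and sample count are arbitrary.
+
+ ```python
+ import torch
+
+ def lfa_init(image_feats, text_feats):
+     """Solve min_W ||image_feats @ W - text_feats||_F in closed form."""
+     return torch.linalg.lstsq(image_feats, text_feats).solution
+
+ image_feats = torch.randn(640, 512)   # pre-computed image features
+ text_feats = torch.randn(640, 512)    # paired pre-computed text features
+
+ W = lfa_init(image_feats, text_feats)
+ aligned = image_feats @ W             # image features re-aligned to the text space
+ ```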
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ GRAF: Graph Attention-aware Fusion Networks + + +
+ A large number of real-world networks include multiple types of nodes and +edges. Graph Neural Network (GNN) emerged as a deep learning framework to +generate node and graph embeddings for downstream machine learning tasks. +However, popular GNN-based architectures operate on single homogeneous +networks. Enabling them to work on multiple networks brings additional +challenges due to the heterogeneity of the networks and the multiplicity of the +existing associations. In this study, we present a computational approach named +GRAF (Graph Attention-aware Fusion Networks) utilizing GNN-based approaches on +multiple networks with the help of attention mechanisms and network fusion. +Using attention-based neighborhood aggregation, GRAF learns the importance of +each neighbor per node (called node-level attention) followed by the importance +of association (called association-level attention). Then, GRAF processes a +network fusion step weighing each edge according to learned node- and +association-level attentions. Considering that the fused network could be a +highly dense network with many weak edges depending on the given input +networks, we included an edge elimination step with respect to edges' weights. +Finally, GRAF utilizes Graph Convolutional Network (GCN) on the fused network +and incorporates node features on graph-structured data for a node +classification or a similar downstream task. To demonstrate GRAF's +generalizability, we applied it to four datasets from different domains and +observed that GRAF outperformed or was on par with the baselines, +state-of-the-art methods, and its own variations for each node classification +task. Source code for our tool is publicly available at +https://github.com/bozdaglab/GRAF . + +
+
+ comment: 9 pages, 7 supplemental pages, 1 figure, 6 supplemental figures +
+
+
+
+
+ + ♻ ☆ Using AI to Measure Parkinson's Disease Severity at Home + + +
+ We present an artificial intelligence system to remotely assess the motor +performance of individuals with Parkinson's disease (PD). Participants +performed a motor task (i.e., tapping fingers) in front of a webcam, and data +from 250 global participants were rated by three expert neurologists following +the Movement Disorder Society Unified Parkinson's Disease Rating Scale +(MDS-UPDRS). The neurologists' ratings were highly reliable, with an +intra-class correlation coefficient (ICC) of 0.88. We developed computer +algorithms to obtain objective measurements that align with the MDS-UPDRS +guideline and are strongly correlated with the neurologists' ratings. Our +machine learning model trained on these measures outperformed an MDS-UPDRS +certified rater, with a mean absolute error (MAE) of 0.59 compared to the +rater's MAE of 0.79. However, the model performed slightly worse than the +expert neurologists (0.53 MAE). The methodology can be replicated for similar +motor tasks, providing the possibility of evaluating individuals with PD and +other movement disorders remotely, objectively, and in areas with limited +access to neurological care. + +
+
+
+
+
+ + ♻ ☆ Change is Hard: A Closer Look at Subpopulation Shift ICML 2023 + + +
+ Machine learning models often perform poorly on subgroups that are +underrepresented in the training data. Yet, little is understood on the +variation in mechanisms that cause subpopulation shifts, and how algorithms +generalize across such diverse shifts at scale. In this work, we provide a +fine-grained analysis of subpopulation shift. We first propose a unified +framework that dissects and explains common shifts in subgroups. We then +establish a comprehensive benchmark of 20 state-of-the-art algorithms evaluated +on 12 real-world datasets in vision, language, and healthcare domains. With +results obtained from training over 10,000 models, we reveal intriguing +observations for future progress in this space. First, existing algorithms only +improve subgroup robustness over certain types of shifts but not others. +Moreover, while current algorithms rely on group-annotated validation data for +model selection, we find that a simple selection criterion based on worst-class +accuracy is surprisingly effective even without any group information. Finally, +unlike existing works that solely aim to improve worst-group accuracy (WGA), we +demonstrate the fundamental tradeoff between WGA and other important metrics, +highlighting the need to carefully choose testing metrics. Code and data are +available at: https://github.com/YyzHarry/SubpopBench. + +
+
+ comment: ICML 2023 +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Bridging High-Quality Audio and Video via Language for Sound Effects + Retrieval from Visual Queries SP + + +
+ Finding the right sound effects (SFX) to match moments in a video is a +difficult and time-consuming task, and relies heavily on the quality and +completeness of text metadata. Retrieving high-quality (HQ) SFX using a video +frame directly as the query is an attractive alternative, removing the reliance +on text metadata and providing a low barrier to entry for non-experts. Due to +the lack of HQ audio-visual training data, previous work on audio-visual +retrieval relies on YouTube (in-the-wild) videos of varied quality for +training, where the audio is often noisy and the video of amateur quality. As +such it is unclear whether these systems would generalize to the task of +matching HQ audio to production-quality video. To address this, we propose a +multimodal framework for recommending HQ SFX given a video frame by (1) +leveraging large language models and foundational vision-language models to +bridge HQ audio and video to create audio-visual pairs, resulting in a highly +scalable automatic audio-visual data curation pipeline; and (2) using +pre-trained audio and visual encoders to train a contrastive learning-based +retrieval system. We show that our system, trained using our automatic data +curation pipeline, significantly outperforms baselines trained on in-the-wild +data on the task of HQ SFX retrieval for video. Furthermore, while the +baselines fail to generalize to this task, our system generalizes well from +clean to in-the-wild data, outperforming the baselines on a dataset of YouTube +videos despite only being trained on the HQ audio-visual pairs. A user study +confirms that people prefer SFX retrieved by our system over the baseline 67% +of the time both for HQ and in-the-wild data. Finally, we present ablations to +determine the impact of model and data pipeline design choices on downstream +retrieval performance. Please visit our project website to listen to and view +our SFX retrieval results. + +
+
+ comment: WASPAA 2023. Project page: + https://juliawilkins.github.io/sound-effects-retrieval-from-video/. 4 pages, + 2 figures, 2 tables +
+
+
+
+
+ + ☆ Dynamic Kernel-Based Adaptive Spatial Aggregation for Learned Image + Compression + + +
+ Learned image compression methods have shown superior rate-distortion +performance and remarkable potential compared to traditional compression +methods. Most existing learned approaches use stacked convolution or +window-based self-attention for transform coding, which aggregate spatial +information in a fixed range. In this paper, we focus on extending spatial +aggregation capability and propose a dynamic kernel-based transform coding. The +proposed adaptive aggregation generates kernel offsets to capture valid +information in the content-conditioned range to help transform. With the +adaptive aggregation strategy and the sharing weights mechanism, our method can +achieve promising transform capability with acceptable model complexity. +Besides, according to the recent progress of entropy model, we define a +generalized coarse-to-fine entropy model, considering the coarse global +context, the channel-wise, and the spatial context. Based on it, we introduce +dynamic kernel in hyper-prior to generate more expressive global context. +Furthermore, we propose an asymmetric spatial-channel entropy model according +to the investigation of the spatial characteristics of the grouped latents. The +asymmetric entropy model aims to reduce statistical redundancy while +maintaining coding efficiency. Experimental results demonstrate that our method +achieves superior rate-distortion performance on three benchmarks compared to +the state-of-the-art learning-based methods. + +
+
+
+
+
+ + ♻ ☆ Towards General Low-Light Raw Noise Synthesis and Modeling ICCV 2023 + + +
+ Modeling and synthesizing low-light raw noise is a fundamental problem for +computational photography and image processing applications. Although most +recent works have adopted physics-based models to synthesize noise, the +signal-independent noise in low-light conditions is far more complicated and +varies dramatically across camera sensors, which is beyond the description of +these models. To address this issue, we introduce a new perspective to +synthesize the signal-independent noise by a generative model. Specifically, we +synthesize the signal-dependent and signal-independent noise in a physics- and +learning-based manner, respectively. In this way, our method can be considered +as a general model, that is, it can simultaneously learn different noise +characteristics for different ISO levels and generalize to various sensors. +Subsequently, we present an effective multi-scale discriminator termed Fourier +transformer discriminator (FTD) to distinguish the noise distribution +accurately. Additionally, we collect a new low-light raw denoising (LRD) +dataset for training and benchmarking. Qualitative validation shows that the +noise generated by our proposed noise model can be highly similar to the real +noise in terms of distribution. Furthermore, extensive denoising experiments +demonstrate that our method performs favorably against state-of-the-art methods +on different sensors. + +
+
+ comment: 11 pages, 7 figures. Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Sparsity and Coefficient Permutation Based Two-Domain AMP for Image + Block Compressed Sensing + + +
+ The learned denoising-based approximate message passing (LDAMP) algorithm has +attracted great attention for image compressed sensing (CS) tasks. However, it +has two issues: first, its global measurement model severely restricts its +applicability to high-dimensional images, and its block-based measurement +method exhibits obvious block artifacts; second, the denoiser in the LDAMP is +too simple, and existing denoisers have limited ability in detail recovery. In +this paper, to overcome the issues and develop a high-performance LDAMP method +for image block compressed sensing (BCS), we propose a novel sparsity and +coefficient permutation-based AMP (SCP-AMP) method consisting of the +block-based sampling and the two-domain reconstruction modules. In the sampling +module, SCP-AMP adopts a discrete cosine transform (DCT) based sparsity +strategy to reduce the impact of the high-frequency coefficient on the +reconstruction, followed by a coefficient permutation strategy to avoid block +artifacts. In the reconstruction module, a two-domain AMP method with DCT +domain noise correction and pixel domain denoising is proposed for iterative +reconstruction. Regarding the denoiser, we proposed a multi-level deep +attention network (MDANet) to enhance the texture details by employing +multi-level features and multiple attention mechanisms. Extensive experiments +demonstrated that the proposed SCP-AMP method achieved better reconstruction +accuracy than other state-of-the-art BCS algorithms in terms of both visual +perception and objective metrics. + +
+
+ comment: The content has been substantially revised and corrected; the
+ authors request the withdrawal of this version
+
+
+
+
+
+ + ♻ ☆ OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion + and Infinite Data Generation + + +
+ This paper presents OmniDataComposer, an innovative approach for multimodal +data fusion and unlimited data generation with an intent to refine and +uncomplicate interplay among diverse data modalities. Coming to the core +breakthrough, it introduces a cohesive data structure proficient in processing +and merging multimodal data inputs, which include video, audio, and text. + Our crafted algorithm leverages advancements across multiple operations such +as video/image caption extraction, dense caption extraction, Automatic Speech +Recognition (ASR), Optical Character Recognition (OCR), Recognize Anything +Model(RAM), and object tracking. OmniDataComposer is capable of identifying +over 6400 categories of objects, substantially broadening the spectrum of +visual information. It amalgamates these diverse modalities, promoting +reciprocal enhancement among modalities and facilitating cross-modal data +correction. \textbf{The final output metamorphoses each video input into an +elaborate sequential document}, virtually transmuting videos into thorough +narratives, making them easier to be processed by large language models. + Future prospects include optimizing datasets for each modality to encourage +unlimited data generation. This robust base will offer priceless insights to +models like ChatGPT, enabling them to create higher quality datasets for video +captioning and easing question-answering tasks based on video content. +OmniDataComposer inaugurates a new stage in multimodal learning, imparting +enormous potential for augmenting AI's understanding and generation of complex, +real-world data. + +
+
+
+
+
+ + ♻ ☆ DEEPCHORUS: A Hybrid Model of Multi-scale Convolution and Self-attention + for Chorus Detection ICASSP 2022 + + +
+ Chorus detection is a challenging problem in musical signal processing, as
+the chorus often repeats more than once in popular songs, usually with rich
+instrumentation and complex rhythmic forms. Most existing works characterize
+chorus sections through explicit features such as loudness and occurrence
+frequency. These prior assumptions about the chorus limit the generalization
+capacity of such methods and cause misdetection of other repeated sections,
+such as the verse. To solve this problem, in this paper we propose an
+end-to-end chorus detection model, DeepChorus, which reduces the engineering
+effort and the need for prior knowledge. The proposed model includes two main
+structures: i) a Multi-Scale Network to derive preliminary representations of
+chorus segments, and ii) a Self-Attention Convolution Network to further
+process the features into probability curves representing chorus presence. To
+obtain the final results, we apply an adaptive threshold to binarize the
+original curve. The experimental results show that DeepChorus outperforms
+existing state-of-the-art methods in most cases.
+
+
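+ The final binarization step can be illustrated with a short sketch. The
+specific adaptive rule (mean plus a fraction of the standard deviation) and
+the minimum segment length are assumptions; the paper only states that an
+adaptive threshold is applied to the probability curve.
+<pre><code>
+import numpy as np
+
+def binarize_chorus_curve(prob, alpha=0.5, min_len=10):
+    """prob: per-frame chorus probabilities. Returns a boolean chorus mask."""
+    thr = prob.mean() + alpha * prob.std()   # curve-dependent threshold
+    mask = prob >= thr
+    out, start = mask.copy(), None
+    for i, v in enumerate(np.append(mask, False)):   # drop very short segments
+        if v and start is None:
+            start = i
+        elif not v and start is not None:
+            if i - start < min_len:
+                out[start:i] = False
+            start = None
+    return out
+</code></pre>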
+
+ comment: Accepted by ICASSP 2022 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 53 + +
+
+
+ + ☆ Time Travel in LLMs: Tracing Data Contamination in Large Language Models + + +
+ Data contamination, i.e., the presence of test data from downstream tasks in +the training data of large language models (LLMs), is a potential major issue +in understanding LLMs' effectiveness on other tasks. We propose a +straightforward yet effective method for identifying data contamination within +LLMs. At its core, our approach starts by identifying potential contamination +in individual instances that are drawn from a small random sample; using this +information, our approach then assesses if an entire dataset partition is +contaminated. To estimate contamination of individual instances, we employ +"guided instruction:" a prompt consisting of the dataset name, partition type, +and the initial segment of a reference instance, asking the LLM to complete it. +An instance is flagged as contaminated if the LLM's output either exactly or +closely matches the latter segment of the reference. To understand if an entire +partition is contaminated, we propose two ideas. The first idea marks a dataset +partition as contaminated if the average overlap score with the reference +instances (as measured by ROUGE or BLEURT) is statistically significantly +better with the guided instruction vs. a general instruction that does not +include the dataset and partition name. The second idea marks a dataset as +contaminated if a classifier based on GPT-4 with in-context learning prompting +marks multiple instances as contaminated. Our best method achieves an accuracy +between 92% and 100% in detecting if an LLM is contaminated with seven +datasets, containing train and test/validation partitions, when contrasted with +manual evaluation by human expert. Further, our findings indicate that GPT-4 is +contaminated with AG News, WNLI, and XSum datasets. + +
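+ To make the instance-level procedure concrete, the sketch below builds a
+guided and a general prompt, completes both with an assumed text-completion
+callable llm, and compares their overlap with the reference continuation. The
+prompt wording, the simple unigram-overlap score (standing in for ROUGE or
+BLEURT) and the fixed margin (standing in for the significance test) are all
+simplifying assumptions.
+<pre><code>
+def guided_prompt(dataset, split, first_piece):
+    # names the dataset and partition, then asks for an exact completion
+    return (f"Complete the following instance from the {split} split of the "
+            f"{dataset} dataset exactly as it appears there:\n{first_piece}")
+
+def general_prompt(first_piece):
+    return f"Complete the following text:\n{first_piece}"
+
+def overlap(candidate, reference):
+    c, r = set(candidate.lower().split()), set(reference.lower().split())
+    return len(c & r) / max(len(r), 1)
+
+def partition_flagged(llm, instances, dataset, split, margin=0.05):
+    """instances: list of (first_piece, reference_continuation) pairs."""
+    guided, general = [], []
+    for first, rest in instances:
+        guided.append(overlap(llm(guided_prompt(dataset, split, first)), rest))
+        general.append(overlap(llm(general_prompt(first)), rest))
+    return sum(guided) / len(guided) - sum(general) / len(general) > margin
+</code></pre>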
+
+ comment: v1 preprint +
+
+
+
+
+ + ☆ Mitigating the Exposure Bias in Sentence-Level Grapheme-to-Phoneme (G2P) + Transduction INTERSPEECH 2023 + + +
+ Text-to-Text Transfer Transformer (T5) has recently been considered for the +Grapheme-to-Phoneme (G2P) transduction. As a follow-up, a tokenizer-free +byte-level model based on T5 referred to as ByT5, recently gave promising +results on word-level G2P conversion by representing each input character with +its corresponding UTF-8 encoding. Although it is generally understood that +sentence-level or paragraph-level G2P can improve usability in real-world +applications as it is better suited to perform on heteronyms and linking sounds +between words, we find that using ByT5 for these scenarios is nontrivial. Since +ByT5 operates on the character level, it requires longer decoding steps, which +deteriorates the performance due to the exposure bias commonly observed in +auto-regressive generation models. This paper shows that the performance of +sentence-level and paragraph-level G2P can be improved by mitigating such +exposure bias using our proposed loss-based sampling method. + +
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value + Extraction CIKM 2023 + + +
+ Existing attribute-value extraction (AVE) models require large quantities of +labeled data for training. However, new products with new attribute-value pairs +enter the market every day in real-world e-Commerce. Thus, we formulate AVE in +multi-label few-shot learning (FSL), aiming to extract unseen attribute value +pairs based on a small number of training examples. We propose a +Knowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks, +leveraging the generated label description and category information to learn +more discriminative prototypes. Besides, KEAF integrates with hybrid attention +to reduce noise and capture more informative semantics for each class by +calculating the label-relevant and query-related weights. To achieve +multi-label inference, KEAF further learns a dynamic threshold by integrating +the semantic information from both the support set and the query set. Extensive +experiments with ablation studies conducted on two datasets demonstrate that +KEAF outperforms other SOTA models for information extraction in FSL. The code +can be found at: https://github.com/gjiaying/KEAF + +
+
+ comment: 6 pages, 2 figures, published in CIKM 2023 +
+
+
+
+
+ + ☆ Advancing continual lifelong learning in neural information retrieval: + definition, dataset, framework, and empirical evaluation + + +
+ Continual learning refers to the capability of a machine learning model to +learn and adapt to new information, without compromising its performance on +previously learned tasks. Although several studies have investigated continual +learning methods for information retrieval tasks, a well-defined task +formulation is still lacking, and it is unclear how typical learning strategies +perform in this context. To address this challenge, a systematic task +formulation of continual neural information retrieval is presented, along with +a multiple-topic dataset that simulates continuous information retrieval. A +comprehensive continual neural information retrieval framework consisting of +typical retrieval models and continual learning strategies is then proposed. +Empirical evaluations illustrate that the proposed framework can successfully +prevent catastrophic forgetting in neural information retrieval and enhance +performance on previously learned tasks. The results indicate that +embedding-based retrieval models experience a decline in their continual +learning performance as the topic shift distance and dataset volume of new +tasks increase. In contrast, pretraining-based models do not show any such +correlation. Adopting suitable learning strategies can mitigate the effects of +topic shift and data augmentation. + +
+
+ comment: Submitted to Information Sciences +
+
+
+
+
+ + ☆ SummHelper: Collaborative Human-Computer Summarization + + +
+ Current approaches for text summarization are predominantly automatic, with +rather limited space for human intervention and control over the process. In +this paper, we introduce SummHelper, a 2-phase summarization assistant designed +to foster human-machine collaboration. The initial phase involves content +selection, where the system recommends potential content, allowing users to +accept, modify, or introduce additional selections. The subsequent phase, +content consolidation, involves SummHelper generating a coherent summary from +these selections, which users can then refine using visual mappings between the +summary and the source text. Small-scale user studies reveal the effectiveness +of our application, with participants being especially appreciative of the +balance between automated guidance and opportunities for personal input. + +
+
+ comment: Demo paper +
+
+
+
+
+ + ☆ Detoxify Language Model Step-by-Step + + +
+ Detoxification for LLMs is challenging since it requires models to avoid +generating harmful content while maintaining the generation capability. To +ensure the safety of generations, previous detoxification methods detoxify the +models by changing the data distributions or constraining the generations from +different aspects in a single-step manner. However, these approaches will +dramatically affect the generation quality of LLMs, e.g., discourse coherence +and semantic consistency, since language models tend to generate along the +toxic prompt while detoxification methods work in the opposite direction. To +handle such a conflict, we decompose the detoxification process into different +sub-steps, where the detoxification is concentrated in the input stage and the +subsequent continual generation is based on the non-toxic prompt. Besides, we +also calibrate the strong reasoning ability of LLMs by designing a Detox-Chain +to connect the above sub-steps in an orderly manner, which allows LLMs to +detoxify the text step-by-step. Automatic and human evaluation on two +benchmarks reveals that by training with Detox-Chain, six LLMs scaling from 1B +to 33B can obtain significant detoxification and generation improvement. Our +code and data are available at https://github.com/CODINNLG/Detox-CoT. Warning: +examples in the paper may contain uncensored offensive content. + +
+
+
+
+
+ + ☆ Pre-training with Large Language Model-based Document Expansion for + Dense Passage Retrieval + + +
+ In this paper, we systematically study the potential of pre-training with +Large Language Model(LLM)-based document expansion for dense passage retrieval. +Concretely, we leverage the capabilities of LLMs for document expansion, i.e. +query generation, and effectively transfer expanded knowledge to retrievers +using pre-training strategies tailored for passage retrieval. These strategies +include contrastive learning and bottlenecked query generation. Furthermore, we +incorporate a curriculum learning strategy to reduce the reliance on LLM +inferences. Experimental results demonstrate that pre-training with LLM-based +document expansion significantly boosts the retrieval performance on +large-scale web-search tasks. Our work shows strong zero-shot and out-of-domain +retrieval abilities, making it more widely applicable for retrieval when +initializing with no human-labeled data. + +
+
+ comment: 10 pages, 3 tables, 4 figures, under review +
+
+
+
+
+ + ☆ Benchmarking Neural Network Generalization for Grammar Induction + + +
+ How well do neural networks generalize? Even for grammar induction tasks, +where the target generalization is fully known, previous works have left the +question open, testing very limited ranges beyond the training set and using +different success criteria. We provide a measure of neural network +generalization based on fully specified formal languages. Given a model and a +formal grammar, the method assigns a generalization score representing how well +a model generalizes to unseen samples in inverse relation to the amount of data +it was trained on. The benchmark includes languages such as $a^nb^n$, +$a^nb^nc^n$, $a^nb^mc^{n+m}$, and Dyck-1 and 2. We evaluate selected +architectures using the benchmark and find that networks trained with a Minimum +Description Length objective (MDL) generalize better and using less data than +networks trained using standard loss functions. The benchmark is available at +https://github.com/taucompling/bliss. + +
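+ Generating evaluation data for such a benchmark is straightforward; the
+sketch below produces strings from two of the listed languages and a simple
+Dyck-1 generator, training on short strings and probing generalization on
+strictly longer ones. The particular length split shown here is an arbitrary
+choice for illustration.
+<pre><code>
+import random
+
+def an_bn(n):
+    return "a" * n + "b" * n
+
+def an_bn_cn(n):
+    return "a" * n + "b" * n + "c" * n
+
+def dyck1(max_len, rng=random.Random(0)):
+    """Random well-nested string over one bracket pair."""
+    s, depth = [], 0
+    while len(s) < max_len:
+        if depth == 0 or (rng.random() < 0.5 and len(s) + depth < max_len):
+            s.append("("); depth += 1
+        else:
+            s.append(")"); depth -= 1
+    return "".join(s + [")"] * depth)
+
+train = [an_bn(n) for n in range(1, 11)]      # seen lengths
+test = [an_bn(n) for n in range(11, 51)]      # unseen, longer strings
+</code></pre>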
+
+ comment: 10 pages, 4 figures, 2 tables. Conference: Learning with Small Data + 2023 +
+
+
+
+
+ + ☆ TEST: Text Prototype Aligned Embedding to Activate LLM's Ability for + Time Series + + +
+ This work summarizes two strategies for completing time-series (TS) tasks +using today's language model (LLM): LLM-for-TS, design and train a fundamental +large model for TS data; TS-for-LLM, enable the pre-trained LLM to handle TS +data. Considering the insufficient data accumulation, limited resources, and +semantic context requirements, this work focuses on TS-for-LLM methods, where +we aim to activate LLM's ability for TS data by designing a TS embedding method +suitable for LLM. The proposed method is named TEST. It first tokenizes TS, +builds an encoder to embed them by instance-wise, feature-wise, and +text-prototype-aligned contrast, and then creates prompts to make LLM more open +to embeddings, and finally implements TS tasks. Experiments are carried out on +TS classification and forecasting tasks using 8 LLMs with different structures +and sizes. Although its results cannot significantly outperform the current +SOTA models customized for TS tasks, by treating LLM as the pattern machine, it +can endow LLM's ability to process TS data without compromising the language +ability. This paper is intended to serve as a foundational work that will +inspire further research. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ MemoChat: Tuning LLMs to Use Memos for Consistent Long-Range Open-Domain + Conversation + + +
+ We propose MemoChat, a pipeline for refining instructions that enables large +language models (LLMs) to effectively employ self-composed memos for +maintaining consistent long-range open-domain conversations. We demonstrate a +long-range open-domain conversation through iterative +"memorization-retrieval-response" cycles. This requires us to carefully design +tailored tuning instructions for each distinct stage. The instructions are +reconstructed from a collection of public datasets to teach the LLMs to +memorize and retrieve past dialogues with structured memos, leading to enhanced +consistency when participating in future conversations. We invite experts to +manually annotate a test set designed to evaluate the consistency of long-range +conversations questions. Experiments on three testing scenarios involving both +open-source and API-accessible chatbots at scale verify the efficacy of +MemoChat, which outperforms strong baselines. + +
+
+ comment: Codes, data and models will be available soon +
+
+
+
+
+ + ☆ Challenges and Opportunities of Using Transformer-Based Multi-Task + Learning in NLP Through ML Lifecycle: A Survey + + +
+ The increasing adoption of natural language processing (NLP) models across +industries has led to practitioners' need for machine learning systems to +handle these models efficiently, from training to serving them in production. +However, training, deploying, and updating multiple models can be complex, +costly, and time-consuming, mainly when using transformer-based pre-trained +language models. Multi-Task Learning (MTL) has emerged as a promising approach +to improve efficiency and performance through joint training, rather than +training separate models. Motivated by this, we first provide an overview of +transformer-based MTL approaches in NLP. Then, we discuss the challenges and +opportunities of using MTL approaches throughout typical ML lifecycle phases, +specifically focusing on the challenges related to data engineering, model +development, deployment, and monitoring phases. This survey focuses on +transformer-based MTL architectures and, to the best of our knowledge, is novel +in that it systematically analyses how transformer-based MTL in NLP fits into +ML lifecycle phases. Furthermore, we motivate research on the connection +between MTL and continual learning (CL), as this area remains unexplored. We +believe it would be practical to have a model that can handle both MTL and CL, +as this would make it easier to periodically re-train the model, update it due +to distribution shifts, and add new capabilities to meet real-world +requirements. + +
+
+
+
+
+ + ☆ MoCoSA: Momentum Contrast for Knowledge Graph Completion with + Structure-Augmented Pre-trained Language Models + + +
+ Knowledge Graph Completion (KGC) aims to conduct reasoning on the facts
+within knowledge graphs and automatically infer missing links. Existing
+methods can mainly be categorized into structure-based and description-based
+approaches. On the one hand, structure-based methods effectively represent
+relational facts in knowledge graphs using entity embeddings. However, they
+struggle with semantically rich real-world entities due to limited structural
+information and fail to generalize to unseen entities. On the other hand,
+description-based methods leverage pre-trained language models (PLMs) to
+understand textual information. They exhibit strong robustness towards unseen
+entities. However, they have difficulty with large-scale negative sampling and
+often lag behind structure-based methods. To address these issues, in this
+paper, we propose Momentum Contrast for knowledge graph completion with
+Structure-Augmented pre-trained language models (MoCoSA), which allows the PLM
+to perceive structural information through an adaptable structure encoder. To
+improve learning efficiency, we propose momentum hard negative sampling and
+intra-relation negative sampling. Experimental results demonstrate that our
+approach achieves state-of-the-art performance in terms of mean reciprocal
+rank (MRR), with improvements of 2.5% on WN18RR and 21% on OpenBG500.
+
+
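+ The momentum-contrast machinery the method builds on can be summarized in a
+few lines: the key encoder is an exponential moving average of the query
+encoder, and an InfoNCE loss contrasts each query against one positive and a
+queue of negatives. This is the generic MoCo recipe, shown only to make the
+training signal concrete; MoCoSA's specific encoders and negative-sampling
+schemes are not reproduced here.
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+@torch.no_grad()
+def momentum_update(encoder_q, encoder_k, m=0.999):
+    """Key encoder parameters follow the query encoder as an EMA."""
+    for p_q, p_k in zip(encoder_q.parameters(), encoder_k.parameters()):
+        p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)
+
+def info_nce(q, k_pos, queue, temperature=0.05):
+    """q, k_pos: (B, D) query/positive embeddings; queue: (K, D) negatives."""
+    q, k_pos, queue = (F.normalize(t, dim=-1) for t in (q, k_pos, queue))
+    l_pos = (q * k_pos).sum(-1, keepdim=True)     # (B, 1)
+    l_neg = q @ queue.t()                         # (B, K)
+    logits = torch.cat([l_pos, l_neg], dim=1) / temperature
+    labels = torch.zeros(q.size(0), dtype=torch.long)  # positive is index 0
+    return F.cross_entropy(logits, labels)
+</code></pre>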
+
+
+
+
+ + ☆ ChinaTelecom System Description to VoxCeleb Speaker Recognition + Challenge 2023 + + +
+ This technical report describes ChinaTelecom system for Track 1 (closed) of +the VoxCeleb2023 Speaker Recognition Challenge (VoxSRC 2023). Our system +consists of several ResNet variants trained only on VoxCeleb2, which were fused +for better performance later. Score calibration was also applied for each +variant and the fused system. The final submission achieved minDCF of 0.1066 +and EER of 1.980%. + +
+
+ comment: System description of VoxSRC 2023 +
+
+
+
+
+ + ☆ RSpell: Retrieval-augmented Framework for Domain Adaptive Chinese + Spelling Check + + +
+ Chinese Spelling Check (CSC) refers to the detection and correction of +spelling errors in Chinese texts. In practical application scenarios, it is +important to make CSC models have the ability to correct errors across +different domains. In this paper, we propose a retrieval-augmented spelling +check framework called RSpell, which searches corresponding domain terms and +incorporates them into CSC models. Specifically, we employ pinyin fuzzy +matching to search for terms, which are combined with the input and fed into +the CSC model. Then, we introduce an adaptive process control mechanism to +dynamically adjust the impact of external knowledge on the model. Additionally, +we develop an iterative strategy for the RSpell framework to enhance reasoning +capabilities. We conducted experiments on CSC datasets in three domains: law, +medicine, and official document writing. The results demonstrate that RSpell +achieves state-of-the-art performance in both zero-shot and fine-tuning +scenarios, demonstrating the effectiveness of the retrieval-augmented CSC +framework. Our code is available at https://github.com/47777777/Rspell. + +
+
+
+
+
+ + ☆ Enhancing Performance on Seen and Unseen Dialogue Scenarios using + Retrieval-Augmented End-to-End Task-Oriented System SIGDIAL 2023 + + +
+ End-to-end task-oriented dialogue (TOD) systems have achieved promising +performance by leveraging sophisticated natural language understanding and +natural language generation capabilities of pre-trained models. This work +enables the TOD systems with more flexibility through a simple cache. The cache +provides the flexibility to dynamically update the TOD systems and handle both +existing and unseen dialogue scenarios. Towards this end, we first fine-tune a +retrieval module to effectively retrieve the most relevant information entries +from the cache. We then train end-to-end TOD models that can refer to and +ground on both dialogue history and retrieved information during TOD +generation. The cache is straightforward to construct, and the backbone models +of TOD systems are compatible with existing pre-trained generative models. +Extensive experiments demonstrate the superior performance of our framework, +with a notable improvement in non-empty joint goal accuracy by 6.7% compared to +strong baselines. + +
+
+ comment: Accepted by SIGDIAL 2023 as a long paper +
+
+
+
+
+ + ☆ Sarcasm Detection in a Disaster Context + + +
+ During natural disasters, people often use social media platforms such as +Twitter to ask for help, to provide information about the disaster situation, +or to express contempt about the unfolding event or public policies and +guidelines. This contempt is in some cases expressed as sarcasm or irony. +Understanding this form of speech in a disaster-centric context is essential to +improving natural language understanding of disaster-related tweets. In this +paper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for +intended sarcasm, and provide a comprehensive investigation of sarcasm +detection using pre-trained language models. Our best model is able to obtain +as much as 0.70 F1 on our dataset. We also demonstrate that the performance on +HurricaneSARC can be improved by leveraging intermediate task transfer +learning. We release our data and code at +https://github.com/tsosea2/HurricaneSarc. + +
+
+
+
+
+ + ☆ AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation + Framework + + +
+ This technical report presents AutoGen, a new framework that enables +development of LLM applications using multiple agents that can converse with +each other to solve tasks. AutoGen agents are customizable, conversable, and +seamlessly allow human participation. They can operate in various modes that +employ combinations of LLMs, human inputs, and tools. AutoGen's design offers +multiple advantages: a) it gracefully navigates the strong but imperfect +generation and reasoning abilities of these LLMs; b) it leverages human +understanding and intelligence, while providing valuable automation through +conversations between agents; c) it simplifies and unifies the implementation +of complex LLM workflows as automated agent chats. We provide many diverse +examples of how developers can easily use AutoGen to effectively solve tasks or +build applications, ranging from coding, mathematics, operations research, +entertainment, online decision-making, question answering, etc. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Fast Training of NMT Model with Data Sorting + + +
+ The Transformer model has revolutionized Natural Language Processing tasks
+such as Neural Machine Translation, and many efforts have been made to study
+the Transformer architecture, which increased its efficiency and accuracy. One
+potential area for improvement is the computation spent on padding (empty)
+tokens, which the Transformer processes only to discard later, creating an
+unnecessary computational burden. To tackle this, we propose an algorithm that
+sorts translation sentence pairs by length before batching, minimizing the
+waste of computing power. Since fully sorting the data could violate the
+independent and identically distributed (i.i.d.) assumption, we sort the data
+only partially. In experiments, we apply the proposed method to English-Korean
+and English-Luganda language pairs for machine translation and show that
+training time improves while performance is maintained. Our method is
+independent of the architecture, so it can be easily integrated into any
+training process that handles variable-length data.
+
+
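+ The idea of partial sorting can be sketched in a few lines: shuffle the
+corpus, sort only within local pools, batch, and shuffle the batch order. The
+pool size and the sort key (sum of source and target lengths) are illustrative
+choices, not values taken from the paper.
+<pre><code>
+import random
+
+def partially_sorted_batches(pairs, batch_size, pool_factor=50,
+                             rng=random.Random(0)):
+    """pairs: list of (src_tokens, tgt_tokens). Returns batches with similar
+    lengths (little padding) while keeping the stream close to i.i.d."""
+    pairs = pairs[:]
+    rng.shuffle(pairs)
+    pool = pool_factor * batch_size
+    batches = []
+    for i in range(0, len(pairs), pool):
+        chunk = sorted(pairs[i:i + pool], key=lambda p: len(p[0]) + len(p[1]))
+        batches += [chunk[j:j + batch_size]
+                    for j in range(0, len(chunk), batch_size)]
+    rng.shuffle(batches)
+    return batches
+</code></pre>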
+
+
+
+
+ + ☆ MDDial: A Multi-turn Differential Diagnosis Dialogue Dataset with + Reliability Evaluation + + +
+ Dialogue systems for Automatic Differential Diagnosis (ADD) have a wide range
+of real-life applications. These dialogue systems are promising for providing
+easy access and reducing medical costs. Building end-to-end ADD dialogue
+systems requires dialogue training datasets. However, to the best of our
+knowledge, there is no publicly available ADD dialogue dataset in English
+(although non-English datasets exist). Driven by this, we introduce MDDial,
+the first differential diagnosis dialogue dataset in English, which can aid in
+building and evaluating end-to-end ADD dialogue systems. Additionally, earlier
+studies report the accuracy of diagnosis and symptoms either individually or
+as a combined weighted score. This method overlooks the connection between the
+symptoms and the diagnosis. We introduce a unified score for ADD systems that
+takes into account the interplay between symptoms and diagnosis. This score
+also indicates the system's reliability. To this end, we train two moderately
+sized language models on MDDial. Our experiments suggest that while these
+language models can perform well on many natural language understanding tasks,
+including dialogue tasks in the general domain, they struggle to relate
+relevant symptoms to diseases and thus perform poorly on MDDial. MDDial will
+be released publicly to aid ADD dialogue research.
+
+
+
+
+
+
+ + ☆ Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals + + +
+ Millimeter wave (mmWave) based speech recognition provides more possibility +for audio-related applications, such as conference speech transcription and +eavesdropping. However, considering the practicality in real scenarios, latency +and recognizable vocabulary size are two critical factors that cannot be +overlooked. In this paper, we propose Radio2Text, the first mmWave-based system +for streaming automatic speech recognition (ASR) with a vocabulary size +exceeding 13,000 words. Radio2Text is based on a tailored streaming Transformer +that is capable of effectively learning representations of speech-related +features, paving the way for streaming ASR with a large vocabulary. To +alleviate the deficiency of streaming networks unable to access entire future +inputs, we propose the Guidance Initialization that facilitates the transfer of +feature knowledge related to the global context from the non-streaming +Transformer to the tailored streaming Transformer through weight inheritance. +Further, we propose a cross-modal structure based on knowledge distillation +(KD), named cross-modal KD, to mitigate the negative effect of low quality +mmWave signals on recognition performance. In the cross-modal KD, the audio +streaming Transformer provides feature and response guidance that inherit +fruitful and accurate speech information to supervise the training of the +tailored radio streaming Transformer. The experimental results show that our +Radio2Text can achieve a character error rate of 5.7% and a word error rate of +9.4% for the recognition of a vocabulary consisting of over 13,000 words. + +
+
+ comment: Accepted by Proceedings of the ACM on Interactive, Mobile, Wearable + and Ubiquitous Technologies (ACM IMWUT/UbiComp 2023) +
+
+
+
+
+ + ☆ Separate the Wheat from the Chaff: Model Deficiency Unlearning via + Parameter-Efficient Module Operation + + +
+ Large language models (LLMs) have been widely used in various applications
+but are known to suffer from issues related to untruthfulness and toxicity.
+While parameter-efficient modules (PEMs) have demonstrated their effectiveness
+in equipping models with new skills, leveraging PEMs for deficiency unlearning
+remains underexplored. In this work, we propose a PEM operation approach,
+namely Extraction-before-Subtraction (Ext-Sub), to enhance the truthfulness
+and detoxification of LLMs through the integration of an ``expert'' PEM and an
+``anti-expert'' PEM. Remarkably, even the anti-expert PEM possesses valuable
+capabilities, since its proficiency in generating fabricated content requires
+language modeling and logical narrative competence. Rather than merely
+negating the parameters, our approach extracts and eliminates only the
+deficiency capability within the anti-expert PEM while preserving its general
+capabilities. To evaluate the effectiveness of our approach in terms of
+truthfulness and detoxification, we conduct extensive experiments on LLMs,
+encompassing additional abilities such as language modeling and mathematical
+reasoning. Our empirical results demonstrate that our approach effectively
+improves truthfulness and detoxification, while largely preserving the
+fundamental abilities of LLMs.
+
+
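+ For intuition, the parameter-space operation being refined can be contrasted
+with plain task-arithmetic negation, sketched below: add the expert delta and
+subtract the anti-expert delta. Ext-Sub differs in that it first extracts only
+the deficiency component of the anti-expert delta before removing it; that
+extraction step is deliberately omitted here.
+<pre><code>
+import torch
+
+def merge_with_negation(base_state, expert_state, anti_expert_state, lam=1.0):
+    """Plain delta arithmetic over state_dicts (the naive baseline, not Ext-Sub)."""
+    merged = {}
+    for name, w in base_state.items():
+        d_expert = expert_state[name] - w          # what the expert PEM added
+        d_anti = anti_expert_state[name] - w       # what the anti-expert PEM added
+        merged[name] = w + lam * (d_expert - d_anti)
+    return merged
+</code></pre>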
+
+
+
+
+ + ☆ Lightweight Adaptation of Neural Language Models via Subspace Embedding CIKM 2023 + + +
+ Traditional neural word embeddings usually depend on a rich and diverse
+vocabulary. However, language models devote a large share of their parameters
+to covering this vocabulary through the word embedding matrix; this is
+particularly true for multilingual language models, whose embeddings account
+for a significant portion of the overall learned parameters. In this work, we
+present a new compact embedding structure that reduces the memory footprint of
+pre-trained language models at a cost of up to 4% absolute accuracy. The
+embedding vectors are reconstructed from a set of subspace embeddings and an
+assignment procedure based on the contextual relationships among tokens in
+pre-trained language models. We calibrate the subspace embedding structure to
+masked language models and evaluate our compact embedding structure on
+similarity, textual entailment, sentence, and paraphrase tasks. Our
+experimental evaluation shows that the subspace embeddings achieve compression
+rates beyond 99.8% in comparison with the original embeddings for the language
+models on the XNLI and GLUE benchmark suites.
+
+
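+ A product-quantization-style sketch conveys the flavour of such a compact
+structure: each embedding is split into a few subspaces, and every slice is
+replaced by the index of its nearest codeword in a small per-subspace
+codebook. Note that the assignment here is a plain k-means over the vectors
+themselves, whereas the paper derives assignments from contextual
+relationships in the pre-trained model.
+<pre><code>
+import numpy as np
+from scipy.cluster.vq import kmeans2
+
+def compress_embeddings(E, n_subspaces=4, n_codes=256, seed=0):
+    """E: (V, D) embedding matrix. Returns per-subspace codebooks and codes."""
+    V, D = E.shape
+    d = D // n_subspaces
+    codebooks, codes = [], np.zeros((V, n_subspaces), dtype=np.int32)
+    for s in range(n_subspaces):
+        X = E[:, s * d:(s + 1) * d].astype(np.float64)
+        centroids, labels = kmeans2(X, n_codes, minit="points", seed=seed)
+        codebooks.append(centroids)
+        codes[:, s] = labels
+    return codebooks, codes
+
+def reconstruct(codebooks, codes):
+    return np.concatenate([cb[codes[:, s]] for s, cb in enumerate(codebooks)],
+                          axis=1)
+</code></pre>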
+
+ comment: 5 pages, Accepted as a Main Conference Short Paper at CIKM 2023 +
+
+
+
+
+ + ☆ Answering Ambiguous Questions with a Database of Questions, Answers, and + Revisions + + +
+ Many open-domain questions are under-specified and thus have multiple +possible answers, each of which is correct under a different interpretation of +the question. Answering such ambiguous questions is challenging, as it requires +retrieving and then reasoning about diverse information from multiple passages. +We present a new state-of-the-art for answering ambiguous questions that +exploits a database of unambiguous questions generated from Wikipedia. On the +challenging ASQA benchmark, which requires generating long-form answers that +summarize the multiple answers to an ambiguous question, our method improves +performance by 15% (relative improvement) on recall measures and 10% on +measures which evaluate disambiguating questions from predicted outputs. +Retrieving from the database of generated questions also gives large +improvements in diverse passage retrieval (by matching user questions q to +passages p indirectly, via questions q' generated from p). + +
+
+
+
+
+ + ☆ Large Language Models for Granularized Barrett's Esophagus Diagnosis + Classification + + +
+ Diagnostic codes for Barrett's esophagus (BE), a precursor to esophageal +cancer, lack granularity and precision for many research or clinical use cases. +Laborious manual chart review is required to extract key diagnostic phenotypes +from BE pathology reports. We developed a generalizable transformer-based +method to automate data extraction. Using pathology reports from Columbia +University Irving Medical Center with gastroenterologist-annotated targets, we +performed binary dysplasia classification as well as granularized multi-class +BE-related diagnosis classification. We utilized two clinically pre-trained +large language models, with best model performance comparable to a highly +tailored rule-based system developed using the same data. Binary dysplasia +extraction achieves 0.964 F1-score, while the multi-class model achieves 0.911 +F1-score. Our method is generalizable and faster to implement as compared to a +tailored rule-based approach. + +
+
+
+
+
+ + ☆ Learning the meanings of function words from grounded language using a + visual question answering model + + +
+ Interpreting a seemingly simple function word like "or", "behind", or "more"
+can require logical, numerical, and relational reasoning. How are such words
+learned by children? Prior acquisition theories have often relied on positing
+a foundation of innate knowledge. Yet recent neural-network-based visual
+question answering models apparently can learn to use function words as part
+of answering questions about complex visual scenes. In this paper, we study
+what these models learn about function words, in the hope of better
+understanding how the meanings of these words can be learnt by both models and
+children. We show that recurrent models trained on visually grounded language
+learn gradient semantics for function words requiring spatial and numerical
+reasoning. Furthermore, we find that these models can learn the meanings of
+the logical connectives "and" and "or" without any prior knowledge of logical
+reasoning, as well as early evidence that they can develop the ability to
+reason about alternative expressions when interpreting language. Finally, we
+show that word learning difficulty is dependent on frequency in the models'
+input. Our findings offer evidence that it is possible to learn the meanings
+of function words in a visually grounded context by using non-symbolic general
+statistical learning algorithms, without any prior knowledge of linguistic
+meaning.
+
+
+
+
+
+
+ + ☆ BIOptimus: Pre-training an Optimal Biomedical Language Model with + Curriculum Learning for Named Entity Recognition + + +
+ Using language models (LMs) pre-trained in a self-supervised setting on large
+corpora and then fine-tuning them for a downstream task has helped to deal
+with the problem of limited labeled data for supervised learning tasks such as
+Named Entity Recognition (NER). Recent research in biomedical language
+processing has offered a number of biomedical LMs pre-trained using different
+methods and techniques that advance results on many BioNLP tasks, including
+NER. However, there is still a lack of a comprehensive comparison of
+pre-training approaches that would work best in the biomedical domain. This
+paper aims to investigate different pre-training methods, such as pre-training
+the biomedical LM from scratch and pre-training it in a continued fashion. We
+compare existing methods with our proposed pre-training method of initializing
+weights for new tokens by distilling existing weights from the BERT model
+inside the context where the tokens were found. The method helps to speed up
+the pre-training stage and improve performance on NER. In addition, we compare
+how the masking rate, corruption strategy, and masking strategies impact the
+performance of the biomedical LM. Finally, using the insights from our
+experiments, we introduce a new biomedical LM (BIOptimus), which is
+pre-trained using Curriculum Learning (CL) and the contextualized weight
+distillation method. Our model sets a new state of the art on several
+biomedical Named Entity Recognition (NER) tasks. We release our code and all
+pre-trained models.
+
+
+
+
+
+
+ + ☆ Boosting Logical Reasoning in Large Language Models through a New + Framework: The Graph of Thought + + +
+ Recent advancements in large-scale models, such as GPT-4, have showcased
+remarkable capabilities in addressing standard queries. However, when facing
+complex problems that require multi-step logical reasoning, their accuracy
+dramatically decreases. Current research has explored the realm of
+\textit{prompt engineering} to bolster the inferential capacities of these
+models. Our paper unveils a pioneering prompting technique, dubbed
+\textit{Graph of Thoughts (GoT)}. Through testing on a trio of escalating
+challenges: the 24-point game, resolution of high-degree polynomial equations,
+and derivation of formulas for recursive sequences, our method outperformed
+GPT-4, achieving accuracy improvements of $89.7\%$, $86\%$, and $56\%$ for
+each respective task. Moreover, when juxtaposed with the state-of-the-art
+(SOTA) prompting method, \textit{Tree of Thought (ToT)}, our approach
+registered an average accuracy boost of $23\%$, $24\%$, and $15\%$.
+
+
+
+
+
+
+ + ☆ FootGPT : A Large Language Model Development Experiment on a Minimal + Setting + + +
+ Based on recent empirical observations, it has been argued that the most
+significant factor in developing accurate language models may be the choice of
+dataset content and training strategy, rather than the number of neural
+parameters, training duration, or dataset size. Following this argument, we
+opted to fine-tune a general-purpose causal language model with one billion
+parameters on a dataset curated from team statistics of the first ten game
+weeks of the Italian football league, using low-rank adaptation. The limited
+training dataset was compiled with a framework in which a powerful commercial
+large language model provides distilled paragraphs and question-answer pairs.
+The training duration was kept relatively short to provide a basis for our
+minimal-setting exploration. In this article, we share our key observations on
+the process of developing a special-purpose language model intended to
+interpret soccer data with constrained resources.
+
+
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ AffectEcho: Speaker Independent and Language-Agnostic Emotion and Affect + Transfer for Speech Synthesis + + +
+ Affect is an emotional characteristic encompassing valence, arousal, and
+intensity, and is a crucial attribute for enabling authentic conversations.
+While existing text-to-speech (TTS) and speech-to-speech systems rely on
+strength embedding vectors and global style tokens to capture emotions, these
+models represent emotions as a component of style or in discrete categories.
+We propose AffectEcho, an emotion translation model that uses a Vector
+Quantized codebook to model emotions within a quantized space featuring five
+levels of affect intensity, capturing complex nuances and subtle differences
+within the same emotion. The quantized emotional embeddings are implicitly
+derived from spoken speech samples, eliminating the need for one-hot vectors
+or explicit strength embeddings. Experimental results demonstrate the
+effectiveness of our approach in controlling the emotions of generated speech
+while preserving the identity, style, and emotional cadence unique to each
+speaker. We showcase the language-independent emotion modeling capability of
+the quantized emotional embeddings learned from a bilingual (English and
+Chinese) speech corpus with an emotion transfer task from a reference speech
+to a target speech. We achieve state-of-the-art results on both qualitative
+and quantitative metrics.
+
+
+
+
+
+
+ + ♻ ☆ LLM-Rec: Personalized Recommendation via Prompting Large Language Models + + +
+ We investigate various prompting strategies for enhancing personalized +recommendation performance with large language models (LLMs) through input +augmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct +prompting strategies: (1) basic prompting, (2) recommendation-driven prompting, +(3) engagement-guided prompting, and (4) recommendation-driven + +engagement-guided prompting. Our empirical experiments show that incorporating +the augmented input text generated by LLM leads to improved recommendation +performance. Recommendation-driven and engagement-guided prompting strategies +are found to elicit LLM's understanding of global and local item +characteristics. This finding highlights the importance of leveraging diverse +prompts and input augmentation techniques to enhance the recommendation +capabilities with LLMs. + +
+
+
+
+
+ + ♻ ☆ Allophant: Cross-lingual Phoneme Recognition with Articulatory + Attributes INTERSPEECH 2023 + + +
+ This paper proposes Allophant, a multilingual phoneme recognizer. It requires +only a phoneme inventory for cross-lingual transfer to a target language, +allowing for low-resource recognition. The architecture combines a +compositional phone embedding approach with individually supervised phonetic +attribute classifiers in a multi-task architecture. We also introduce +Allophoible, an extension of the PHOIBLE database. When combined with a +distance based mapping approach for grapheme-to-phoneme outputs, it allows us +to train on PHOIBLE inventories directly. By training and evaluating on 34 +languages, we found that the addition of multi-task learning improves the +model's capability of being applied to unseen phonemes and phoneme inventories. +On supervised languages we achieve phoneme error rate improvements of 11 +percentage points (pp.) compared to a baseline without multi-task learning. +Evaluation of zero-shot transfer on 84 languages yielded a decrease in PER of +2.63 pp. over the baseline. + +
+
+ comment: 5 pages, 2 figures, 2 tables, accepted to INTERSPEECH 2023; published + version +
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the +visual and language modalities have been shown to be strong few-shot learners. +Soft prompt learning is the method of choice for few-shot downstream adaption +aiming to bridge the modality gap caused by the distribution shift induced by +the new domain. While parameter-efficient, prompt learning still requires +access to the model weights and can be computationally infeasible for large +models with billions of parameters. To address these shortcomings, in this +work, we describe a black-box method for V-L few-shot adaptation that (a) +operates on pre-computed image and text features and hence works without access +to the model's weights, (b) it is orders of magnitude faster at training time, +(c) it is amenable to both supervised and unsupervised training, and (d) it can +be even used to align image and text features computed from uni-modal models. +To achieve this, we propose Linear Feature Alignment (LFA), a simple linear +approach for V-L re-alignment in the target domain. LFA is initialized from a +closed-form solution to a least-squares problem and then it is iteratively +updated by minimizing a re-ranking loss. Despite its simplicity, our approach +can even surpass soft-prompt learning methods as shown by extensive experiments +on 11 image and 2 video datasets. + +
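+ The two-stage recipe described above (closed-form least-squares
+initialization followed by iterative refinement with a re-ranking loss) can be
+sketched compactly. The margin formulation of the re-ranking loss, the
+optimizer, and the hyper-parameters below are assumptions for illustration,
+not the paper's exact objective.
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def lfa(image_feats, class_text_feats, labels, steps=100, lr=1e-2, margin=0.1):
+    """image_feats: (N, Di); class_text_feats: (C, Dt); labels: (N,) class ids."""
+    # closed-form init: least squares from image features to their class text features
+    target = class_text_feats[labels]
+    W = torch.linalg.lstsq(image_feats, target).solution    # (Di, Dt)
+    W = W.clone().requires_grad_(True)
+    opt = torch.optim.SGD([W], lr=lr)
+    for _ in range(steps):                                   # iterative refinement
+        logits = F.normalize(image_feats @ W, dim=-1) @ \
+                 F.normalize(class_text_feats, dim=-1).t()
+        pos = logits.gather(1, labels[:, None])
+        loss = (logits - pos + margin).clamp(min=0).mean()   # re-ranking (margin) loss
+        opt.zero_grad(); loss.backward(); opt.step()
+    return W.detach()
+</code></pre>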
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ LLMatic: Neural Architecture Search via Large Language Models and + Quality-Diversity Optimization + + +
+ Large Language Models (LLMs) have emerged as powerful tools capable of +accomplishing a broad spectrum of tasks. Their abilities span numerous areas, +and one area where they have made a significant impact is in the domain of code +generation. In this context, we view LLMs as mutation and crossover tools. +Meanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and +robust solutions. By merging the code-generating abilities of LLMs with the +diversity and robustness of QD solutions, we introduce LLMatic, a Neural +Architecture Search (NAS) algorithm. While LLMs struggle to conduct NAS +directly through prompts, LLMatic uses a procedural approach, leveraging QD for +prompts and network architecture to create diverse and highly performant +networks. We test LLMatic on the CIFAR-10 image classification benchmark, +demonstrating that it can produce competitive networks with just $2,000$ +searches, even without prior knowledge of the benchmark domain or exposure to +any previous top-performing models for the benchmark. + +
+
+
+
+
+ + ♻ ☆ LLM Comparative Assessment: Zero-shot NLG Evaluation through Pairwise + Comparisons using Large Language Models + + +
+ Current developments in large language models (LLMs) have enabled impressive +zero-shot capabilities across various natural language tasks. An interesting +application of these systems is in the automated assessment of natural language +generation (NLG), a highly challenging area with great practical benefit. In +this paper, we explore two options for exploiting the emergent abilities of +LLMs for zero-shot NLG assessment: absolute score prediction, and comparative +assessment which uses relative comparisons between pairs of candidates. Though +comparative assessment has not been extensively studied in NLG assessment, we +note that humans often find it more intuitive to compare two options rather +than scoring each one independently. This work examines comparative assessment +from multiple perspectives: performance compared to absolute grading; +positional biases in the prompt; and efficient ranking in terms of the number +of comparisons. We illustrate that LLM comparative assessment is a simple, +general and effective approach for NLG assessment. For moderate-sized +open-source LLMs, such as FlanT5 and Llama2-chat, comparative assessment is +superior to prompt scoring, and in many cases can achieve performance +competitive with state-of-the-art methods. Additionally, we demonstrate that +LLMs often exhibit strong positional biases when making pairwise comparisons, +and we propose debiasing methods that can further improve performance. + +
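+ A minimal sketch of comparative assessment: present each pair of candidates
+to a judge LLM in both orders (to average out positional bias) and rank by win
+counts. The prompt wording and the plain win-count ranking are assumptions;
+the paper also studies debiasing and the number of comparisons needed.
+<pre><code>
+from itertools import permutations
+
+def prefers_a(llm, context, cand_a, cand_b):
+    """llm: an assumed callable mapping a prompt string to a text response."""
+    prompt = (f"{context}\n\nResponse A: {cand_a}\nResponse B: {cand_b}\n"
+              "Which response is better? Answer 'A' or 'B'.")
+    return llm(prompt).strip().upper().startswith("A")
+
+def rank_by_wins(llm, context, candidates):
+    wins = [0] * len(candidates)
+    for i, j in permutations(range(len(candidates)), 2):   # both orderings
+        if prefers_a(llm, context, candidates[i], candidates[j]):
+            wins[i] += 1
+        else:
+            wins[j] += 1
+    return sorted(range(len(candidates)), key=lambda k: -wins[k])
+</code></pre>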
+
+ comment: 12 pages +
+
+
+
+
+ + ♻ ☆ LLM Cognitive Judgements Differ From Human + + +
+ Large Language Models (LLMs) have lately been in the spotlight for
+researchers, businesses, and consumers alike. While the linguistic
+capabilities of such models have been studied extensively, there is growing
+interest in investigating them as cognitive subjects. In the present work I
+examine GPT-3 and ChatGPT capabilities on a limited-data inductive reasoning
+task from the cognitive science literature. The results suggest that these
+models' cognitive judgements are not human-like.
+
+
+
+ comment: 7 pages, 1 figure. License changed to CC BY-NC-SA +
+
+
+
+
+ + ♻ ☆ SpecInfer: Accelerating Generative Large Language Model Serving with + Speculative Inference and Token Tree Verification + + +
+ The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them quickly and cheaply. This paper +introduces SpecInfer, an LLM serving system that accelerates generative LLM +inference with speculative inference and token tree verification. A key insight +behind Specinfer is to combine various collectively boost-tuned small language +models to jointly predict the LLM's outputs; the predictions are organized as a +token tree, whose nodes each represent a candidate token sequence. The +correctness of all candidate token sequences represented by a token tree is +verified against the LLM in parallel using a novel tree-based parallel decoding +mechanism. SpecInfer uses an LLM as a token tree verifier instead of an +incremental decoder, which significantly reduces the end-to-end latency and +computational requirement for serving generative LLMs while provably preserving +model quality. Our evaluation shows that SpecInfer outperforms existing LLM +serving systems by 1.3-2.4x for distributed LLM inference and by 2.6-3.5x for +offloading-based LLM inference, while preserving the same generative +performance. SpecInfer is publicly available at +https://github.com/flexflow/FlexFlow/tree/inference. + +
+
+
+
+
+ + ♻ ☆ T-SciQ: Teaching Multimodal Chain-of-Thought Reasoning via Large + Language Model Signals for Science Question Answering + + +
+ Large Language Models (LLMs) have recently demonstrated exceptional
+performance in various Natural Language Processing (NLP) tasks. They have also
+shown the ability to perform chain-of-thought (CoT) reasoning to solve complex
+problems. Recent studies have explored CoT reasoning in complex multimodal
+scenarios, such as the science question answering task, by fine-tuning
+multimodal models with high-quality human-annotated CoT rationales. However,
+collecting high-quality CoT rationales is usually time-consuming and costly.
+Moreover, the annotated rationales are often inaccurate because essential
+external information is missing. To address these issues, we propose a novel
+method termed \emph{T-SciQ} that aims at teaching science question answering
+with LLM signals. The T-SciQ approach generates high-quality CoT rationales as
+teaching signals and is advanced to train much smaller models to perform CoT
+reasoning in complex modalities. Additionally, we introduce a novel data
+mixing strategy to produce more effective teaching data samples by policy for
+simple and complex science question answering problems. Extensive experimental
+results show that our T-SciQ method achieves a new state-of-the-art
+performance on the ScienceQA benchmark, with an accuracy of 96.18\%. Moreover,
+our approach outperforms the most powerful fine-tuned baseline by 4.5\%.
+
+
+
+
+
+
+ + ♻ ☆ Text-only domain adaptation for end-to-end ASR using integrated + text-to-mel-spectrogram generator INTERSPEECH 2023 + + +
+ We propose an end-to-end Automatic Speech Recognition (ASR) system that can +be trained on transcribed speech data, text-only data, or a mixture of both. +The proposed model uses an integrated auxiliary block for text-based training. +This block combines a non-autoregressive multi-speaker text-to-mel-spectrogram +generator with a GAN-based enhancer to improve the spectrogram quality. The +proposed system can generate a mel-spectrogram dynamically during training. It +can be used to adapt the ASR model to a new domain by using text-only data from +this domain. We demonstrate that the proposed training method significantly +improves ASR accuracy compared to the system trained on transcribed speech +only. It also surpasses cascade TTS systems with the vocoder in the adaptation +quality and training speed. + +
+
+ comment: Accepted to INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ An interpretability framework for Similar case matching + + +
+ Similar Case Matching (SCM) plays a pivotal role in the legal system by +facilitating the efficient identification of similar cases for legal +professionals. While previous research has primarily concentrated on enhancing +the performance of SCM models, the aspect of interpretability has been +neglected. To bridge the gap, this study proposes an integrated pipeline +framework for interpretable SCM. The framework comprises four modules: judicial +feature sentence identification, case matching, feature sentence alignment, and +conflict resolution. In contrast to current SCM methods, our framework first +extracts feature sentences within a legal case that contain essential +information. Then it conducts case matching based on these extracted features. +Subsequently, our framework aligns the corresponding sentences in two legal +cases to provide evidence of similarity. In instances where the results of case +matching and feature sentence alignment exhibit conflicts, the conflict +resolution module resolves these inconsistencies. The experimental results show +the effectiveness of our proposed framework, establishing a new benchmark for +interpretable SCM. + +
+
+
+
+
+ + ♻ ☆ Editing Language Model-based Knowledge Graph Embeddings + + +
+ Recent decades have witnessed the empirical success of framing Knowledge
+Graph (KG) embeddings via language models. However, language model-based KG
+embeddings are usually deployed as static artifacts, making them difficult to
+modify after deployment without re-training. To address this issue, we propose
+a new task of editing language model-based KG embeddings in this paper. This
+task is designed to facilitate rapid, data-efficient updates to KG embeddings
+without compromising the performance of other aspects. We build four new
+datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and evaluate several
+knowledge editing baselines, demonstrating the limited ability of previous
+models to handle the proposed challenging task. We further propose a simple
+yet strong baseline dubbed KGEditor, which utilizes additional parametric
+layers of a hyper-network to edit/add facts. Our comprehensive experimental
+results reveal that KGEditor excels at updating specific facts without
+impacting the overall performance, even when faced with limited training
+resources. Code and datasets are available at
+https://github.com/zjunlp/PromptKG/tree/main/deltaKG.
+
+
+
+ comment: Work in progress and the project website is + https://zjunlp.github.io/project/KGE_Editing/ +
+
+
+
+
+ + ♻ ☆ Approximate Nearest Neighbour Phrase Mining for Contextual Speech + Recognition + + +
+ This paper presents an extension to train end-to-end Context-Aware +Transformer Transducer ( CATT ) models by using a simple, yet efficient method +of mining hard negative phrases from the latent space of the context encoder. +During training, given a reference query, we mine a number of similar phrases +using approximate nearest neighbour search. These sampled phrases are then used +as negative examples in the context list alongside random and ground truth +contextual information. By including approximate nearest neighbour phrases +(ANN-P) in the context list, we encourage the learned representation to +disambiguate between similar, but not identical, biasing phrases. This improves +biasing accuracy when there are several similar phrases in the biasing +inventory. We carry out experiments in a large-scale data regime obtaining up +to 7% relative word error rate reductions for the contextual portion of test +data. We also extend and evaluate CATT approach in streaming applications. + +
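+ The mining step can be illustrated with a nearest-neighbour search over the
+context encoder's phrase embeddings. For brevity the sketch uses an exact
+inner-product FAISS index; an IVF or HNSW index would make the search
+approximate as in the paper, and the choice of k is an illustrative
+assumption.
+<pre><code>
+import faiss
+import numpy as np
+
+def mine_hard_negative_phrases(phrase_embs, query_emb, k=20, exclude=None):
+    """phrase_embs: (P, D) embeddings of biasing phrases; query_emb: (D,)."""
+    embs = np.asarray(phrase_embs, dtype="float32").copy()
+    faiss.normalize_L2(embs)                   # cosine similarity via inner product
+    index = faiss.IndexFlatIP(embs.shape[1])
+    index.add(embs)
+    q = np.asarray([query_emb], dtype="float32").copy()
+    faiss.normalize_L2(q)
+    _, idx = index.search(q, k + 1)
+    # drop the reference phrase itself if it is in the index
+    return [int(i) for i in idx[0] if exclude is None or i != exclude][:k]
+</code></pre>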
+
+ comment: Accepted to Interspeech 2023. 5 pages, 2 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Analyzing the Limits of Self-Supervision in Handling Bias in Language EMNLP + + +
+ Prompting inputs with natural language task descriptions has emerged as a +popular mechanism to elicit reasonably accurate outputs from large-scale +generative language models with little to no in-context supervision. This also +helps gain insight into how well language models capture the semantics of a +wide range of downstream tasks purely from self-supervised pre-training on +massive corpora of unlabeled text. Such models have naturally also been exposed +to a lot of undesirable content like racist and sexist language and there is +limited work on awareness of models along these dimensions. In this paper, we +define and comprehensively evaluate how well such language models capture the +semantics of four tasks for bias: diagnosis, identification, extraction and +rephrasing. We define three broad classes of task descriptions for these tasks: +statement, question, and completion, with numerous lexical variants within each +class. We study the efficacy of prompting for each task using these classes and +the null task description across several decoding methods and few-shot +examples. Our analyses indicate that language models are capable of performing +these tasks to widely varying degrees across different bias dimensions, such as +gender and political affiliation. We believe our work is an important step +towards unbiased language models by quantifying the limits of current +self-supervision objectives at accomplishing such sociologically challenging +tasks. + +
+
+ comment: Accepted at Findings of the Conference on Empirical Methods in + Natural Language Processing (EMNLP) 2022 +
+
+
+
+
+ + ♻ ☆ Rethinking the Role of Scale for In-Context Learning: An + Interpretability-based Case Study at 66 Billion Scale ACL + + +
+ Language models have been shown to perform better with an increase in scale +on a wide variety of tasks via the in-context learning paradigm. In this paper, +we investigate the hypothesis that the ability of a large language model to +in-context learn-perform a task is not uniformly spread across all of its +underlying components. Using a 66 billion parameter language model (OPT-66B) +across a diverse set of 14 downstream tasks, we find this is indeed the case: +$\sim$70% of attention heads and $\sim$20% of feed forward networks can be +removed with minimal decline in task performance. We find substantial overlap +in the set of attention heads (un)important for in-context learning across +tasks and number of in-context examples. We also address our hypothesis through +a task-agnostic lens, finding that a small set of attention heads in OPT-66B +score highly on their ability to perform primitive induction operations +associated with in-context learning, namely, prefix matching and copying. These +induction heads overlap with task-specific important heads, reinforcing +arguments by Olsson et al. (arXiv:2209.11895) regarding induction head +generality to more sophisticated behaviors associated with in-context learning. +Overall, our study provides several insights that indicate large language +models may be under-trained for in-context learning and opens up questions on +how to pre-train language models to more effectively perform in-context +learning. + +
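+  The primitive operation mentioned above, prefix matching, can be probed with
+a simple diagnostic: feed a repeated random token sequence and measure how
+much attention a head places on the token that followed the previous
+occurrence of the current token. The sketch below is an illustrative,
+NumPy-only re-implementation of such a score, not the paper's code.
+
+<pre><code>
+# Hedged sketch of a prefix-matching score for a single attention head.
+import numpy as np
+
+def prefix_matching_score(attn, tokens):
+    """attn: (T, T) attention weights of one head; tokens: input token ids.
+    Accumulates attention mass on positions j with tokens[j - 1] == tokens[i],
+    i.e. the token right after an earlier occurrence of the current token."""
+    T = len(tokens)
+    score, count = 0.0, 0
+    for i in range(1, T):
+        targets = [j for j in range(1, i + 1) if tokens[j - 1] == tokens[i]]
+        if targets:
+            score += attn[i, targets].sum()
+            count += 1
+    return score / max(count, 1)
+
+rng = np.random.default_rng(0)
+seq = rng.integers(0, 50, size=32)
+tokens = np.concatenate([seq, seq])               # repeated random sequence
+attn = rng.random((64, 64))
+attn /= attn.sum(axis=-1, keepdims=True)          # a random, non-induction head
+print(prefix_matching_score(attn, tokens))        # low score expected here
+</code></pre>
+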
+
+ comment: Accepted at Annual Meeting of the Association for Computational + Linguistics (ACL) 2023, Main Proceedings +
+
+
+
+
+ + ♻ ☆ EnrichEvent: Enriching Social Data with Contextual Information for + Emerging Event Extraction + + +
+  Social platforms have emerged as crucial venues for disseminating
+information and discussing real-life social events, which offers an excellent
+opportunity for researchers to design and implement novel event detection
+frameworks. However, most existing approaches merely exploit keyword
+burstiness or network structures to detect unspecified events. Thus, they
+often fail to identify unspecified events, given the challenging nature of
+events and social data. Social data, e.g., tweets, is characterized by
+misspellings, incompleteness, word sense ambiguity, and irregular language, as
+well as variation in aspects of opinions. Moreover, extracting discriminative
+features and patterns for evolving events by exploiting the limited structural
+knowledge is almost infeasible. To address these challenges, in this thesis,
+we propose a novel framework, namely EnrichEvent, that leverages the lexical
+and contextual representations of streaming social data. In particular, we
+leverage contextual knowledge, as well as lexical knowledge, to detect
+semantically related tweets and enhance the effectiveness of the event
+detection approaches. Finally, our proposed framework produces cluster chains
+for each event to show the evolving variation of the event through time. We
+conducted extensive experiments to evaluate our framework, validating its high
+performance and effectiveness in detecting and distinguishing unspecified
+social events.
+
+
+
+
+
+
+ + ♻ ☆ Towards the extraction of robust sign embeddings for low resource sign + language recognition + + +
+ Isolated Sign Language Recognition (SLR) has mostly been applied on datasets +containing signs executed slowly and clearly by a limited group of signers. In +real-world scenarios, however, we are met with challenging visual conditions, +coarticulated signing, small datasets, and the need for signer independent +models. To tackle this difficult problem, we require a robust feature extractor +to process the sign language videos. One could expect human pose estimators to +be ideal candidates. However, due to a domain mismatch with their training sets +and challenging poses in sign language, they lack robustness on sign language +data and image-based models often still outperform keypoint-based models. +Furthermore, whereas the common practice of transfer learning with image-based +models yields even higher accuracy, keypoint-based models are typically trained +from scratch on every SLR dataset. These factors limit their usefulness for +SLR. From the existing literature, it is also not clear which, if any, pose +estimator performs best for SLR. We compare the three most popular pose +estimators for SLR: OpenPose, MMPose and MediaPipe. We show that through +keypoint normalization, missing keypoint imputation, and learning a pose +embedding, we can obtain significantly better results and enable transfer +learning. We show that keypoint-based embeddings contain cross-lingual +features: they can transfer between sign languages and achieve competitive +performance even when fine-tuning only the classifier layer of an SLR model on +a target sign language. We furthermore achieve better performance using +fine-tuned transferred embeddings than models trained only on the target sign +language. The embeddings can also be learned in a multilingual fashion. The +application of these embeddings could prove particularly useful for low +resource sign languages in the future. + +
+
+
+
+
+ + ♻ ☆ Towards Automatic Boundary Detection for Human-AI Collaborative Hybrid + Essay in Education + + +
+ The recent large language models (LLMs), e.g., ChatGPT, have been able to +generate human-like and fluent responses when provided with specific +instructions. While admitting the convenience brought by technological +advancement, educators also have concerns that students might leverage LLMs to +complete their writing assignments and pass them off as their original work. +Although many AI content detection studies have been conducted as a result of +such concerns, most of these prior studies modeled AI content detection as a +classification problem, assuming that a text is either entirely human-written +or entirely AI-generated. In this study, we investigated AI content detection +in a rarely explored yet realistic setting where the text to be detected is +collaboratively written by human and generative LLMs (i.e., hybrid text). We +first formalized the detection task as identifying the transition points +between human-written content and AI-generated content from a given hybrid text +(boundary detection). Then we proposed a two-step approach where we (1) +separated AI-generated content from human-written content during the encoder +training process; and (2) calculated the distances between every two adjacent +prototypes and assumed that the boundaries exist between the two adjacent +prototypes that have the furthest distance from each other. Through extensive +experiments, we observed the following main findings: (1) the proposed approach +consistently outperformed the baseline methods across different experiment +settings; (2) the encoder training process can significantly boost the +performance of the proposed approach; (3) when detecting boundaries for +single-boundary hybrid essays, the proposed approach could be enhanced by +adopting a relatively large prototype size, leading to a 22% improvement in the +In-Domain evaluation and an 18% improvement in the Out-of-Domain evaluation. + +
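+  Step (2) of the approach above reduces, in spirit, to a one-line decision:
+compute the distance between every pair of adjacent prototypes and place the
+boundary where that distance is largest. The sketch below illustrates only
+this step; prototype extraction and the encoder training from step (1) are
+assumed to have happened elsewhere.
+
+<pre><code>
+# Illustrative boundary decision between adjacent prototypes (assumed inputs).
+import numpy as np
+
+def predict_boundary(prototypes):
+    """prototypes: (n, d) array, one prototype per consecutive text segment.
+    Returns i such that the boundary lies between segments i and i + 1."""
+    gaps = np.linalg.norm(prototypes[1:] - prototypes[:-1], axis=1)
+    return int(np.argmax(gaps))
+
+protos = np.array([[0.10, 0.20], [0.15, 0.22], [0.90, 0.80], [0.95, 0.85]])
+print(predict_boundary(protos))   # -> 1: the largest jump is between 1 and 2
+</code></pre>
+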
+
+ comment: 9 pages including references, 2 figures +
+
+
+
+
+ + ♻ ☆ CHAMPAGNE: Learning Real-world Conversation from Large-Scale Web Videos ICCV 2023 + + +
+ Visual information is central to conversation: body gestures and physical +behaviour, for example, contribute to meaning that transcends words alone. To +date, however, most neural conversational models are limited to just text. We +introduce CHAMPAGNE, a generative model of conversations that can account for +visual contexts. To train CHAMPAGNE, we collect and release YTD-18M, a +large-scale corpus of 18M video-based dialogues. YTD-18M is constructed from +web videos: crucial to our data collection pipeline is a pretrained language +model that converts error-prone automatic transcripts to a cleaner dialogue +format while maintaining meaning. Human evaluation reveals that YTD-18M is more +sensible and specific than prior resources (MMDialog, 1M dialogues), while +maintaining visual-groundedness. Experiments demonstrate that 1) CHAMPAGNE +learns to conduct conversation from YTD-18M; and 2) when fine-tuned, it +achieves state-of-the-art results on four vision-language tasks focused on +real-world conversations. We release data, models, and code. + +
+
+ comment: ICCV 2023, Project page: https://seungjuhan.me/champagne +
+
+
+
+
+ + ♻ ☆ SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and + Effective Hotword Customization Ability + + +
+  Hotword customization is one of the important issues remaining in the ASR
+field -- it is of value to enable users of ASR systems to customize the names
+of entities, persons, and other phrases. The past few years have seen both
+implicit and explicit modeling strategies for ASR contextualization developed.
+While these approaches have performed adequately, they still exhibit certain
+shortcomings, such as instability in effectiveness. In this paper, we propose
+the Semantic-augmented Contextual-Paraformer (SeACo-Paraformer), a novel
+NAR-based ASR system with flexible and effective hotword customization
+ability. It combines the accuracy of AED-based models, the efficiency of NAR
+models, and excellent performance in contextualization. In experiments on
+50,000 hours of industrial data, our proposed model outperforms strong
+baselines in customization and general ASR tasks. Besides, we explore an
+efficient way to filter large-scale incoming hotwords for further improvement.
+The source code, the industrial models proposed and compared, and two hotword
+test sets are all released.
+
+
+
+ comment: updated draft +
+
+
+
+
+ + ♻ ☆ Mental-LLM: Leveraging Large Language Models for Mental Health + Prediction via Online Text Data + + +
+ Advances in large language models (LLMs) have empowered a variety of +applications. However, there is still a significant gap in research when it +comes to understanding and enhancing the capabilities of LLMs in the field of +mental health. In this work, we present the first comprehensive evaluation of +multiple LLMs, including Alpaca, Alpaca-LoRA, FLAN-T5, GPT-3.5, and GPT-4, on +various mental health prediction tasks via online text data. We conduct a broad +range of experiments, covering zero-shot prompting, few-shot prompting, and +instruction fine-tuning. The results indicate a promising yet limited +performance of LLMs with zero-shot and few-shot prompt designs for the mental +health tasks. More importantly, our experiments show that instruction +finetuning can significantly boost the performance of LLMs for all tasks +simultaneously. Our best-finetuned models, Mental-Alpaca and Mental-FLAN-T5, +outperform the best prompt design of GPT-3.5 (25 and 15 times bigger) by 10.9% +on balanced accuracy and the best of GPT-4 (250 and 150 times bigger) by 4.8%. +They further perform on par with the state-of-the-art task-specific language +model. We also conduct an exploratory case study on LLMs' capability on the +mental health reasoning tasks, illustrating the promising capability of certain +models such as GPT-4. We summarize our findings into a set of action guidelines +for potential methods to enhance LLMs' capability for mental health tasks. +Meanwhile, we also emphasize the important limitations before achieving +deployability in real-world mental health settings, such as known racial and +gender bias. We highlight the important ethical risks accompanying this line of +research. + +
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential +requirement for satisfying user experience. Different from traditional +e-commerce platforms that offer products, users search on life service +platforms such as Meituan mainly for product providers, which usually have +abundant structured information, e.g. name, address, category, thousands of +products. Modeling search relevance with these rich structured contents is +challenging due to the following issues: (1) there is language distribution +discrepancy among different fields of structured document, making it difficult +to directly adopt off-the-shelf pretrained language model based methods like +BERT. (2) different fields usually have different importance and their length +vary greatly, making it difficult to extract document information helpful for +relevance matching. + To tackle these issues, in this paper we propose a novel two-stage +pretraining and matching architecture for relevance matching with rich +structured documents. At pretraining stage, we propose an effective pretraining +method that employs both query and multiple fields of document as inputs, +including an effective information compression method for lengthy fields. At +relevance matching stage, a novel matching method is proposed by leveraging +domain knowledge in search query to generate more effective document +representations for relevance scoring. Extensive offline experiments and online +A/B tests on millions of users verify that the proposed architectures +effectively improve the performance of relevance modeling. The model has +already been deployed online, serving the search traffic of Meituan for over a +year. + +
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot + Task Generalization + + +
+ We investigate the emergent abilities of the recently proposed web-scale +speech model Whisper, by adapting it to unseen tasks with prompt engineering. +We selected three tasks: audio-visual speech recognition (AVSR), code-switched +speech recognition (CS-ASR), and speech translation (ST) on unseen language +pairs. We design task-specific prompts, by either leveraging another +large-scale model, or simply manipulating the special tokens in the default +prompts. Experiments show that compared to the default prompts, our proposed +prompts improve performance by 10% to 45% on the three zero-shot tasks, and +even outperform SotA supervised models on some datasets. In addition, our +experiments reveal many interesting properties of Whisper, including its +robustness to prompts, bias on accents, and the multilingual understanding in +its latent space. Code is available at +https://github.com/jasonppy/PromptingWhisper + +
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Visual Adversarial Examples Jailbreak Aligned Large Language Models + + +
+  Recently, there has been a surge of interest in integrating vision into
+Large Language Models (LLMs), exemplified by Visual Language Models (VLMs)
+such as Flamingo and GPT-4. This paper sheds light on the security and safety
+implications of this trend. First, we underscore that the continuous and
+high-dimensional nature of the visual input makes it a weak link against
+adversarial attacks, representing an expanded attack surface of
+vision-integrated LLMs. Second, we highlight that the versatility of LLMs also
+presents visual attackers with a wider array of achievable adversarial
+objectives, extending the implications of security failures beyond mere
+misclassification. As an illustration, we present a case study in which we
+exploit visual adversarial examples to circumvent the safety guardrail of
+aligned LLMs with integrated vision. Intriguingly, we discover that a single
+visual adversarial example can universally jailbreak an aligned LLM,
+compelling it to heed a wide range of harmful instructions that it otherwise
+would not, and to generate harmful content that transcends the narrow scope of
+a `few-shot' derogatory corpus initially employed to optimize the adversarial
+example. Our study underscores the escalating adversarial risks associated
+with the pursuit of multimodality. Our findings also connect the long-studied
+adversarial vulnerabilities of neural networks to the nascent field of AI
+alignment. The presented attack suggests a fundamental adversarial challenge
+for AI alignment, especially in light of the emerging trend toward
+multimodality in frontier foundation models.
+
+
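+  For readers unfamiliar with how such image-space attacks are typically
+optimized, the generic projected-gradient sketch below conveys the idea. The
+vlm_loss callable is a placeholder for the negative log-likelihood a
+vision-language model assigns to some target text given the image; no specific
+model API or attack configuration from the paper is implied.
+
+<pre><code>
+# Generic, hedged PGD-style sketch for optimizing a visual adversarial example.
+import torch
+
+def pgd_image_attack(image, vlm_loss, eps=16 / 255, step=1 / 255, iters=100):
+    adv = image.clone().detach()
+    for _ in range(iters):
+        adv.requires_grad_(True)
+        loss = vlm_loss(adv)                  # lower = target text more likely
+        grad = torch.autograd.grad(loss, adv)[0]
+        with torch.no_grad():
+            adv = adv - step * grad.sign()                # descend on the loss
+            adv = image + (adv - image).clamp(-eps, eps)  # stay in the eps-ball
+            adv = adv.clamp(0, 1)                         # keep a valid image
+    return adv.detach()
+
+# Toy usage with a stand-in loss (a real attack would query an actual VLM):
+target = torch.rand(3, 224, 224)
+adv = pgd_image_attack(torch.rand(3, 224, 224),
+                       lambda x: ((x - target) ** 2).mean(), iters=10)
+print(adv.shape)
+</code></pre>
+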
+
+
+
+
+ + ♻ ☆ Integrating Knowledge Graph embedding and pretrained Language Models in + Hypercomplex Spaces ISWC2023 + + +
+ Knowledge Graphs, such as Wikidata, comprise structural and textual knowledge +in order to represent knowledge. For each of the two modalities dedicated +approaches for graph embedding and language models learn patterns that allow +for predicting novel structural knowledge. Few approaches have integrated +learning and inference with both modalities and these existing ones could only +partially exploit the interaction of structural and textual knowledge. In our +approach, we build on existing strong representations of single modalities and +we use hypercomplex algebra to represent both, (i), single-modality embedding +as well as, (ii), the interaction between different modalities and their +complementary means of knowledge representation. More specifically, we suggest +Dihedron and Quaternion representations of 4D hypercomplex numbers to integrate +four modalities namely structural knowledge graph embedding, word-level +representations (e.g.\ Word2vec, Fasttext), sentence-level representations +(Sentence transformer), and document-level representations (sentence +transformer, Doc2vec). Our unified vector representation scores the +plausibility of labelled edges via Hamilton and Dihedron products, thus +modeling pairwise interactions between different modalities. Extensive +experimental evaluation on standard benchmark datasets shows the superiority of +our two new models using abundant textual information besides sparse structural +knowledge to enhance performance in link prediction tasks. + +
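+  One concrete way to read "scores the plausibility of labelled edges via
+Hamilton ... products" is the quaternion scoring sketched below. This is an
+illustrative, simplified stand-in, not the exact Dihedron or Quaternion
+formulation of the paper; the assignment of modalities to the four components
+is likewise an assumption made only for illustration.
+
+<pre><code>
+# Hedged sketch: Hamilton-product scoring of a labelled edge with 4D
+# hypercomplex (quaternion) embeddings.
+import numpy as np
+
+def hamilton(q, p):
+    """Hamilton product of quaternions stored as (..., 4) arrays [w, x, y, z]."""
+    w1, x1, y1, z1 = q[..., 0], q[..., 1], q[..., 2], q[..., 3]
+    w2, x2, y2, z2 = p[..., 0], p[..., 1], p[..., 2], p[..., 3]
+    return np.stack([
+        w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2,
+        w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2,
+        w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2,
+        w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2,
+    ], axis=-1)
+
+def edge_score(head, rel, tail):
+    """head/rel/tail: (d, 4) embeddings whose four components could carry, e.g.,
+    structural, word-, sentence- and document-level views of an entity."""
+    return float(np.sum(hamilton(head, rel) * tail))
+
+d = 16
+h, r, t = (np.random.randn(d, 4) for _ in range(3))
+print(edge_score(h, r, t))    # higher score = more plausible edge
+</code></pre>
+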
+
+ comment: ISWC2023 version +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 111 + +
+
+
+ + ☆ TeCH: Text-guided Reconstruction of Lifelike Clothed Humans + + +
+ Despite recent research advancements in reconstructing clothed humans from a +single image, accurately restoring the "unseen regions" with high-level details +remains an unsolved challenge that lacks attention. Existing methods often +generate overly smooth back-side surfaces with a blurry texture. But how to +effectively capture all visual attributes of an individual from a single image, +which are sufficient to reconstruct unseen areas (e.g., the back view)? +Motivated by the power of foundation models, TeCH reconstructs the 3D human by +leveraging 1) descriptive text prompts (e.g., garments, colors, hairstyles) +which are automatically generated via a garment parsing model and Visual +Question Answering (VQA), 2) a personalized fine-tuned Text-to-Image diffusion +model (T2I) which learns the "indescribable" appearance. To represent +high-resolution 3D clothed humans at an affordable cost, we propose a hybrid 3D +representation based on DMTet, which consists of an explicit body shape grid +and an implicit distance field. Guided by the descriptive prompts + +personalized T2I diffusion model, the geometry and texture of the 3D humans are +optimized through multi-view Score Distillation Sampling (SDS) and +reconstruction losses based on the original observation. TeCH produces +high-fidelity 3D clothed humans with consistent & delicate texture, and +detailed full-body geometry. Quantitative and qualitative experiments +demonstrate that TeCH outperforms the state-of-the-art methods in terms of +reconstruction accuracy and rendering quality. The code will be publicly +available for research purposes at https://huangyangyi.github.io/tech + +
+
+ comment: Project: https://huangyangyi.github.io/tech +
+
+
+
+
+ + ☆ MeViS: A Large-scale Benchmark for Video Segmentation with Motion + Expressions ICCV 2023 + + +
+ This paper strives for motion expressions guided video segmentation, which +focuses on segmenting objects in video content based on a sentence describing +the motion of the objects. Existing referring video object datasets typically +focus on salient objects and use language expressions that contain excessive +static attributes that could potentially enable the target object to be +identified in a single frame. These datasets downplay the importance of motion +in video content for language-guided video object segmentation. To investigate +the feasibility of using motion expressions to ground and segment objects in +videos, we propose a large-scale dataset called MeViS, which contains numerous +motion expressions to indicate target objects in complex environments. We +benchmarked 5 existing referring video object segmentation (RVOS) methods and +conducted a comprehensive comparison on the MeViS dataset. The results show +that current RVOS methods cannot effectively address motion expression-guided +video segmentation. We further analyze the challenges and propose a baseline +approach for the proposed MeViS dataset. The goal of our benchmark is to +provide a platform that enables the development of effective language-guided +video segmentation algorithms that leverage motion expressions as a primary cue +for object segmentation in complex video scenes. The proposed MeViS dataset has +been released at https://henghuiding.github.io/MeViS. + +
+
+ comment: ICCV 2023, Project Page: https://henghuiding.github.io/MeViS/ +
+
+
+
+
+ + ☆ InsightMapper: A Closer Look at Inner-instance Information for + Vectorized High-Definition Mapping + + +
+ Vectorized high-definition (HD) maps contain detailed information about +surrounding road elements, which are crucial for various downstream tasks in +modern autonomous driving vehicles, such as vehicle planning and control. +Recent works have attempted to directly detect the vectorized HD map as a point +set prediction task, resulting in significant improvements in detection +performance. However, these approaches fail to analyze and exploit the +inner-instance correlations between predicted points, impeding further +advancements. To address these challenges, we investigate the utilization of +inner-$\textbf{INS}$tance information for vectorized h$\textbf{IGH}$-definition +mapping through $\textbf{T}$ransformers and introduce InsightMapper. This paper +presents three novel designs within InsightMapper that leverage inner-instance +information in distinct ways, including hybrid query generation, inner-instance +query fusion, and inner-instance feature aggregation. Comparative experiments +are conducted on the NuScenes dataset, showcasing the superiority of our +proposed method. InsightMapper surpasses previous state-of-the-art (SOTA) +methods by 5.78 mAP and 5.12 TOPO, which assess topology correctness. +Simultaneously, InsightMapper maintains high efficiency during both training +and inference phases, resulting in remarkable comprehensive performance. The +project page for this work is available at +https://tonyxuqaq.github.io/projects/InsightMapper . + +
+
+ comment: Code and demo will be available at + https://tonyxuqaq.github.io/projects/InsightMapper +
+
+
+
+
+
+   ♻ ☆ Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an
+  Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction ICCV
+
+
+
+ Neural Radiance Fields (NeRFs) have revolutionized the field of novel view +synthesis, demonstrating remarkable performance. However, the modeling and +rendering of reflective objects remain challenging problems. Recent methods +have shown significant improvements over the baselines in handling reflective +scenes, albeit at the expense of efficiency. In this work, we aim to strike a +balance between efficiency and quality. To this end, we investigate an +implicit-explicit approach based on conventional volume rendering to enhance +the reconstruction quality and accelerate the training and rendering processes. +We adopt an efficient density-based grid representation and reparameterize the +reflected radiance in our pipeline. Our proposed reflection-aware approach +achieves a competitive quality efficiency trade-off compared to competing +methods. Based on our experimental results, we propose and discuss hypotheses +regarding the factors influencing the results of density-based methods for +reconstructing reflective objects. The source code is available at: +https://github.com/gkouros/ref-dvgo + +
+
+ comment: 5 pages, 4 figures, 3 tables, ICCV TRICKY 2023 Workshop +
+
+
+
+
+ + ☆ Diagnosing Human-object Interaction Detectors + + +
+ Although we have witnessed significant progress in human-object interaction +(HOI) detection with increasingly high mAP (mean Average Precision), a single +mAP score is too concise to obtain an informative summary of a model's +performance and to understand why one approach is better than another. In this +paper, we introduce a diagnosis toolbox for analyzing the error sources of the +existing HOI detection models. We first conduct holistic investigations in the +pipeline of HOI detection, consisting of human-object pair detection and then +interaction classification. We define a set of errors and the oracles to fix +each of them. By measuring the mAP improvement obtained from fixing an error +using its oracle, we can have a detailed analysis of the significance of +different errors. We then delve into the human-object detection and interaction +classification, respectively, and check the model's behavior. For the first +detection task, we investigate both recall and precision, measuring the +coverage of ground-truth human-object pairs as well as the noisiness level in +the detections. For the second classification task, we compute mAP for +interaction classification only, without considering the detection scores. We +also measure the performance of the models in differentiating human-object +pairs with and without actual interactions using the AP (Average Precision) +score. Our toolbox is applicable for different methods across different +datasets and available at https://github.com/neu-vi/Diag-HOI. + +
+
+
+
+
+ + ☆ Likelihood-Based Text-to-Image Evaluation with Patch-Level Perceptual + and Semantic Credit Assignment + + +
+  Text-to-image synthesis has made encouraging progress and attracted
+considerable public attention recently. However, popular evaluation metrics in
+this area, like the Inception Score and Fréchet Inception Distance, suffer
+from several issues. First of all, they cannot explicitly assess the
+perceptual quality of generated images and poorly reflect the semantic
+alignment of each text-image pair. Also, they are inefficient and need to
+sample thousands of images to stabilise their evaluation results. In this
+paper, we propose to evaluate text-to-image generation performance by directly
+estimating the likelihood of the generated images using a pre-trained
+likelihood-based text-to-image generative model, i.e., a higher likelihood
+indicates better perceptual quality and better text-image alignment. To
+prevent the likelihood from being dominated by the non-crucial parts of the
+generated image, we propose several new designs to develop a credit assignment
+strategy based on the semantic and perceptual significance of the image
+patches. In the experiments, we evaluate the proposed metric on multiple
+popular text-to-image generation models and datasets, assessing both the
+perceptual quality and the text-image alignment. Moreover, it can successfully
+assess the generation ability of these models with as few as a hundred
+samples, making it very efficient in practice.
+
+
+
+
+
+
+ + ☆ Painter: Teaching Auto-regressive Language Models to Draw Sketches + + +
+ Large language models (LLMs) have made tremendous progress in natural +language understanding and they have also been successfully adopted in other +domains such as computer vision, robotics, reinforcement learning, etc. In this +work, we apply LLMs to image generation tasks by directly generating the +virtual brush strokes to paint an image. We present Painter, an LLM that can +convert user prompts in text description format to sketches by generating the +corresponding brush strokes in an auto-regressive way. We construct Painter +based on off-the-shelf LLM that is pre-trained on a large text corpus, by +fine-tuning it on the new task while preserving language understanding +capabilities. We create a dataset of diverse multi-object sketches paired with +textual prompts that covers several object types and tasks. Painter can +generate sketches from text descriptions, remove objects from canvas, and +detect and classify objects in sketches. Although this is an unprecedented +pioneering work in using LLMs for auto-regressive image generation, the results +are very encouraging. + +
+
+
+
+
+ + ☆ Exploiting Point-Wise Attention in 6D Object Pose Estimation Based on + Bidirectional Prediction + + +
+  Traditional geometric registration based estimation methods only exploit the
+CAD model implicitly, which leads to their dependence on observation quality
+and their susceptibility to occlusion. To address this problem, the paper
+proposes a bidirectional correspondence prediction network with a point-wise
+attention-aware mechanism. This network not only requires the model points to
+predict the correspondence but also explicitly models the geometric
+similarities between observations and the model prior. Our key insight is that
+the correlations between each model point and scene point provide essential
+information for learning point-pair matches. To further tackle the correlation
+noise brought by feature distribution divergence, we design a simple but
+effective pseudo-siamese network to improve feature homogeneity. Experimental
+results on the public datasets of LineMOD, YCB-Video, and Occ-LineMOD show
+that the proposed method achieves better performance than other
+state-of-the-art methods under the same evaluation criteria. Its robustness in
+estimating poses is greatly improved, especially in an environment with severe
+occlusions.
+
+
+
+
+
+
+ + ☆ Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse + Problems + + +
+ Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial +technologies in the field of medical imaging. Score-based models have proven to +be effective in addressing different inverse problems encountered in CT and +MRI, such as sparse-view CT and fast MRI reconstruction. However, these models +face challenges in achieving accurate three dimensional (3D) volumetric +reconstruction. The existing score-based models primarily focus on +reconstructing two dimensional (2D) data distribution, leading to +inconsistencies between adjacent slices in the reconstructed 3D volumetric +images. To overcome this limitation, we propose a novel two-and-a-half order +score-based model (TOSM). During the training phase, our TOSM learns data +distributions in 2D space, which reduces the complexity of training compared to +directly working on 3D volumes. However, in the reconstruction phase, the TOSM +updates the data distribution in 3D space, utilizing complementary scores along +three directions (sagittal, coronal, and transaxial) to achieve a more precise +reconstruction. The development of TOSM is built on robust theoretical +principles, ensuring its reliability and efficacy. Through extensive +experimentation on large-scale sparse-view CT and fast MRI datasets, our method +demonstrates remarkable advancements and attains state-of-the-art results in +solving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively +addresses the inter-slice inconsistency issue, resulting in high-quality 3D +volumetric reconstruction. + +
+
+
+
+
+ + ☆ ResBuilder: Automated Learning of Depth with Residual Structures + + +
+  In this work, we develop a neural architecture search algorithm, termed
+ResBuilder, that builds ResNet architectures from scratch which achieve high
+accuracy at moderate computational cost. It can also be used to modify
+existing architectures, removing and inserting ResNet blocks, and in this way
+searching for suitable architectures within the space of ResNet architectures.
+In our experiments on different image classification datasets, ResBuilder
+achieves close to state-of-the-art performance while saving computational cost
+compared to off-the-shelf ResNets. Notably, we tune the parameters once on
+CIFAR10, which yields a suitable default choice for all other datasets. We
+demonstrate that this property generalizes even to industrial applications by
+applying our method with default parameters on a proprietary fraud detection
+dataset.
+
+
+
+
+
+
+ + ☆ Self-Supervised Online Camera Calibration for Automated Driving and + Parking Applications + + +
+  Camera-based perception systems play a central role in modern autonomous
+vehicles. These camera-based perception algorithms require an accurate
+calibration to map real-world distances to image pixels. In practice,
+calibration is a laborious procedure requiring specialised data collection and
+careful tuning. This process must be repeated whenever the parameters of the
+camera change, which can be a frequent occurrence in autonomous vehicles.
+Hence, there is a need to recalibrate at regular intervals to ensure the
+camera remains accurate. We propose a deep learning framework that learns the
+intrinsic and extrinsic calibration of the camera in real time. The framework
+is self-supervised and does not require any labelling or supervision to learn
+the calibration parameters. The framework learns calibration without the need
+for any physical targets or for driving the car on special planar surfaces.
+
+
+
+
+
+
+ + ☆ DeDoDe: Detect, Don't Describe -- Describe, Don't Detect for Local + Feature Matching + + +
+ Keypoint detection is a pivotal step in 3D reconstruction, whereby sets of +(up to) K points are detected in each view of a scene. Crucially, the detected +points need to be consistent between views, i.e., correspond to the same 3D +point in the scene. One of the main challenges with keypoint detection is the +formulation of the learning objective. Previous learning-based methods +typically jointly learn descriptors with keypoints, and treat the keypoint +detection as a binary classification task on mutual nearest neighbours. +However, basing keypoint detection on descriptor nearest neighbours is a proxy +task, which is not guaranteed to produce 3D-consistent keypoints. Furthermore, +this ties the keypoints to a specific descriptor, complicating downstream +usage. In this work, we instead learn keypoints directly from 3D consistency. +To this end, we train the detector to detect tracks from large-scale SfM. As +these points are often overly sparse, we derive a semi-supervised two-view +detection objective to expand this set to a desired number of detections. To +train a descriptor, we maximize the mutual nearest neighbour objective over the +keypoints with a separate network. Results show that our approach, DeDoDe, +achieves significant gains on multiple geometry benchmarks. Code is provided at +https://github.com/Parskatt/DeDoDe . + +
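+  The descriptor objective mentioned above is built around mutual nearest
+neighbours between two views. As a hedged, illustrative aside (not the
+authors' training code), the sketch below shows the matching criterion itself:
+keeping only descriptor pairs that are each other's nearest neighbour.
+
+<pre><code>
+# Small sketch of mutual-nearest-neighbour matching between descriptor sets.
+import numpy as np
+
+def mutual_nearest_neighbours(desc_a, desc_b):
+    """desc_a: (N, D), desc_b: (M, D) L2-normalised descriptors.
+    Returns index pairs (i, j) that are each other's nearest neighbour."""
+    sim = desc_a @ desc_b.T
+    nn_ab = sim.argmax(axis=1)      # best match in B for each descriptor in A
+    nn_ba = sim.argmax(axis=0)      # best match in A for each descriptor in B
+    return [(i, j) for i, j in enumerate(nn_ab) if nn_ba[j] == i]
+
+a = np.random.randn(100, 128); a /= np.linalg.norm(a, axis=1, keepdims=True)
+b = np.random.randn(120, 128); b /= np.linalg.norm(b, axis=1, keepdims=True)
+print(len(mutual_nearest_neighbours(a, b)))
+</code></pre>
+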
+
+
+
+
+ + ☆ Classification Committee for Active Deep Object Detection + + +
+  In object detection, the cost of labeling is very high because it requires
+not only confirming the categories of multiple objects in an image but also
+accurately determining the bounding box of each object. Thus, integrating
+active learning into object detection is of great practical significance. In
+this paper, we propose a classification-committee method for active deep
+object detection, which introduces a discrepancy mechanism among multiple
+classifiers for sample selection when training object detectors. The model
+contains a main detector and a classification committee. The main detector
+denotes the target object detector trained from a labeled pool composed of the
+selected informative images. The role of the classification committee is to
+select the most informative images according to their uncertainty values from
+the view of classification, which is expected to focus more on the discrepancy
+and representativeness of instances. Specifically, the uncertainty for a
+specified instance within the image is computed by measuring the discrepancy
+of the outputs of the committee, which is pre-trained via the proposed Maximum
+Classifiers Discrepancy Group Loss (MCDGL). The most informative images are
+finally determined by selecting the ones with many high-uncertainty instances.
+Besides, to mitigate the impact of interference instances, we design a Focus
+on Positive Instances Loss (FPIL) to give the committee the ability to
+automatically focus on representative instances as well as to precisely encode
+their discrepancies for the same instance. Experiments are conducted on the
+Pascal VOC and COCO datasets with several popular object detectors, and the
+results show that our method outperforms state-of-the-art active learning
+methods, which verifies the effectiveness of the proposed method.
+
+
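+  As a hedged illustration of the committee idea (and not of the MCDGL or FPIL
+losses themselves), the sketch below scores an instance by the disagreement
+between the committee members' softmax outputs and ranks images by how many
+high-uncertainty instances they contain; the threshold value is an arbitrary
+placeholder.
+
+<pre><code>
+# Hedged sketch of committee-disagreement uncertainty for active selection.
+import numpy as np
+
+def committee_uncertainty(probs):
+    """probs: (n_classifiers, n_classes) softmax outputs for one instance.
+    Average pairwise L2 discrepancy between committee members."""
+    n = probs.shape[0]
+    total, pairs = 0.0, 0
+    for i in range(n):
+        for j in range(i + 1, n):
+            total += np.linalg.norm(probs[i] - probs[j])
+            pairs += 1
+    return total / max(pairs, 1)
+
+def select_informative_images(per_image_instance_probs, budget, thresh=0.5):
+    """Rank images by the number of high-uncertainty instances they contain."""
+    scores = [sum(committee_uncertainty(p) > thresh for p in instances)
+              for instances in per_image_instance_probs]
+    return np.argsort(scores)[::-1][:budget]
+
+images = [[np.random.dirichlet(np.ones(20), size=3) for _ in range(5)]
+          for _ in range(8)]                    # 8 images, 5 instances each
+print(select_informative_images(images, budget=2))
+</code></pre>
+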
+
+
+
+
+ + ☆ Hierarchical Uncertainty Estimation for Medical Image Segmentation + Networks + + +
+ Learning a medical image segmentation model is an inherently ambiguous task, +as uncertainties exist in both images (noise) and manual annotations (human +errors and bias) used for model training. To build a trustworthy image +segmentation model, it is important to not just evaluate its performance but +also estimate the uncertainty of the model prediction. Most state-of-the-art +image segmentation networks adopt a hierarchical encoder architecture, +extracting image features at multiple resolution levels from fine to coarse. In +this work, we leverage this hierarchical image representation and propose a +simple yet effective method for estimating uncertainties at multiple levels. +The multi-level uncertainties are modelled via the skip-connection module and +then sampled to generate an uncertainty map for the predicted image +segmentation. We demonstrate that a deep learning segmentation network such as +U-net, when implemented with such hierarchical uncertainty estimation module, +can achieve a high segmentation performance, while at the same time provide +meaningful uncertainty maps that can be used for out-of-distribution detection. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Learning to Distill Global Representation for Sparse-View CT ICCV 2023 + + +
+ Sparse-view computed tomography (CT) -- using a small number of projections +for tomographic reconstruction -- enables much lower radiation dose to patients +and accelerated data acquisition. The reconstructed images, however, suffer +from strong artifacts, greatly limiting their diagnostic value. Current trends +for sparse-view CT turn to the raw data for better information recovery. The +resultant dual-domain methods, nonetheless, suffer from secondary artifacts, +especially in ultra-sparse view scenarios, and their generalization to other +scanners/protocols is greatly limited. A crucial question arises: have the +image post-processing methods reached the limit? Our answer is not yet. In this +paper, we stick to image post-processing methods due to great flexibility and +propose global representation (GloRe) distillation framework for sparse-view +CT, termed GloReDi. First, we propose to learn GloRe with Fourier convolution, +so each element in GloRe has an image-wide receptive field. Second, unlike +methods that only use the full-view images for supervision, we propose to +distill GloRe from intermediate-view reconstructed images that are readily +available but not explored in previous literature. The success of GloRe +distillation is attributed to two key components: representation directional +distillation to align the GloRe directions, and band-pass-specific contrastive +distillation to gain clinically important details. Extensive experiments +demonstrate the superiority of the proposed GloReDi over the state-of-the-art +methods, including dual-domain ones. The source code is available at +https://github.com/longzilicart/GloReDi. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ High-Fidelity Lake Extraction via Two-Stage Prompt Enhancement: + Establishing a Novel Baseline and Benchmark + + +
+ The extraction of lakes from remote sensing images is a complex challenge due +to the varied lake shapes and data noise. Current methods rely on multispectral +image datasets, making it challenging to learn lake features accurately from +pixel arrangements. This, in turn, affects model learning and the creation of +accurate segmentation masks. This paper introduces a unified prompt-based +dataset construction approach that provides approximate lake locations using +point, box, and mask prompts. We also propose a two-stage prompt enhancement +framework, LEPrompter, which involves prompt-based and prompt-free stages +during training. The prompt-based stage employs a prompt encoder to extract +prior information, integrating prompt tokens and image embeddings through self- +and cross-attention in the prompt decoder. Prompts are deactivated once the +model is trained to ensure independence during inference, enabling automated +lake extraction. Evaluations on Surface Water and Qinghai-Tibet Plateau Lake +datasets show consistent performance improvements compared to the previous +state-of-the-art method. LEPrompter achieves mIoU scores of 91.48% and 97.43% +on the respective datasets without introducing additional parameters or GFLOPs. +Supplementary materials provide the source code, pre-trained models, and +detailed user studies. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Integrating Visual and Semantic Similarity Using Hierarchies for Image + Retrieval + + +
+  Most of the research in content-based image retrieval (CBIR) focuses on
+developing robust feature representations that can effectively retrieve
+instances from a database of images that are visually similar to a query.
+However, the retrieved images sometimes contain results that are not
+semantically related to the query. To address this, we propose a method for
+CBIR that captures both visual and semantic similarity using a visual
+hierarchy. The hierarchy is constructed by merging classes with overlapping
+features in the latent space of a deep neural network trained for
+classification, assuming that overlapping classes share high visual and
+semantic similarities. Finally, the constructed hierarchy is integrated into
+the distance calculation metric for similarity search. Experiments on the
+standard datasets CUB-200-2011 and CIFAR100, and on a real-life use case using
+diatom microscopy images, show that our method achieves superior performance
+compared to existing methods on image retrieval.
+
+
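+  One simple, hedged reading of "integrated into the distance calculation
+metric" is an additive combination of a visual distance and a hierarchy-derived
+distance, as sketched below; the weighting and the tree-distance definition are
+illustrative assumptions rather than the paper's exact formulation.
+
+<pre><code>
+# Hedged sketch: combining visual distance with a class-hierarchy distance.
+import numpy as np
+
+def tree_distance(label_a, label_b, parents):
+    """Edges between two classes in a hierarchy given as child -> parent
+    (roots map to None)."""
+    def ancestors(x):
+        path = [x]
+        while parents.get(x) is not None:
+            x = parents[x]
+            path.append(x)
+        return path
+    pa, pb = ancestors(label_a), ancestors(label_b)
+    common = next(a for a in pa if a in pb)
+    return pa.index(common) + pb.index(common)
+
+def hierarchical_distance(feat_q, feat_db, label_q, label_db, parents, lam=0.1):
+    visual = np.linalg.norm(feat_q - feat_db)
+    return visual + lam * tree_distance(label_q, label_db, parents)
+
+parents = {"sparrow": "bird", "eagle": "bird", "trout": "fish",
+           "bird": "animal", "fish": "animal", "animal": None}
+print(tree_distance("sparrow", "eagle", parents))   # 2
+print(tree_distance("sparrow", "trout", parents))   # 4
+</code></pre>
+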
+
+ comment: Accepted in ICVS 2023 +
+
+
+
+
+ + ☆ ALIP: Adaptive Language-Image Pre-training with Synthetic Caption ICCV2023 + + +
+ Contrastive Language-Image Pre-training (CLIP) has significantly boosted the +performance of various vision-language tasks by scaling up the dataset with +image-text pairs collected from the web. However, the presence of intrinsic +noise and unmatched image-text pairs in web data can potentially affect the +performance of representation learning. To address this issue, we first utilize +the OFA model to generate synthetic captions that focus on the image content. +The generated captions contain complementary information that is beneficial for +pre-training. Then, we propose an Adaptive Language-Image Pre-training (ALIP), +a bi-path model that integrates supervision from both raw text and synthetic +caption. As the core components of ALIP, the Language Consistency Gate (LCG) +and Description Consistency Gate (DCG) dynamically adjust the weights of +samples and image-text/caption pairs during the training process. Meanwhile, +the adaptive contrastive loss can effectively reduce the impact of noise data +and enhances the efficiency of pre-training data. We validate ALIP with +experiments on different scales of models and pre-training datasets. +Experiments results show that ALIP achieves state-of-the-art performance on +multiple downstream tasks including zero-shot image-text retrieval and linear +probe. To facilitate future research, the code and pre-trained models are +released at https://github.com/deepglint/ALIP. + +
+
+ comment: 15pages, 10figures, ICCV2023 +
+
+
+
+
+ + ☆ Tem-adapter: Adapting Image-Text Pretraining for Video Question Answer ICCV 2023 + + +
+ Video-language pre-trained models have shown remarkable success in guiding +video question-answering (VideoQA) tasks. However, due to the length of video +sequences, training large-scale video-based models incurs considerably higher +costs than training image-based ones. This motivates us to leverage the +knowledge from image-based pretraining, despite the obvious gaps between image +and video domains. To bridge these gaps, in this paper, we propose Tem-Adapter, +which enables the learning of temporal dynamics and complex semantics by a +visual Temporal Aligner and a textual Semantic Aligner. Unlike conventional +pretrained knowledge adaptation methods that only concentrate on the downstream +task objective, the Temporal Aligner introduces an extra language-guided +autoregressive task aimed at facilitating the learning of temporal +dependencies, with the objective of predicting future states based on +historical clues and language guidance that describes event progression. +Besides, to reduce the semantic gap and adapt the textual representation for +better event description, we introduce a Semantic Aligner that first designs a +template to fuse question and answer pairs as event descriptions and then +learns a Transformer decoder with the whole video sequence as guidance for +refinement. We evaluate Tem-Adapter and different pre-train transferring +methods on two VideoQA benchmarks, and the significant performance improvement +demonstrates the effectiveness of our method. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ Prediction of post-radiotherapy recurrence volumes in head and neck + squamous cell carcinoma using 3D U-Net segmentation + + +
+ Locoregional recurrences (LRR) are still a frequent site of treatment failure +for head and neck squamous cell carcinoma (HNSCC) patients. + Identification of high risk subvolumes based on pretreatment imaging is key +to biologically targeted radiation therapy. We investigated the extent to which +a Convolutional neural network (CNN) is able to predict LRR volumes based on +pre-treatment 18F-fluorodeoxyglucose positron emission tomography +(FDG-PET)/computed tomography (CT) scans in HNSCC patients and thus the +potential to identify biological high risk volumes using CNNs. + For 37 patients who had undergone primary radiotherapy for oropharyngeal +squamous cell carcinoma, five oncologists contoured the relapse volumes on +recurrence CT scans. Datasets of pre-treatment FDG-PET/CT, gross tumour volume +(GTV) and contoured relapse for each of the patients were randomly divided into +training (n=23), validation (n=7) and test (n=7) datasets. We compared a CNN +trained from scratch, a pre-trained CNN, a SUVmax threshold approach, and using +the GTV directly. + The SUVmax threshold method included 5 out of the 7 relapse origin points +within a volume of median 4.6 cubic centimetres (cc). Both the GTV contour and +best CNN segmentations included the relapse origin 6 out of 7 times with median +volumes of 28 and 18 cc respectively. + The CNN included the same or greater number of relapse volume POs, with +significantly smaller relapse volumes. Our novel findings indicate that CNNs +may predict LRR, yet further work on dataset development is required to attain +clinically useful prediction accuracy. + +
+
+
+
+
+ + ☆ SIGMA: Scale-Invariant Global Sparse Shape Matching + + +
+ We propose a novel mixed-integer programming (MIP) formulation for generating +precise sparse correspondences for highly non-rigid shapes. To this end, we +introduce a projected Laplace-Beltrami operator (PLBO) which combines intrinsic +and extrinsic geometric information to measure the deformation quality induced +by predicted correspondences. We integrate the PLBO, together with an +orientation-aware regulariser, into a novel MIP formulation that can be solved +to global optimality for many practical problems. In contrast to previous +methods, our approach is provably invariant to rigid transformations and global +scaling, initialisation-free, has optimality guarantees, and scales to high +resolution meshes with (empirically observed) linear time. We show +state-of-the-art results for sparse non-rigid matching on several challenging +3D datasets, including data with inconsistent meshing, as well as applications +in mesh-to-point-cloud matching. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Robust Autonomous Vehicle Pursuit without Expert Steering Labels + + +
+ In this work, we present a learning method for lateral and longitudinal +motion control of an ego-vehicle for vehicle pursuit. The car being controlled +does not have a pre-defined route, rather it reactively adapts to follow a +target vehicle while maintaining a safety distance. To train our model, we do +not rely on steering labels recorded from an expert driver but effectively +leverage a classical controller as an offline label generation tool. In +addition, we account for the errors in the predicted control values, which can +lead to a loss of tracking and catastrophic crashes of the controlled vehicle. +To this end, we propose an effective data augmentation approach, which allows +to train a network capable of handling different views of the target vehicle. +During the pursuit, the target vehicle is firstly localized using a +Convolutional Neural Network. The network takes a single RGB image along with +cars' velocities and estimates the target vehicle's pose with respect to the +ego-vehicle. This information is then fed to a Multi-Layer Perceptron, which +regresses the control commands for the ego-vehicle, namely throttle and +steering angle. We extensively validate our approach using the CARLA simulator +on a wide range of terrains. Our method demonstrates real-time performance and +robustness to different scenarios including unseen trajectories and high route +completion. The project page containing code and multimedia can be publicly +accessed here: https://changyaozhou.github.io/Autonomous-Vehicle-Pursuit/. + +
+
+ comment: 9 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Automated Semiconductor Defect Inspection in Scanning Electron + Microscope Images: a Systematic Review + + +
+ A growing need exists for efficient and accurate methods for detecting +defects in semiconductor materials and devices. These defects can have a +detrimental impact on the efficiency of the manufacturing process, because they +cause critical failures and wafer-yield limitations. As nodes and patterns get +smaller, even high-resolution imaging techniques such as Scanning Electron +Microscopy (SEM) produce noisy images due to operating close to sensitivity +levels and due to varying physical properties of different underlayers or +resist materials. This inherent noise is one of the main challenges for defect +inspection. One promising approach is the use of machine learning algorithms, +which can be trained to accurately classify and locate defects in semiconductor +samples. Recently, convolutional neural networks have proved to be particularly +useful in this regard. This systematic review provides a comprehensive overview +of the state of automated semiconductor defect inspection on SEM images, +including the most recent innovations and developments. 38 publications were +selected on this topic, indexed in IEEE Xplore and SPIE databases. For each of +these, the application, methodology, dataset, results, limitations and future +work were summarized. A comprehensive overview and analysis of their methods is +provided. Finally, promising avenues for future work in the field of SEM-based +defect inspection are suggested. + +
+
+ comment: 16 pages, 12 figures, 3 tables +
+
+
+
+
+ + ☆ Agglomerative Transformer for Human-Object Interaction Detection ICCV'23 + + +
+ We propose an agglomerative Transformer (AGER) that enables Transformer-based +human-object interaction (HOI) detectors to flexibly exploit extra +instance-level cues in a single-stage and end-to-end manner for the first time. +AGER acquires instance tokens by dynamically clustering patch tokens and +aligning cluster centers to instances with textual guidance, thus enjoying two +benefits: 1) Integrality: each instance token is encouraged to contain all +discriminative feature regions of an instance, which demonstrates a significant +improvement in the extraction of different instance-level cues and subsequently +leads to a new state-of-the-art performance of HOI detection with 36.75 mAP on +HICO-Det. 2) Efficiency: the dynamical clustering mechanism allows AGER to +generate instance tokens jointly with the feature learning of the Transformer +encoder, eliminating the need of an additional object detector or instance +decoder in prior methods, thus allowing the extraction of desirable extra cues +for HOI detection in a single-stage and end-to-end pipeline. Concretely, AGER +reduces GFLOPs by 8.5% and improves FPS by 36%, even compared to a vanilla +DETR-like pipeline without extra cue extraction. + +
+
+ comment: Accepted by ICCV'23 +
+
+
+
+
+ + ☆ Diff-CAPTCHA: An Image-based CAPTCHA with Security Enhanced by Denoising + Diffusion Model + + +
+ To enhance the security of text CAPTCHAs, various methods have been employed, +such as adding the interference lines on the text, randomly distorting the +characters, and overlapping multiple characters. These methods partly increase +the difficulty of automated segmentation and recognition attacks. However, +facing the rapid development of the end-to-end breaking algorithms, their +security has been greatly weakened. The diffusion model is a novel image +generation model that can generate the text images with deep fusion of +characters and background images. In this paper, an image-click CAPTCHA scheme +called Diff-CAPTCHA is proposed based on denoising diffusion models. The +background image and characters of the CAPTCHA are treated as a whole to guide +the generation process of a diffusion model, thus weakening the character +features available for machine learning, enhancing the diversity of character +features in the CAPTCHA, and increasing the difficulty of breaking algorithms. +To evaluate the security of Diff-CAPTCHA, this paper develops several attack +methods, including end-to-end attacks based on Faster R-CNN and two-stage +attacks, and Diff-CAPTCHA is compared with three baseline schemes, including +commercial CAPTCHA scheme and security-enhanced CAPTCHA scheme based on style +transfer. The experimental results show that diffusion models can effectively +enhance CAPTCHA security while maintaining good usability in human testing. + +
+
+
+
+
+ + ☆ DeepContrast: Deep Tissue Contrast Enhancement using Synthetic Data + Degradations and OOD Model Predictions + + +
+ Microscopy images are crucial for life science research, allowing detailed +inspection and characterization of cellular and tissue-level structures and +functions. However, microscopy data are unavoidably affected by image +degradations, such as noise, blur, or others. Many such degradations also +contribute to a loss of image contrast, which becomes especially pronounced in +deeper regions of thick samples. Today, best performing methods to increase the +quality of images are based on Deep Learning approaches, which typically +require ground truth (GT) data during training. Our inability to counteract +blurring and contrast loss when imaging deep into samples prevents the +acquisition of such clean GT data. The fact that the forward process of +blurring and contrast loss deep into tissue can be modeled, allowed us to +propose a new method that can circumvent the problem of unobtainable GT data. +To this end, we first synthetically degraded the quality of microscopy images +even further by using an approximate forward model for deep tissue image +degradations. Then we trained a neural network that learned the inverse of this +degradation function from our generated pairs of raw and degraded images. We +demonstrated that networks trained in this way can be used out-of-distribution +(OOD) to improve the quality of less severely degraded images, e.g. the raw +data imaged in a microscope. Since the absolute level of degradation in such +microscopy images can be stronger than the additional degradation introduced by +our forward model, we also explored the effect of iterative predictions. Here, +we observed that in each iteration the measured image contrast kept improving +while detailed structures in the images got increasingly removed. Therefore, +dependent on the desired downstream analysis, a balance between contrast +improvement and retention of image details has to be found. + +
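+  The "degrade further, then learn the inverse" recipe described above can be
+pictured with the toy forward model below: already-degraded raw images are
+blurred and their contrast reduced to form training inputs, with the raw
+images themselves acting as targets. The parameter values and the exact
+degradation are placeholders, not the paper's calibrated forward model.
+
+<pre><code>
+# Hedged toy forward model for building training pairs without clean GT.
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def degrade_further(raw, sigma=2.0, contrast=0.6):
+    """Approximate forward model: blur, then pull intensities toward the mean."""
+    blurred = gaussian_filter(raw.astype(np.float32), sigma=sigma)
+    return raw.mean() + contrast * (blurred - raw.mean())
+
+def make_training_pairs(raw_stack, **kwargs):
+    """Inputs are synthetically degraded images; targets are the raw images."""
+    return [(degrade_further(img, **kwargs), img) for img in raw_stack]
+
+stack = np.random.rand(4, 128, 128)        # stand-in for raw microscopy slices
+pairs = make_training_pairs(stack)
+print(pairs[0][0].shape, pairs[0][1].shape)
+</code></pre>
+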
+
+ comment: 8 pages, 7 figures, 1 table +
+
+
+
+
+ + ☆ KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution + + +
+
+ Dynamic convolution learns a linear mixture of $n$ static kernels weighted
+with their sample-dependent attentions, demonstrating superior performance
+compared to normal convolution. However, existing designs are
+parameter-inefficient: they increase the number of convolutional parameters
+by $n$ times. Together with the optimization difficulty, this has prevented
+research on dynamic convolution from using a significantly larger value of
+$n$ (e.g., $n>100$ instead of the typical setting $n<10$) to push forward
+the performance boundary. In this paper, we propose $KernelWarehouse$, a
+more general form of dynamic convolution, which can strike a favorable
+trade-off between parameter efficiency and representation power. Its key
+idea is to redefine the basic concepts of "$kernels$" and "$assembling$
+$kernels$" in dynamic convolution from the perspective of reducing kernel
+dimension and significantly increasing kernel number. In principle,
+KernelWarehouse enhances convolutional parameter dependencies within the
+same layer and across successive layers via tactful kernel partition and
+warehouse sharing, yielding a high degree of freedom to fit a desired
+parameter budget. We validate our method on the ImageNet and MS-COCO
+datasets with different ConvNet architectures, and show that it attains
+state-of-the-art results. For instance, the
+ResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with
+KernelWarehouse on ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1
+accuracy. Thanks to its flexible design, KernelWarehouse can even reduce
+the model size of a ConvNet while improving the accuracy, e.g., our
+ResNet18 model with 36.45%|65.10% parameter reduction relative to the
+baseline shows 2.89%|2.29% absolute improvement in top-1 accuracy.
+
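+
+ As a point of reference for the parameter-inefficiency argument above, the
+following sketch shows the baseline dynamic convolution formulation (a
+linear mixture of $n$ static kernels weighted by sample-dependent
+attentions), not KernelWarehouse itself; the module, tensor shapes, and
+PyTorch usage are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class DynamicConv2d(nn.Module):
+    """Baseline dynamic convolution: y = conv(x, sum_i a_i(x) * W_i)."""
+    def __init__(self, in_ch, out_ch, k=3, n_kernels=4):
+        super().__init__()
+        # n static kernels -> n times more convolutional parameters.
+        self.weight = nn.Parameter(
+            torch.randn(n_kernels, out_ch, in_ch, k, k) * 0.01)
+        self.attn = nn.Linear(in_ch, n_kernels)  # sample-dependent attention
+        self.k = k
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+        pooled = x.mean(dim=(2, 3))              # (b, c) global context
+        a = F.softmax(self.attn(pooled), dim=1)  # (b, n) mixture weights
+        # Mix the n static kernels per sample, then apply them as a single
+        # grouped convolution over the batch.
+        w_mix = torch.einsum("bn,noihw->boihw", a, self.weight)
+        w_mix = w_mix.reshape(-1, c, self.k, self.k)
+        y = F.conv2d(x.reshape(1, -1, h, w), w_mix,
+                     padding=self.k // 2, groups=b)
+        return y.reshape(b, -1, h, w)
+
+x = torch.randn(2, 8, 16, 16)
+print(DynamicConv2d(8, 16)(x).shape)  # torch.Size([2, 16, 16, 16])
+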
+
+ comment: This research work was completed and submitted in early May 2023. + Code and pre-trained models are available at + https://github.com/OSVAI/KernelWarehouse +
+
+
+
+
+ + ☆ Membrane Potential Batch Normalization for Spiking Neural Networks ICCV2023 + + +
+
+ As one of the energy-efficient alternatives to conventional neural
+networks (CNNs), spiking neural networks (SNNs) have gained more and more
+interest recently. To train deep models, several effective batch
+normalization (BN) techniques have been proposed for SNNs. All these BNs
+are suggested to be used after the convolution layer, as is usually done in
+CNNs. However, the spiking neuron is much more complex owing to its
+spatio-temporal dynamics. The regulated data flow after the BN layer will
+be disturbed again by the membrane potential updating operation before the
+firing function, i.e., the nonlinear activation. Therefore, we advocate
+adding another BN layer before the firing function to normalize the
+membrane potential again, called MPBN. To eliminate the time cost induced
+by MPBN, we also propose a training-inference-decoupled re-parameterization
+technique to fold the trained MPBN into the firing threshold. With the
+re-parameterization technique, the MPBN will not introduce any extra time
+burden at inference. Furthermore, the MPBN can also adopt the element-wise
+form, while the BNs after the convolution layer can only use the
+channel-wise form. Experimental results show that the proposed MPBN
+performs well on both popular non-spiking static and neuromorphic datasets.
+Our code is open-sourced at \href{https://github.com/yfguo91/MPBN}{MPBN}.
+
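+
+ The threshold re-parameterization described above can be illustrated with
+a small numerical check. Assuming a hard-threshold firing function and a BN
+with positive scale, normalizing the membrane potential and comparing it
+against the threshold is equivalent to comparing the raw potential against
+a folded threshold; the variable names below are illustrative and not the
+authors' code.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+u = rng.normal(size=10_000)          # membrane potentials for one channel
+mu, var, eps = u.mean(), u.var(), 1e-5
+gamma, beta, v_th = 1.3, 0.2, 1.0    # BN scale/shift and firing threshold
+
+# Training-time view: spike if BN(u) >= v_th.
+u_hat = gamma * (u - mu) / np.sqrt(var + eps) + beta
+spikes_bn = u_hat >= v_th
+
+# Inference-time view: fold BN into the threshold (valid for gamma > 0).
+v_th_folded = (v_th - beta) * np.sqrt(var + eps) / gamma + mu
+spikes_folded = u >= v_th_folded
+
+print((spikes_bn == spikes_folded).mean())  # 1.0 up to floating-point ties
+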
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ GAEI-UNet: Global Attention and Elastic Interaction U-Net for Vessel + Image Segmentation + + +
+ Vessel image segmentation plays a pivotal role in medical diagnostics, aiding +in the early detection and treatment of vascular diseases. While segmentation +based on deep learning has shown promising results, effectively segmenting +small structures and maintaining connectivity between them remains challenging. +To address these limitations, we propose GAEI-UNet, a novel model that combines +global attention and elastic interaction-based techniques. GAEI-UNet leverages +global spatial and channel context information to enhance high-level semantic +understanding within the U-Net architecture, enabling precise segmentation of +small vessels. Additionally, we adopt an elastic interaction-based loss +function to improve connectivity among these fine structures. By capturing the +forces generated by misalignment between target and predicted shapes, our model +effectively learns to preserve the correct topology of vessel networks. +Evaluation on retinal vessel dataset -- DRIVE demonstrates the superior +performance of GAEI-UNet in terms of SE and connectivity of small structures, +without significantly increasing computational complexity. This research aims +to advance the field of vessel image segmentation, providing more accurate and +reliable diagnostic tools for the medical community. The implementation code is +available on Code. + +
+
+ comment: BIBM 2023 Under Review +
+
+
+
+
+ + ☆ Denoising Diffusion Probabilistic Model for Retinal Image Generation and + Segmentation + + +
+
+ Experts use retinal images and vessel trees to detect and diagnose various
+eye, blood circulation, and brain-related diseases. However, manual
+segmentation of retinal images is a time-consuming process that requires
+high expertise and is further complicated by privacy issues. Many methods
+have been proposed to segment images, but the need for large retinal image
+datasets limits the performance of these methods. Several methods
+synthesize samples with deep learning models based on Generative
+Adversarial Networks (GANs), but with limited variety. This paper proposes
+a novel Denoising Diffusion Probabilistic Model (DDPM), which outperforms
+GANs in image synthesis. We developed a Retinal Trees (ReTree) dataset
+consisting of retinal images and corresponding vessel trees, together with
+a segmentation network based on DDPM trained with images from the ReTree
+dataset. In the first stage, we develop a two-stage DDPM that generates
+vessel trees from random numbers belonging to a standard normal
+distribution. Later, the model is guided to generate fundus images from
+given vessel trees and a random distribution. The proposed dataset has been
+evaluated quantitatively and qualitatively. Quantitative evaluation metrics
+include the Frechet Inception Distance (FID) score, Jaccard similarity
+coefficient, Cohen's kappa, Matthew's Correlation Coefficient (MCC),
+precision, recall, F1-score, and accuracy. We trained the vessel
+segmentation model with synthetic data to validate our dataset's efficiency
+and tested it on authentic data. Our developed dataset and source code are
+available at https://github.com/AAleka/retree.
+
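+
+ For readers unfamiliar with DDPMs, the closed-form forward (noising) step
+that such models learn to invert during generation is summarized below.
+This is the standard formulation
+$x_t=\sqrt{\bar\alpha_t}\,x_0+\sqrt{1-\bar\alpha_t}\,\epsilon$, sketched as
+a toy snippet; it is not code from the ReTree repository.
+
+import numpy as np
+
+def ddpm_forward(x0, t, betas, rng=np.random.default_rng(0)):
+    """Sample x_t ~ q(x_t | x_0) in closed form for a standard DDPM."""
+    alphas = 1.0 - betas
+    alpha_bar = np.cumprod(alphas)[t]      # \bar{alpha}_t
+    eps = rng.standard_normal(x0.shape)    # N(0, I) noise
+    xt = np.sqrt(alpha_bar) * x0 + np.sqrt(1.0 - alpha_bar) * eps
+    return xt, eps                         # eps is the usual regression target
+
+betas = np.linspace(1e-4, 2e-2, 1000)      # linear noise schedule
+x0 = np.zeros((64, 64))                    # stand-in for a vessel-tree image
+xt, eps = ddpm_forward(x0, t=500, betas=betas)
+print(xt.std())                            # approaches 1 as t grows
+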
+
+ comment: International Conference on Computational Photography 2023 (ICCP + 2023) +
+
+
+
+
+ + ☆ Improving Depth Gradient Continuity in Transformers: A Comparative Study + on Monocular Depth Estimation with CNN + + +
+ Monocular depth estimation is an ongoing challenge in computer vision. Recent +progress with Transformer models has demonstrated notable advantages over +conventional CNNs in this area. However, there's still a gap in understanding +how these models prioritize different regions in 2D images and how these +regions affect depth estimation performance. To explore the differences between +Transformers and CNNs, we employ a sparse pixel approach to contrastively +analyze the distinctions between the two. Our findings suggest that while +Transformers excel in handling global context and intricate textures, they lag +behind CNNs in preserving depth gradient continuity. To further enhance the +performance of Transformer models in monocular depth estimation, we propose the +Depth Gradient Refinement (DGR) module that refines depth estimation through +high-order differentiation, feature fusion, and recalibration. Additionally, we +leverage optimal transport theory, treating depth maps as spatial probability +distributions, and employ the optimal transport distance as a loss function to +optimize our model. Experimental results demonstrate that models integrated +with the plug-and-play Depth Gradient Refinement (DGR) module and the proposed +loss function enhance performance without increasing complexity and +computational costs. This research not only offers fresh insights into the +distinctions between Transformers and CNNs in depth estimation but also paves +the way for novel depth estimation methodologies. + +
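+
+ The optimal-transport loss mentioned above can be illustrated, in one
+dimension, with the CDF-based computation of the 1-Wasserstein distance
+between two discrete distributions; this toy example is our own sketch of
+the general idea, not the DGR implementation, and the histograms are
+fabricated.
+
+import numpy as np
+
+def wasserstein_1d(p, q, support):
+    """1-Wasserstein distance between two discrete distributions sharing a
+    1D support, computed from the difference of their CDFs."""
+    p = p / p.sum()
+    q = q / q.sum()
+    cdf_gap = np.abs(np.cumsum(p) - np.cumsum(q))
+    widths = np.diff(support, append=support[-1])    # cell widths
+    return float(np.sum(cdf_gap * widths))
+
+support = np.linspace(0.0, 10.0, 100)                # depth values in meters
+pred = np.exp(-0.5 * ((support - 4.0) / 1.0) ** 2)   # predicted depth histogram
+gt = np.exp(-0.5 * ((support - 5.0) / 1.0) ** 2)     # ground-truth histogram
+print(wasserstein_1d(pred, gt, support))             # ~1.0, the mean shift
+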
+
+
+
+
+ + ☆ AdaBrowse: Adaptive Video Browser for Efficient Continuous Sign Language + Recognition + + +
+
+ Raw videos have been shown to contain considerable feature redundancy: in
+many cases only a portion of frames is already enough for accurate
+recognition. In this paper, we are interested in whether such redundancy
+can be effectively leveraged to facilitate efficient inference in
+continuous sign language recognition (CSLR). We propose a novel adaptive
+model (AdaBrowse) to dynamically select the most informative subsequence
+from input video sequences by modelling this problem as a sequential
+decision task. Specifically, we first utilize a lightweight network to
+quickly scan input videos and extract coarse features. These features are
+then fed into a policy network to intelligently select a subsequence to
+process. The corresponding subsequence is finally inferred by a normal CSLR
+model for sentence prediction. As only a portion of frames is processed in
+this procedure, the total computation can be considerably reduced. Besides
+temporal redundancy, we are also interested in whether the inherent spatial
+redundancy can be seamlessly integrated to achieve further efficiency,
+i.e., dynamically selecting the lowest input resolution for each sample;
+this model is referred to as AdaBrowse+. Extensive experimental results on
+four large-scale CSLR datasets, i.e., PHOENIX14, PHOENIX14-T, CSL-Daily and
+CSL, demonstrate the effectiveness of AdaBrowse and AdaBrowse+, which
+achieve accuracy comparable to state-of-the-art methods with 1.44$\times$
+higher throughput and 2.12$\times$ fewer FLOPs. Comparisons with other
+commonly-used 2D CNNs and adaptive efficient methods verify the
+effectiveness of AdaBrowse. Code is available at
+\url{https://github.com/hulianyuyy/AdaBrowse}.
+
+
+ comment: ACMMM2023 +
+
+
+
+
+ + ☆ Visually-Aware Context Modeling for News Image Captioning + + +
+ The goal of News Image Captioning is to generate an image caption according +to the content of both a news article and an image. To leverage the visual +information effectively, it is important to exploit the connection between the +context in the articles/captions and the images. Psychological studies indicate +that human faces in images draw higher attention priorities. On top of that, +humans often play a central role in news stories, as also proven by the +face-name co-occurrence pattern we discover in existing News Image Captioning +datasets. Therefore, we design a face-naming module for faces in images and +names in captions/articles to learn a better name embedding. Apart from names, +which can be directly linked to an image area (faces), news image captions +mostly contain context information that can only be found in the article. +Humans typically address this by searching for relevant information from the +article based on the image. To emulate this thought process, we design a +retrieval strategy using CLIP to retrieve sentences that are semantically close +to the image. We conduct extensive experiments to demonstrate the efficacy of +our framework. Without using additional paired data, we establish the new +state-of-the-art performance on two News Image Captioning datasets, exceeding +the previous state-of-the-art by 5 CIDEr points. We will release code upon +acceptance. + +
+
+
+
+
+ + ☆ Stable and Causal Inference for Discriminative Self-supervised Deep + Visual Representations ICCV 2023 + + +
+ In recent years, discriminative self-supervised methods have made significant +strides in advancing various visual tasks. The central idea of learning a data +encoder that is robust to data distortions/augmentations is straightforward yet +highly effective. Although many studies have demonstrated the empirical success +of various learning methods, the resulting learned representations can exhibit +instability and hinder downstream performance. In this study, we analyze +discriminative self-supervised methods from a causal perspective to explain +these unstable behaviors and propose solutions to overcome them. Our approach +draws inspiration from prior works that empirically demonstrate the ability of +discriminative self-supervised methods to demix ground truth causal sources to +some extent. Unlike previous work on causality-empowered representation +learning, we do not apply our solutions during the training process but rather +during the inference process to improve time efficiency. Through experiments on +both controlled image datasets and realistic image datasets, we show that our +proposed solutions, which involve tempering a linear transformation with +controlled synthetic data, are effective in addressing these issues. + +
+
+ comment: ICCV 2023 accepted paper +
+
+
+
+
+ + ☆ Dual-Stream Diffusion Net for Text-to-Video Generation + + +
+
+ With the emergence of diffusion models, text-to-video generation has
+recently attracted increasing attention. An important bottleneck, however,
+is that generated videos often tend to carry flickers and artifacts. In
+this work, we propose a dual-stream diffusion net (DSDN) to improve the
+consistency of content variations in generated videos. In particular, the
+two designed diffusion streams, a video content branch and a motion branch,
+not only run separately in their private spaces to produce personalized
+video variations as well as content, but are also well aligned between the
+content and motion domains through our designed cross-transformer
+interaction module, which benefits the smoothness of generated videos.
+Besides, we also introduce a motion decomposer and combiner to facilitate
+the operation on video motion. Qualitative and quantitative experiments
+demonstrate that our method can produce continuous videos with fewer
+flickers.
+
+
+ comment: 8pages, 7 figures +
+
+
+
+
+
+ ☆ ECPC-IDS: A benchmark endometrial cancer PET/CT image dataset for
+ evaluation of semantic segmentation and detection of hypermetabolic
+ regions
+
+
+
+ Endometrial cancer is one of the most common tumors in the female
+reproductive system and is the third most common gynecological malignancy
+that causes death, after ovarian and cervical cancer. Early diagnosis can
+significantly improve the 5-year survival rate of patients. With the
+development of artificial intelligence, computer-assisted diagnosis plays
+an increasingly important role in improving the accuracy and objectivity of
+diagnosis, as well as reducing the workload of doctors. However, the
+absence of publicly available endometrial cancer image datasets restricts
+the application of computer-assisted diagnostic techniques. In this paper,
+a publicly available Endometrial Cancer PET/CT Image Dataset for Evaluation
+of Semantic Segmentation and Detection of Hypermetabolic Regions (ECPC-IDS)
+is published. Specifically, the segmentation section includes PET and CT
+images, with a total of 7159 images in multiple formats. In order to prove
+the effectiveness of segmentation methods on ECPC-IDS, five classical deep
+learning semantic segmentation methods are selected to test the image
+segmentation task. The object detection section also includes PET and CT
+images, with a total of 3579 images and XML files with annotation
+information. Six deep learning methods are selected for experiments on the
+detection task. This study conducts extensive experiments using deep
+learning-based semantic segmentation and object detection methods to
+demonstrate the differences between various methods on ECPC-IDS. As far as
+we know, this is the first publicly available endometrial cancer dataset
+with a large number of images from multiple modalities, including the
+information required for segmentation and object detection. ECPC-IDS can
+aid researchers in exploring new algorithms to enhance computer-assisted
+technology, greatly benefiting both clinical doctors and patients.
+
+
+ comment: 14 pages,6 figures +
+
+
+
+
+ + ☆ Leveraging Next-Active Objects for Context-Aware Anticipation in + Egocentric Videos WACV'24 + + +
+ Objects are crucial for understanding human-object interactions. By +identifying the relevant objects, one can also predict potential future +interactions or actions that may occur with these objects. In this paper, we +study the problem of Short-Term Object interaction anticipation (STA) and +propose NAOGAT (Next-Active-Object Guided Anticipation Transformer), a +multi-modal end-to-end transformer network, that attends to objects in observed +frames in order to anticipate the next-active-object (NAO) and, eventually, to +guide the model to predict context-aware future actions. The task is +challenging since it requires anticipating future action along with the object +with which the action occurs and the time after which the interaction will +begin, a.k.a. the time to contact (TTC). Compared to existing video modeling +architectures for action anticipation, NAOGAT captures the relationship between +objects and the global scene context in order to predict detections for the +next active object and anticipate relevant future actions given these +detections, leveraging the objects' dynamics to improve accuracy. One of the +key strengths of our approach, in fact, is its ability to exploit the motion +dynamics of objects within a given clip, which is often ignored by other +models, and separately decoding the object-centric and motion-centric +information. Through our experiments, we show that our model outperforms +existing methods on two separate datasets, Ego4D and EpicKitchens-100 ("Unseen +Set"), as measured by several additional metrics, such as time to contact, and +next-active-object localization. The code will be available upon acceptance. + +
+
+ comment: Accepted in WACV'24 +
+
+
+
+
+ + ☆ Improving Audio-Visual Segmentation with Bidirectional Generation + + +
+ The aim of audio-visual segmentation (AVS) is to precisely differentiate +audible objects within videos down to the pixel level. Traditional approaches +often tackle this challenge by combining information from various modalities, +where the contribution of each modality is implicitly or explicitly modeled. +Nevertheless, the interconnections between different modalities tend to be +overlooked in audio-visual modeling. In this paper, inspired by the human +ability to mentally simulate the sound of an object and its visual appearance, +we introduce a bidirectional generation framework. This framework establishes +robust correlations between an object's visual characteristics and its +associated sound, thereby enhancing the performance of AVS. To achieve this, we +employ a visual-to-audio projection component that reconstructs audio features +from object segmentation masks and minimizes reconstruction errors. Moreover, +recognizing that many sounds are linked to object movements, we introduce an +implicit volumetric motion estimation module to handle temporal dynamics that +may be challenging to capture using conventional optical flow methods. To +showcase the effectiveness of our approach, we conduct comprehensive +experiments and analyses on the widely recognized AVSBench benchmark. As a +result, we establish a new state-of-the-art performance level in the AVS +benchmark, particularly excelling in the challenging MS3 subset which involves +segmenting multiple sound sources. To facilitate reproducibility, we plan to +release both the source code and the pre-trained model. + +
+

 comment: Dawei Hao and Yuxin Mao contributed equally to this paper. Yiran
+ Zhong is the corresponding author. The code will be released at
+ https://github.com/OpenNLPLab/AVS-bidirectional
+
+
+
+
+ + ☆ CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark + Model for Rectal Cancer Segmentation + + +
+ Rectal cancer segmentation of CT image plays a crucial role in timely +clinical diagnosis, radiotherapy treatment, and follow-up. Although current +segmentation methods have shown promise in delineating cancerous tissues, they +still encounter challenges in achieving high segmentation precision. These +obstacles arise from the intricate anatomical structures of the rectum and the +difficulties in performing differential diagnosis of rectal cancer. +Additionally, a major obstacle is the lack of a large-scale, finely annotated +CT image dataset for rectal cancer segmentation. To address these issues, this +work introduces a novel large scale rectal cancer CT image dataset CARE with +pixel-level annotations for both normal and cancerous rectum, which serves as a +valuable resource for algorithm research and clinical application development. +Moreover, we propose a novel medical cancer lesion segmentation benchmark model +named U-SAM. The model is specifically designed to tackle the challenges posed +by the intricate anatomical structures of abdominal organs by incorporating +prompt information. U-SAM contains three key components: promptable information +(e.g., points) to aid in target area localization, a convolution module for +capturing low-level lesion details, and skip-connections to preserve and +recover spatial information during the encoding-decoding process. To evaluate +the effectiveness of U-SAM, we systematically compare its performance with +several popular segmentation methods on the CARE dataset. The generalization of +the model is further verified on the WORD dataset. Extensive experiments +demonstrate that the proposed U-SAM outperforms state-of-the-art methods on +these two datasets. These experiments can serve as the baseline for future +research and clinical application development. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Computer vision-enriched discrete choice models, with an application to + residential location choice + + +
+ Visual imagery is indispensable to many multi-attribute decision situations. +Examples of such decision situations in travel behaviour research include +residential location choices, vehicle choices, tourist destination choices, and +various safety-related choices. However, current discrete choice models cannot +handle image data and thus cannot incorporate information embedded in images +into their representations of choice behaviour. This gap between discrete +choice models' capabilities and the real-world behaviour it seeks to model +leads to incomplete and, possibly, misleading outcomes. To solve this gap, this +study proposes "Computer Vision-enriched Discrete Choice Models" (CV-DCMs). +CV-DCMs can handle choice tasks involving numeric attributes and images by +integrating computer vision and traditional discrete choice models. Moreover, +because CV-DCMs are grounded in random utility maximisation principles, they +maintain the solid behavioural foundation of traditional discrete choice +models. We demonstrate the proposed CV-DCM by applying it to data obtained +through a novel stated choice experiment involving residential location +choices. In this experiment, respondents faced choice tasks with trade-offs +between commute time, monthly housing cost and street-level conditions, +presented using images. As such, this research contributes to the growing body +of literature in the travel behaviour field that seeks to integrate discrete +choice modelling and machine learning. + +
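+
+ Because CV-DCMs stay within random utility maximisation, the choice
+probabilities keep the familiar multinomial logit form once image-derived
+information enters the systematic utility. The sketch below is a minimal
+illustration with made-up attribute values, taste parameters, and a
+stand-in scalar distilled from an image embedding; it is not the authors'
+model specification.
+
+import numpy as np
+
+def logit_probabilities(utilities):
+    """Multinomial logit: P(i) = exp(V_i) / sum_j exp(V_j)."""
+    v = utilities - utilities.max()           # numerical stability
+    expv = np.exp(v)
+    return expv / expv.sum()
+
+# Three residential alternatives: commute time (min), monthly cost, and a
+# scalar score distilled from a street-level image embedding.
+commute = np.array([20.0, 35.0, 50.0])
+cost = np.array([1200.0, 900.0, 700.0])
+image_score = np.array([0.8, 0.1, 0.5])
+
+beta_time, beta_cost, beta_img = -0.05, -0.002, 1.5   # illustrative tastes
+V = beta_time * commute + beta_cost * cost + beta_img * image_score
+print(logit_probabilities(V))                 # choice probabilities, sum to 1
+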
+
+
+
+
+ + ☆ Detecting Olives with Synthetic or Real Data? Olive the Above + + +
+ Modern robotics has enabled the advancement in yield estimation for precision +agriculture. However, when applied to the olive industry, the high variation of +olive colors and their similarity to the background leaf canopy presents a +challenge. Labeling several thousands of very dense olive grove images for +segmentation is a labor-intensive task. This paper presents a novel approach to +detecting olives without the need to manually label data. In this work, we +present the world's first olive detection dataset comprised of synthetic and +real olive tree images. This is accomplished by generating an auto-labeled +photorealistic 3D model of an olive tree. Its geometry is then simplified for +lightweight rendering purposes. In addition, experiments are conducted with a +mix of synthetically generated and real images, yielding an improvement of up +to 66% compared to when only using a small sample of real data. When access to +real, human-labeled data is limited, a combination of mostly synthetic data and +a small amount of real data can enhance olive detection. + +
+
+
+
+
+ + ☆ OnUVS: Online Feature Decoupling Framework for High-Fidelity Ultrasound + Video Synthesis + + +
+ Ultrasound (US) imaging is indispensable in clinical practice. To diagnose +certain diseases, sonographers must observe corresponding dynamic anatomic +structures to gather comprehensive information. However, the limited +availability of specific US video cases causes teaching difficulties in +identifying corresponding diseases, which potentially impacts the detection +rate of such cases. The synthesis of US videos may represent a promising +solution to this issue. Nevertheless, it is challenging to accurately animate +the intricate motion of dynamic anatomic structures while preserving image +fidelity. To address this, we present a novel online feature-decoupling +framework called OnUVS for high-fidelity US video synthesis. Our highlights can +be summarized by four aspects. First, we introduced anatomic information into +keypoint learning through a weakly-supervised training strategy, resulting in +improved preservation of anatomical integrity and motion while minimizing the +labeling burden. Second, to better preserve the integrity and textural +information of US images, we implemented a dual-decoder that decouples the +content and textural features in the generator. Third, we adopted a +multiple-feature discriminator to extract a comprehensive range of visual cues, +thereby enhancing the sharpness and fine details of the generated videos. +Fourth, we constrained the motion trajectories of keypoints during online +learning to enhance the fluidity of generated videos. Our validation and user +studies on in-house echocardiographic and pelvic floor US videos showed that +OnUVS synthesizes US videos with high fidelity. + +
+
+ comment: 14 pages, 13 figures and 6 tables +
+
+
+
+
+ + ☆ SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes + + +
+ Existing methods for the 4D reconstruction of general, non-rigidly deforming +objects focus on novel-view synthesis and neglect correspondences. However, +time consistency enables advanced downstream tasks like 3D editing, motion +analysis, or virtual-asset creation. We propose SceNeRFlow to reconstruct a +general, non-rigid scene in a time-consistent manner. Our dynamic-NeRF method +takes multi-view RGB videos and background images from static cameras with +known camera parameters as input. It then reconstructs the deformations of an +estimated canonical model of the geometry and appearance in an online fashion. +Since this canonical model is time-invariant, we obtain correspondences even +for long-term, long-range motions. We employ neural scene representations to +parametrize the components of our method. Like prior dynamic-NeRF methods, we +use a backwards deformation model. We find non-trivial adaptations of this +model necessary to handle larger motions: We decompose the deformations into a +strongly regularized coarse component and a weakly regularized fine component, +where the coarse component also extends the deformation field into the space +surrounding the object, which enables tracking over time. We show +experimentally that, unlike prior work that only handles small motion, our +method enables the reconstruction of studio-scale motions. + +
+
+ comment: Project page: https://vcai.mpi-inf.mpg.de/projects/scenerflow/ +
+
+
+
+
+ + ☆ MultiMediate'23: Engagement Estimation and Bodily Behaviour Recognition + in Social Interactions + + +
+
+ Automatic analysis of human behaviour is a fundamental prerequisite for
+the creation of machines that can effectively interact with and support
+humans in social interactions. In MultiMediate'23, we address two key human
+social behaviour analysis tasks for the first time in a controlled
+challenge: engagement estimation and bodily behaviour recognition in social
+interactions. This paper describes the MultiMediate'23 challenge and
+presents novel sets of annotations for both tasks. For engagement
+estimation we collected novel annotations on the NOvice eXpert Interaction
+(NOXI) database. For bodily behaviour recognition, we annotated test
+recordings of the MPIIGroupInteraction corpus with the BBSI annotation
+scheme. In addition, we present baseline results for both challenge tasks.
+
+
+ comment: ACM MultiMedia'23 +
+
+
+
+
+ + ☆ Contrastive Learning for Lane Detection via cross-similarity + + +
+ Detecting road lanes is challenging due to intricate markings vulnerable to +unfavorable conditions. Lane markings have strong shape priors, but their +visibility is easily compromised. Factors like lighting, weather, vehicles, +pedestrians, and aging colors challenge the detection. A large amount of data +is required to train a lane detection approach that can withstand natural +variations caused by low visibility. This is because there are numerous lane +shapes and natural variations that exist. Our solution, Contrastive Learning +for Lane Detection via cross-similarity (CLLD), is a self-supervised learning +method that tackles this challenge by enhancing lane detection models +resilience to real-world conditions that cause lane low visibility. CLLD is a +novel multitask contrastive learning that trains lane detection approaches to +detect lane markings even in low visible situations by integrating local +feature contrastive learning (CL) with our new proposed operation +cross-similarity. Local feature CL focuses on extracting features for small +image parts, which is necessary to localize lane segments, while +cross-similarity captures global features to detect obscured lane segments +using their surrounding. We enhance cross-similarity by randomly masking parts +of input images for augmentation. Evaluated on benchmark datasets, CLLD +outperforms state-of-the-art contrastive learning, especially in +visibility-impairing conditions like shadows. Compared to supervised learning, +CLLD excels in scenarios like shadows and crowded scenes. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ DDF-HO: Hand-Held Object Reconstruction via Conditional Directed + Distance Field + + +
+ Reconstructing hand-held objects from a single RGB image is an important and +challenging problem. Existing works utilizing Signed Distance Fields (SDF) +reveal limitations in comprehensively capturing the complex hand-object +interactions, since SDF is only reliable within the proximity of the target, +and hence, infeasible to simultaneously encode local hand and object cues. To +address this issue, we propose DDF-HO, a novel approach leveraging Directed +Distance Field (DDF) as the shape representation. Unlike SDF, DDF maps a ray in +3D space, consisting of an origin and a direction, to corresponding DDF values, +including a binary visibility signal determining whether the ray intersects the +objects and a distance value measuring the distance from origin to target in +the given direction. We randomly sample multiple rays and collect local to +global geometric features for them by introducing a novel 2D ray-based feature +aggregation scheme and a 3D intersection-aware hand pose embedding, combining +2D-3D features to model hand-object interactions. Extensive experiments on +synthetic and real-world datasets demonstrate that DDF-HO consistently +outperforms all baseline methods by a large margin, especially under Chamfer +Distance, with about 80% leap forward. Codes and trained models will be +released soon. + +
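+
+ To make the Directed Distance Field representation concrete, the toy
+example below evaluates an analytic DDF for a sphere: a ray, given as an
+origin and a direction, is mapped to a binary visibility flag and the
+distance to the first intersection. DDF-HO learns this mapping from data
+for hand-object geometry; the closed-form sphere is only an illustration.
+
+import numpy as np
+
+def sphere_ddf(origin, direction, center=np.zeros(3), radius=1.0):
+    """Analytic DDF of a sphere: (visibility, distance) for a single ray."""
+    d = direction / np.linalg.norm(direction)
+    oc = origin - center
+    b = np.dot(oc, d)
+    c = np.dot(oc, oc) - radius ** 2
+    disc = b * b - c                 # quarter discriminant of |oc + t d| = r
+    if disc < 0:
+        return 0, np.inf             # the ray misses the sphere
+    t = -b - np.sqrt(disc)           # nearest intersection along the ray
+    if t < 0:
+        t = -b + np.sqrt(disc)       # origin lies inside the sphere
+    return (1, float(t)) if t >= 0 else (0, np.inf)
+
+print(sphere_ddf(np.array([0., 0., -3.]), np.array([0., 0., 1.])))  # (1, 2.0)
+print(sphere_ddf(np.array([0., 2., -3.]), np.array([0., 0., 1.])))  # (0, inf)
+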
+
+
+
+
+ + ☆ Inherent Redundancy in Spiking Neural Networks ICCV2023 + + +
+
+ Spiking Neural Networks (SNNs) are well known as a promising
+energy-efficient alternative to conventional artificial neural networks.
+Subject to the preconceived impression that SNNs are sparse-firing, the
+analysis and optimization of inherent redundancy in SNNs have been largely
+overlooked, and thus the potential advantages of spike-based neuromorphic
+computing in accuracy and energy efficiency are compromised. In this work,
+we pose and focus on three key questions regarding the inherent redundancy
+in SNNs. We argue that the redundancy is induced by the spatio-temporal
+invariance of SNNs, which enhances the efficiency of parameter utilization
+but also introduces many noise spikes. Further, we analyze the effect of
+spatio-temporal invariance on the spatio-temporal dynamics and spike firing
+of SNNs. Then, motivated by these analyses, we propose an Advance Spatial
+Attention (ASA) module to harness SNNs' redundancy, which can adaptively
+optimize their membrane potential distribution with a pair of individual
+spatial attention sub-modules. In this way, noise spike features are
+accurately regulated. Experimental results demonstrate that the proposed
+method can significantly reduce spike firing while achieving better
+performance than state-of-the-art SNN baselines. Our code is available at
+\url{https://github.com/BICLab/ASA-SNN}.
+
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ How To Overcome Confirmation Bias in Semi-Supervised Image + Classification By Active Learning ECML + + +
+ Do we need active learning? The rise of strong deep semi-supervised methods +raises doubt about the usability of active learning in limited labeled data +settings. This is caused by results showing that combining semi-supervised +learning (SSL) methods with a random selection for labeling can outperform +existing active learning (AL) techniques. However, these results are obtained +from experiments on well-established benchmark datasets that can overestimate +the external validity. However, the literature lacks sufficient research on the +performance of active semi-supervised learning methods in realistic data +scenarios, leaving a notable gap in our understanding. Therefore we present +three data challenges common in real-world applications: between-class +imbalance, within-class imbalance, and between-class similarity. These +challenges can hurt SSL performance due to confirmation bias. We conduct +experiments with SSL and AL on simulated data challenges and find that random +sampling does not mitigate confirmation bias and, in some cases, leads to worse +performance than supervised learning. In contrast, we demonstrate that AL can +overcome confirmation bias in SSL in these realistic settings. Our results +provide insights into the potential of combining active and semi-supervised +learning in the presence of common real-world challenges, which is a promising +direction for robust methods when learning with limited labeled data in +real-world applications. + +
+
+ comment: Accepted @ ECML PKDD 2023. This is the author's version of the work. + The definitive Version of Record will be published in the Proceedings of ECML + PKDD 2023 +
+
+
+
+
+ + ☆ Low-Light Image Enhancement with Illumination-Aware Gamma Correction and + Complete Image Modelling Network ICCV 2023 + + +
+ This paper presents a novel network structure with illumination-aware gamma +correction and complete image modelling to solve the low-light image +enhancement problem. Low-light environments usually lead to less informative +large-scale dark areas, directly learning deep representations from low-light +images is insensitive to recovering normal illumination. We propose to +integrate the effectiveness of gamma correction with the strong modelling +capacities of deep networks, which enables the correction factor gamma to be +learned in a coarse to elaborate manner via adaptively perceiving the deviated +illumination. Because exponential operation introduces high computational +complexity, we propose to use Taylor Series to approximate gamma correction, +accelerating the training and inference speed. Dark areas usually occupy large +scales in low-light images, common local modelling structures, e.g., CNN, +SwinIR, are thus insufficient to recover accurate illumination across whole +low-light images. We propose a novel Transformer block to completely simulate +the dependencies of all pixels across images via a local-to-global hierarchical +attention mechanism, so that dark areas could be inferred by borrowing the +information from far informative regions in a highly effective manner. +Extensive experiments on several benchmark datasets demonstrate that our +approach outperforms state-of-the-art methods. + +
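+
+ The Taylor-series shortcut mentioned above can be checked numerically:
+since $x^{\gamma}=e^{\gamma\ln x}$, a truncated expansion of the
+exponential approximates gamma correction without evaluating the power
+operation. The snippet below is our own illustration of that idea, with an
+arbitrary truncation order; it is not the paper's network code.
+
+import numpy as np
+
+def gamma_taylor(x, gamma, order=8):
+    """Approximate x**gamma via the Taylor expansion of exp(gamma * ln x)."""
+    z = gamma * np.log(np.clip(x, 1e-6, 1.0))  # avoid log(0) on dark pixels
+    out = np.ones_like(x)
+    term = np.ones_like(x)
+    for k in range(1, order + 1):
+        term = term * z / k                    # accumulates z**k / k!
+        out = out + term
+    return out
+
+x = np.linspace(0.01, 1.0, 5)                  # normalized intensities
+gamma = 0.45                                   # brightening correction
+print(np.abs(gamma_taylor(x, gamma) - x ** gamma).max())  # small error
+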
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ MEDOE: A Multi-Expert Decoder and Output Ensemble Framework for + Long-tailed Semantic Segmentation + + +
+
+ The long-tailed distribution of semantic categories, which has often been
+ignored in conventional methods, causes unsatisfactory performance in
+semantic segmentation on tail categories. In this paper, we focus on the
+problem of long-tailed semantic segmentation. Although some long-tailed
+recognition methods (e.g., re-sampling/re-weighting) have been proposed for
+other problems, they are likely to compromise crucial contextual
+information and are thus hardly adaptable to long-tailed semantic
+segmentation. To address this issue, we propose MEDOE, a novel framework
+for long-tailed semantic segmentation via contextual information
+ensemble-and-grouping. The proposed two-stage framework comprises a
+multi-expert decoder (MED) and a multi-expert output ensemble (MOE).
+Specifically, the MED includes several "experts". Based on the pixel
+frequency distribution, each expert takes the dataset masked according to
+the specific categories as input and generates contextual information
+self-adaptively for classification; the MOE adopts learnable decision
+weights for the ensemble of the experts' outputs. As a model-agnostic
+framework, our MEDOE can be flexibly and efficiently coupled with various
+popular deep neural networks (e.g., DeepLabv3+, OCRNet, and PSPNet) to
+improve their performance in long-tailed semantic segmentation.
+Experimental results show that the proposed framework outperforms current
+methods on both the Cityscapes and ADE20K datasets by up to 1.78% in mIoU
+and 5.89% in mAcc.
+
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ Neural Spherical Harmonics for structurally coherent continuous + representation of diffusion MRI signal MICCAI 2023 + + +
+ We present a novel way to model diffusion magnetic resonance imaging (dMRI) +datasets, that benefits from the structural coherence of the human brain while +only using data from a single subject. Current methods model the dMRI signal in +individual voxels, disregarding the intervoxel coherence that is present. We +use a neural network to parameterize a spherical harmonics series (NeSH) to +represent the dMRI signal of a single subject from the Human Connectome Project +dataset, continuous in both the angular and spatial domain. The reconstructed +dMRI signal using this method shows a more structurally coherent representation +of the data. Noise in gradient images is removed and the fiber orientation +distribution functions show a smooth change in direction along a fiber tract. +We showcase how the reconstruction can be used to calculate mean diffusivity, +fractional anisotropy, and total apparent fiber density. These results can be +achieved with a single model architecture, tuning only one hyperparameter. In +this paper we also demonstrate how upsampling in both the angular and spatial +domain yields reconstructions that are on par or better than existing methods. + +
+
+ comment: 12 pages, 6 figures, accepted for cdMRI workshop at MICCAI 2023 +
+
+
+
+
+ + ☆ Explainable Multi-View Deep Networks Methodology for Experimental + Physics + + +
+ Physical experiments often involve multiple imaging representations, such as +X-ray scans and microscopic images. Deep learning models have been widely used +for supervised analysis in these experiments. Combining different image +representations is frequently required to analyze and make a decision properly. +Consequently, multi-view data has emerged - datasets where each sample is +described by views from different angles, sources, or modalities. These +problems are addressed with the concept of multi-view learning. Understanding +the decision-making process of deep learning models is essential for reliable +and credible analysis. Hence, many explainability methods have been devised +recently. Nonetheless, there is a lack of proper explainability in multi-view +models, which are challenging to explain due to their architectures. In this +paper, we suggest different multi-view architectures for the vision domain, +each suited to another problem, and we also present a methodology for +explaining these models. To demonstrate the effectiveness of our methodology, +we focus on the domain of High Energy Density Physics (HEDP) experiments, where +multiple imaging representations are used to assess the quality of foam +samples. We apply our methodology to classify the foam samples quality using +the suggested multi-view architectures. Through experimental results, we +showcase the improvement of accurate architecture choice on both accuracy - 78% +to 84% and AUC - 83% to 93% and present a trade-off between performance and +explainability. Specifically, we demonstrate that our approach enables the +explanation of individual one-view models, providing insights into the +decision-making process of each view. This understanding enhances the +interpretability of the overall multi-view model. The sources of this work are +available at: +https://github.com/Scientific-Computing-Lab-NRCN/Multi-View-Explainability. + +
+
+
+
+
+ + ☆ Self-Reference Deep Adaptive Curve Estimation for Low-Light Image + Enhancement + + +
+ In this paper, we propose a 2-stage low-light image enhancement method called +Self-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage, +we present an intuitive, lightweight, fast, and unsupervised luminance +enhancement algorithm. The algorithm is based on a novel low-light enhancement +curve that can be used to locally boost image brightness. We also propose a new +loss function with a simplified physical model designed to preserve natural +images' color, structure, and fidelity. We use a vanilla CNN to map each pixel +through deep Adaptive Adjustment Curves (AAC) while preserving the local image +structure. Secondly, we introduce the corresponding denoising scheme to remove +the latent noise in the darkness. We approximately model the noise in the dark +and deploy a Denoising-Net to estimate and remove the noise after the first +stage. Exhaustive qualitative and quantitative analysis shows that our method +outperforms existing state-of-the-art algorithms on multiple real-world +datasets. + +
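+
+ The curve-based first stage can be pictured with the quadratic pixel-wise
+adjustment curve popularized by Zero-DCE, applied iteratively with a
+per-pixel coefficient map. The exact curve and loss in Self-DACE differ, so
+the snippet below is only a schematic of curve-based brightening with
+made-up inputs.
+
+import numpy as np
+
+def apply_curve(x, alpha, iterations=4):
+    """Iteratively apply the quadratic curve LE(x) = x + alpha * x * (1 - x).
+    x and alpha share one shape; outputs stay in [0, 1] for alpha in [-1, 1]."""
+    for _ in range(iterations):
+        x = x + alpha * x * (1.0 - x)
+    return x
+
+rng = np.random.default_rng(0)
+low_light = rng.uniform(0.0, 0.2, size=(4, 4))  # a dark toy "image"
+alpha = np.full_like(low_light, 0.8)            # per-pixel curve parameters
+enhanced = apply_curve(low_light, alpha)
+print(low_light.mean(), enhanced.mean())        # brightness increases
+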
+
+
+
+
+ + ☆ Automatic Vision-Based Parking Slot Detection and Occupancy + Classification + + +
+ Parking guidance information (PGI) systems are used to provide information to +drivers about the nearest parking lots and the number of vacant parking slots. +Recently, vision-based solutions started to appear as a cost-effective +alternative to standard PGI systems based on hardware sensors mounted on each +parking slot. Vision-based systems provide information about parking occupancy +based on images taken by a camera that is recording a parking lot. However, +such systems are challenging to develop due to various possible viewpoints, +weather conditions, and object occlusions. Most notably, they require manual +labeling of parking slot locations in the input image which is sensitive to +camera angle change, replacement, or maintenance. In this paper, the algorithm +that performs Automatic Parking Slot Detection and Occupancy Classification +(APSD-OC) solely on input images is proposed. Automatic parking slot detection +is based on vehicle detections in a series of parking lot images upon which +clustering is applied in bird's eye view to detect parking slots. Once the +parking slots positions are determined in the input image, each detected +parking slot is classified as occupied or vacant using a specifically trained +ResNet34 deep classifier. The proposed approach is extensively evaluated on +well-known publicly available datasets (PKLot and CNRPark+EXT), showing high +efficiency in parking slot detection and robustness to the presence of illegal +parking or passing vehicles. Trained classifier achieves high accuracy in +parking slot occupancy classification. + +
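+
+ The slot-discovery step, clustering accumulated vehicle detections in the
+bird's-eye-view plane, can be sketched with an off-the-shelf density-based
+clustering such as DBSCAN. The coordinates below are fabricated and DBSCAN
+is only a stand-in for the paper's clustering stage, so treat this as an
+assumption-laden illustration rather than the APSD-OC pipeline.
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+# Bird's-eye-view centroids of vehicle detections accumulated over frames:
+# parked cars re-occupy the same slots, so points pile up near slot centers.
+rng = np.random.default_rng(0)
+slot_centers = np.array([[2.0, 5.0], [4.5, 5.0], [7.0, 5.0]])
+detections = np.vstack(
+    [c + rng.normal(scale=0.15, size=(40, 2)) for c in slot_centers])
+
+labels = DBSCAN(eps=0.5, min_samples=10).fit_predict(detections)
+slots = [detections[labels == k].mean(axis=0)
+         for k in sorted(set(labels)) if k != -1]
+print(np.round(slots, 2))   # recovered slot centers near the three seeds
+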
+
+ comment: 39 pages, 8 figures, 9 tables +
+
+
+
+
+ + ☆ Unsupervised Domain Adaptive Detection with Network Stability Analysis + + +
+
+ Domain adaptive detection aims to improve the generality of a detector,
+learned from the labeled source domain, on the unlabeled target domain. In
+this work, drawing inspiration from the concept of stability in control
+theory, namely that a robust system must remain consistent both externally
+and internally regardless of disturbances, we propose a novel framework
+that achieves unsupervised domain adaptive detection through stability
+analysis. Specifically, we treat discrepancies between images and regions
+from different domains as disturbances, and introduce a novel, simple but
+effective Network Stability Analysis (NSA) framework that considers various
+disturbances for domain adaptation. In particular, we explore three types
+of perturbations, including heavy and light image-level disturbances and
+instance-level disturbance. For each type, NSA performs external
+consistency analysis on the outputs from raw and perturbed images and/or
+internal consistency analysis on their features, using teacher-student
+models. By integrating NSA into Faster R-CNN, we immediately achieve
+state-of-the-art results. In particular, we set a new record of 52.7% mAP
+on Cityscapes-to-FoggyCityscapes, showing the potential of NSA for domain
+adaptive detection. It is worth noting that our NSA is designed for general
+purpose and is thus applicable to one-stage detection models (e.g., FCOS)
+besides the adopted one, as shown by experiments.
+https://github.com/tiankongzhang/NSA.
+
+
+
+
+
+ + ☆ AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for + Image Denoising, Semantic Segmentation, and Radiomics Evaluation + + +
+
+ Methods: In this study, a benchmark \emph{Abdominal Adipose Tissue CT
+Image Dataset} (AATTCT-IDS) containing 300 subjects is prepared and
+published. AATTCT-IDS releases 13,732 raw CT slices, and the researchers
+individually annotate the subcutaneous and visceral adipose tissue regions
+of 3,213 of those slices that have the same slice distance to validate
+denoising methods, train semantic segmentation models, and study radiomics.
+For different tasks, this paper compares and analyzes the performance of
+various methods on AATTCT-IDS by combining the visualization results and
+evaluation data, thereby verifying the research potential of this dataset
+for the above three types of tasks.
+ Results: In the comparative study of image denoising, algorithms using a
+smoothing strategy suppress mixed noise at the expense of image details and
+obtain better evaluation data. Methods such as BM3D preserve the original
+image structure better, although their evaluation data are slightly lower.
+The results show significant differences among them. In the comparative
+study of semantic segmentation of abdominal adipose tissue, the
+segmentation results of adipose tissue by each model show different
+structural characteristics. Among them, BiSeNet obtains segmentation
+results only slightly inferior to U-Net with the shortest training time and
+effectively separates small and isolated adipose tissue. In addition, the
+radiomics study based on AATTCT-IDS reveals three adipose distributions in
+the subject population.
+ Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions
+in abdominal CT slices. This open-source dataset can attract researchers to
+explore the multi-dimensional characteristics of abdominal adipose tissue
+and thus help physicians and patients in clinical practice. AATTCT-IDS is
+freely published for non-commercial purposes at:
+\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}.
+
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ Interpretability Benchmark for Evaluating Spatial Misalignment of + Prototypical Parts Explanations + + +
+ Prototypical parts-based networks are becoming increasingly popular due to +their faithful self-explanations. However, their similarity maps are calculated +in the penultimate network layer. Therefore, the receptive field of the +prototype activation region often depends on parts of the image outside this +region, which can lead to misleading interpretations. We name this undesired +behavior a spatial explanation misalignment and introduce an interpretability +benchmark with a set of dedicated metrics for quantifying this phenomenon. In +addition, we propose a method for misalignment compensation and apply it to +existing state-of-the-art models. We show the expressiveness of our benchmark +and the effectiveness of the proposed compensation methodology through +extensive empirical studies. + +
+

 comment: Under review. Code will be released upon acceptance
+
+
+
+
+ + ☆ Learning to Generate Semantic Layouts for Higher Text-Image + Correspondence in Text-to-Image Synthesis ICCV 2023 + + +
+ Existing text-to-image generation approaches have set high standards for +photorealism and text-image correspondence, largely benefiting from web-scale +text-image datasets, which can include up to 5~billion pairs. However, +text-to-image generation models trained on domain-specific datasets, such as +urban scenes, medical images, and faces, still suffer from low text-image +correspondence due to the lack of text-image pairs. Additionally, collecting +billions of text-image pairs for a specific domain can be time-consuming and +costly. Thus, ensuring high text-image correspondence without relying on +web-scale text-image datasets remains a challenging task. In this paper, we +present a novel approach for enhancing text-image correspondence by leveraging +available semantic layouts. Specifically, we propose a Gaussian-categorical +diffusion process that simultaneously generates both images and corresponding +layout pairs. Our experiments reveal that we can guide text-to-image generation +models to be aware of the semantics of different image regions, by training the +model to generate semantic labels for each pixel. We demonstrate that our +approach achieves higher text-image correspondence compared to existing +text-to-image generation approaches in the Multi-Modal CelebA-HQ and the +Cityscapes dataset, where text-image pairs are scarce. Codes are available in +this https://pmh9960.github.io/research/GCDP + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Conditional Perceptual Quality Preserving Image Compression + + +
+
+ We propose conditional perceptual quality, an extension of the perceptual
+quality defined in \citet{blau2018perception}, obtained by conditioning it
+on user-defined information. Specifically, we extend the original
+perceptual quality $d(p_{X},p_{\hat{X}})$ to the conditional perceptual
+quality $d(p_{X|Y},p_{\hat{X}|Y})$, where $X$ is the original image,
+$\hat{X}$ is the reconstructed image, $Y$ is side information defined by
+the user, and $d(.,.)$ is a divergence. We show that conditional perceptual
+quality has theoretical properties similar to the rate-distortion-perception
+trade-off \citep{blau2019rethinking}. Based on these theoretical results,
+we propose an optimal framework for conditional perceptual quality
+preserving compression. Experimental results show that our codec
+successfully maintains high perceptual quality and semantic quality at all
+bitrates. Besides, by providing a lower bound on the required common
+randomness, we settle the previous arguments on whether randomness should
+be incorporated into the generator for (conditional) perceptual quality
+compression. The source code is provided in the supplementary material.
+
+
+
+
+
+ + ☆ SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech + Separation + + +
+ The integration of different modalities, such as audio and visual +information, plays a crucial role in human perception of the surrounding +environment. Recent research has made significant progress in designing fusion +modules for audio-visual speech separation. However, they predominantly focus +on multi-modal fusion architectures situated either at the top or bottom +positions, rather than comprehensively considering multi-modal fusion at +various hierarchical positions within the network. In this paper, we propose a +novel model called self- and cross-attention network (SCANet), which leverages +the attention mechanism for efficient audio-visual feature fusion. SCANet +consists of two types of attention blocks: self-attention (SA) and +cross-attention (CA) blocks, where the CA blocks are distributed at the top +(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the +ability to learn modality-specific features and enable the extraction of +different semantics from audio-visual features. Comprehensive experiments on +three standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2) +demonstrate the effectiveness of SCANet, outperforming existing +state-of-the-art (SOTA) methods while maintaining comparable inference time. + +
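+
+ The cross-attention (CA) blocks follow the standard scaled dot-product
+attention pattern, with queries taken from one modality and keys/values
+from the other. The minimal numpy version below shows only that pattern;
+the actual SCANet blocks add learned projections, normalization, and
+residual wiring, and the feature shapes here are invented.
+
+import numpy as np
+
+def cross_attention(queries, keys, values):
+    """Scaled dot-product attention: one modality attends to the other."""
+    d = queries.shape[-1]
+    scores = queries @ keys.T / np.sqrt(d)          # (Tq, Tk) similarities
+    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
+    weights = weights / weights.sum(axis=-1, keepdims=True)  # softmax
+    return weights @ values                         # (Tq, d) fused features
+
+rng = np.random.default_rng(0)
+audio = rng.normal(size=(50, 64))    # 50 audio frames, 64-dim features
+visual = rng.normal(size=(25, 64))   # 25 video frames, 64-dim features
+fused = cross_attention(audio, visual, visual)      # audio queries visual
+print(fused.shape)                                  # (50, 64)
+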
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ S2R: Exploring a Double-Win Transformer-Based Framework for Ideal and + Blind Super-Resolution + + +
+
+ Nowadays, deep learning based methods have demonstrated impressive
+performance on ideal super-resolution (SR) datasets, but most of these
+methods incur dramatic performance drops when directly applied to
+real-world SR reconstruction tasks with unpredictable blur kernels. To
+tackle this issue, blind SR methods have been proposed to improve the
+visual results on random blur kernels, but they similarly produce
+unsatisfactory reconstructions on ideal low-resolution images. In this
+paper, we propose a double-win framework for ideal and blind SR tasks,
+named S2R, including a light-weight transformer-based SR model (S2R
+transformer) and a novel coarse-to-fine training strategy, which can
+achieve excellent visual results under both ideal and random fuzzy
+conditions. On the algorithm level, the S2R transformer smartly combines
+some efficient and light-weight blocks to enhance the representation
+ability of extracted features with a relatively low number of parameters.
+For the training strategy, a coarse-level learning process is first
+performed to improve the generalization of the network with the help of a
+large-scale external dataset, and then a fast fine-tuning process is
+developed to transfer the pre-trained model to real-world SR tasks by
+mining the internal features of the image. Experimental results show that
+the proposed S2R outperforms other single-image SR models in the ideal SR
+condition with only 578K parameters. Meanwhile, it can achieve better
+visual results than regular blind SR models in blind fuzzy conditions with
+only 10 gradient updates, which improves convergence speed by 300 times and
+significantly accelerates the transfer-learning process in real-world
+situations.
+
+
+
+
+
+ + ☆ GPA-3D: Geometry-aware Prototype Alignment for Unsupervised Domain + Adaptive 3D Object Detection from Point Clouds ICCV 2023 + + +
+ LiDAR-based 3D detection has made great progress in recent years. However, +the performance of 3D detectors is considerably limited when deployed in unseen +environments, owing to the severe domain gap problem. Existing domain adaptive +3D detection methods do not adequately consider the problem of the +distributional discrepancy in feature space, thereby hindering generalization +of detectors across domains. In this work, we propose a novel unsupervised +domain adaptive \textbf{3D} detection framework, namely \textbf{G}eometry-aware +\textbf{P}rototype \textbf{A}lignment (\textbf{GPA-3D}), which explicitly +leverages the intrinsic geometric relationship from point cloud objects to +reduce the feature discrepancy, thus facilitating cross-domain transferring. +Specifically, GPA-3D assigns a series of tailored and learnable prototypes to +point cloud objects with distinct geometric structures. Each prototype aligns +BEV (bird's-eye-view) features derived from corresponding point cloud objects +on source and target domains, reducing the distributional discrepancy and +achieving better adaptation. The evaluation results obtained on various +benchmarks, including Waymo, nuScenes and KITTI, demonstrate the superiority of +our GPA-3D over the state-of-the-art approaches for different adaptation +scenarios. The MindSpore version code will be publicly available at +\url{https://github.com/Liz66666/GPA3D}. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ SYENet: A Simple Yet Effective Network for Multiple Low-Level Vision + Tasks with Real-time Performance on Mobile Device + + +
+ With the rapid development of AI hardware accelerators, applying deep learning-based algorithms to solve various low-level vision tasks on mobile devices has gradually become possible. However, two main problems still need to be solved: task-specific algorithms make it difficult to integrate them into a single neural network architecture, and large numbers of parameters make it difficult to achieve real-time inference. To tackle these problems, we propose a novel network, SYENet, with only ~6K parameters, to handle multiple low-level vision tasks on mobile devices in a real-time manner. The SYENet consists of two asymmetrical branches with simple building blocks. To effectively connect the results of the asymmetrical branches, a Quadratic Connection Unit (QCU) is proposed. Furthermore, to improve performance, a new Outlier-Aware Loss is proposed to process the image. The proposed method proves its superior performance with the best PSNR compared with other networks in real-time applications such as Image Signal Processing (ISP), Low-Light Enhancement (LLE), and Super-Resolution (SR), with 2K@60FPS throughput on the Qualcomm 8 Gen 1 mobile SoC (System-on-Chip). Particularly, for the ISP task, SYENet achieved the highest score in the MAI 2022 Learned Smartphone ISP challenge. + +
+
+
+
+
+ + ☆ Ranking-aware Uncertainty for Text-guided Image Retrieval + + +
+ Text-guided image retrieval incorporates conditional text to better capture users' intent. Traditionally, existing methods focus on minimizing the embedding distance between the source inputs and the target image, using the provided triplets $\langle$source image, source text, target image$\rangle$. However, such triplet optimization may limit the learned retrieval model's ability to capture more detailed ranking information; e.g., the triplets are one-to-one correspondences and fail to account for many-to-many correspondences arising from semantic diversity in feedback language and images. To capture more ranking information, we propose a novel ranking-aware uncertainty approach to model many-to-many correspondences using only the provided triplets. We introduce uncertainty learning to learn the stochastic ranking list of features. Specifically, our approach comprises three components: (1) In-sample uncertainty, which aims to capture semantic diversity using a Gaussian distribution derived from both combined and target features; (2) Cross-sample uncertainty, which further mines ranking information from other samples' distributions; and (3) Distribution regularization, which aligns the distributional representations of the source inputs and the target image. Compared to existing state-of-the-art methods, our proposed method achieves significant improvements on two public datasets for composed image retrieval. + +
+
+
+
+
+ + ☆ OmniZoomer: Learning to Move and Zoom in on Sphere at High-Resolution ICCV 2023 + + +
+ Omnidirectional images (ODIs) have become increasingly popular, as their large field-of-view (FoV) can offer viewers the chance to freely choose the view direction in immersive environments such as virtual reality. The M\"obius transformation is typically employed to further provide the opportunity for movement and zoom on ODIs, but applying it at the image level often results in blurry effects and aliasing problems. In this paper, we propose a novel deep learning-based approach, called \textbf{OmniZoomer}, to incorporate the M\"obius transformation into the network for movement and zoom on ODIs. By learning various transformed feature maps under different conditions, the network is enhanced to handle the increasing edge curvatures, which alleviates the blurry effect. Moreover, to address the aliasing problem, we propose two key components. First, to compensate for the lack of pixels for describing curves, we enhance the feature maps in the high-resolution (HR) space and calculate the transformed index map with a spatial index generation module. Second, considering that ODIs are inherently represented in spherical space, we propose a spherical resampling module that combines the index map and HR feature maps to transform the feature maps for better spherical correlation. The transformed feature maps are decoded to output a zoomed ODI. Experiments show that our method can produce HR and high-quality ODIs with the flexibility to move and zoom in on the object of interest. Project page is available at http://vlislab22.github.io/OmniZoomer/. + +
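For readers unfamiliar with Möbius transformations on the sphere, the following numpy sketch shows how such a transform can move and zoom spherical (equirectangular) coordinates via stereographic projection. It illustrates the generic coordinate warp the abstract refers to; it is not the OmniZoomer network, and the parameterization is an assumption.

```python
import numpy as np

def mobius_on_sphere(lon, lat, a=1.5, b=0, c=0, d=1):
    """Zoom/move on the sphere: project to the complex plane, apply
    f(z) = (a z + b) / (c z + d), and project back."""
    # longitude/latitude (radians) -> unit sphere
    X = np.cos(lat) * np.cos(lon)
    Y = np.cos(lat) * np.sin(lon)
    Z = np.sin(lat)
    # stereographic projection from the north pole onto the complex plane
    z = (X + 1j * Y) / (1.0 - Z + 1e-9)
    w = (a * z + b) / (c * z + d)
    # inverse stereographic projection back to the sphere
    r2 = np.abs(w) ** 2
    Xn = 2 * w.real / (1 + r2)
    Yn = 2 * w.imag / (1 + r2)
    Zn = (r2 - 1) / (r2 + 1)
    return np.arctan2(Yn, Xn), np.arcsin(np.clip(Zn, -1, 1))  # new lon, lat

# a > 1 with b = c = 0 zooms in around one pole of the sphere
lon, lat = np.meshgrid(np.linspace(-np.pi, np.pi, 8), np.linspace(-np.pi / 2, np.pi / 2, 4))
new_lon, new_lat = mobius_on_sphere(lon, lat, a=2.0)
```

Resampling an image through such a warp at the pixel level is where the blur and aliasing discussed above come from, which motivates doing the transformation on feature maps instead.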
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ View Consistent Purification for Accurate Cross-View Localization ICCV 2023 + + +
+ This paper proposes a fine-grained self-localization method for outdoor +robotics that utilizes a flexible number of onboard cameras and readily +accessible satellite images. The proposed method addresses limitations in +existing cross-view localization methods that struggle to handle noise sources +such as moving objects and seasonal variations. It is the first sparse +visual-only method that enhances perception in dynamic environments by +detecting view-consistent key points and their corresponding deep features from +ground and satellite views, while removing off-the-ground objects and +establishing homography transformation between the two views. Moreover, the +proposed method incorporates a spatial embedding approach that leverages camera +intrinsic and extrinsic information to reduce the ambiguity of purely visual +matching, leading to improved feature matching and overall pose estimation +accuracy. The method exhibits strong generalization and is robust to +environmental changes, requiring only geo-poses as ground truth. Extensive +experiments on the KITTI and Ford Multi-AV Seasonal datasets demonstrate that +our proposed method outperforms existing state-of-the-art methods, achieving +median spatial accuracy errors below $0.5$ meters along the lateral and +longitudinal directions, and a median orientation accuracy error below 2 +degrees. + +
+
+ comment: Accepted for ICCV 2023 +
+
+
+
+
+ + ☆ Snapshot High Dynamic Range Imaging with a Polarization Camera + + +
+ High dynamic range (HDR) images are important for a range of tasks, from +navigation to consumer photography. Accordingly, a host of specialized HDR +sensors have been developed, the most successful of which are based on +capturing variable per-pixel exposures. In essence, these methods capture an +entire exposure bracket sequence at once in a single shot. This paper presents +a straightforward but highly effective approach for turning an off-the-shelf +polarization camera into a high-performance HDR camera. By placing a linear +polarizer in front of the polarization camera, we are able to simultaneously +capture four images with varied exposures, which are determined by the +orientation of the polarizer. We develop an outlier-robust and self-calibrating +algorithm to reconstruct an HDR image (at a single polarity) from these +measurements. Finally, we demonstrate the efficacy of our approach with +extensive real-world experiments. + +
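As a rough illustration of why a linear polarizer in front of a polarization camera yields a single-shot exposure bracket, the sketch below applies Malus's law to the four on-sensor analyzer angles and performs a naive weighted HDR merge. It is a toy example under assumed angles and weights, not the paper's outlier-robust, self-calibrating reconstruction.

```python
import numpy as np

analyzer_angles = np.deg2rad([0, 45, 90, 135])   # on-sensor micro-polarizer orientations
polarizer_angle = np.deg2rad(20)                  # external linear polarizer (assumed setting)
exposures = np.cos(analyzer_angles - polarizer_angle) ** 2  # Malus's law: relative exposure per channel
print(exposures)  # four distinct attenuation factors captured in one shot

def naive_hdr_merge(images, exposures, eps=1e-6):
    """images: list of linear-intensity images captured at the four effective exposures."""
    num, den = 0.0, 0.0
    for img, t in zip(images, exposures):
        w = np.clip(1.0 - np.abs(img - 0.5) * 2.0, 0.0, 1.0)  # downweight near-saturated / dark pixels
        num += w * img / max(t, eps)                           # bring each exposure to a common radiance scale
        den += w
    return num / np.maximum(den, eps)

imgs = [np.clip(np.random.rand(4, 4) * t, 0, 1) for t in exposures]  # toy data
hdr = naive_hdr_merge(imgs, exposures)
```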
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ DragNUWA: Fine-grained Control in Video Generation by Integrating Text, + Image, and Trajectory + + +
+ Controllable video generation has gained significant attention in recent years. However, two main limitations persist: firstly, most existing works focus on either text-, image-, or trajectory-based control, leading to an inability to achieve fine-grained control in videos; secondly, trajectory control research is still in its early stages, with most experiments being conducted on simple datasets like Human3.6M. This constraint limits the models' capability to process open-domain images and effectively handle complex curved trajectories. In this paper, we propose DragNUWA, an open-domain diffusion-based video generation model. To tackle the issue of insufficient control granularity in existing works, we simultaneously introduce text, image, and trajectory information to provide fine-grained control over video content from semantic, spatial, and temporal perspectives. To resolve the problem of limited open-domain trajectory control in current research, we propose trajectory modeling with three aspects: a Trajectory Sampler (TS) to enable open-domain control of arbitrary trajectories, a Multiscale Fusion (MF) to control trajectories at different granularities, and an Adaptive Training (AT) strategy to generate consistent videos following trajectories. Our experiments validate the effectiveness of DragNUWA, demonstrating its superior performance in fine-grained control in video generation. The homepage link is \url{https://www.microsoft.com/en-us/research/project/dragnuwa/} + +
+
+
+
+
+ + ☆ Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme + Detection ACM MM + + +
+ Hateful meme detection is a challenging multimodal task that requires +comprehension of both vision and language, as well as cross-modal interactions. +Recent studies have tried to fine-tune pre-trained vision-language models +(PVLMs) for this task. However, with increasing model sizes, it becomes +important to leverage powerful PVLMs more efficiently, rather than simply +fine-tuning them. Recently, researchers have attempted to convert meme images +into textual captions and prompt language models for predictions. This approach +has shown good performance but suffers from non-informative image captions. +Considering the two factors mentioned above, we propose a probing-based +captioning approach to leverage PVLMs in a zero-shot visual question answering +(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful +content-related questions and use the answers as image captions (which we call +Pro-Cap), so that the captions contain information critical for hateful content +detection. The good performance of models with Pro-Cap on three benchmarks +validates the effectiveness and generalization of the proposed method. + +
+
+ comment: Camera-ready version for ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ SHERF: Generalizable Human NeRF from a Single Image ICCV2023 + + +
+ Existing Human NeRF methods for reconstructing 3D humans typically rely on +multiple 2D images from multi-view cameras or monocular videos captured from +fixed camera views. However, in real-world scenarios, human images are often +captured from random camera angles, presenting challenges for high-quality 3D +human reconstruction. In this paper, we propose SHERF, the first generalizable +Human NeRF model for recovering animatable 3D humans from a single input image. +SHERF extracts and encodes 3D human representations in canonical space, +enabling rendering and animation from free views and poses. To achieve +high-fidelity novel view and pose synthesis, the encoded 3D human +representations should capture both global appearance and local fine-grained +textures. To this end, we propose a bank of 3D-aware hierarchical features, +including global, point-level, and pixel-aligned features, to facilitate +informative encoding. Global features enhance the information extracted from +the single input image and complement the information missing from the partial +2D observation. Point-level features provide strong clues of 3D human +structure, while pixel-aligned features preserve more fine-grained details. To +effectively integrate the 3D-aware hierarchical feature bank, we design a +feature fusion transformer. Extensive experiments on THuman, RenderPeople, +ZJU_MoCap, and HuMMan datasets demonstrate that SHERF achieves state-of-the-art +performance, with better generalizability for novel view and pose synthesis. + +
+
+ comment: Accepted by ICCV2023. Project webpage: + https://skhu101.github.io/SHERF/ +
+
+
+
+
+ + ♻ ☆ Normalizing Flows for Human Pose Anomaly Detection + + +
+ Video anomaly detection is an ill-posed problem because it relies on many +parameters such as appearance, pose, camera angle, background, and more. We +distill the problem to anomaly detection of human pose, thus decreasing the +risk of nuisance parameters such as appearance affecting the result. Focusing +on pose alone also has the side benefit of reducing bias against distinct +minority groups. Our model works directly on human pose graph sequences and is +exceptionally lightweight (~1K parameters), capable of running on any machine +able to run the pose estimation with negligible additional resources. We +leverage the highly compact pose representation in a normalizing flows +framework, which we extend to tackle the unique characteristics of +spatio-temporal pose data and show its advantages in this use case. The +algorithm is quite general and can handle training data of only normal examples +as well as a supervised setting that consists of labeled normal and abnormal +examples. We report state-of-the-art results on two anomaly detection +benchmarks - the unsupervised ShanghaiTech dataset and the recent supervised +UBnormal dataset. + +
+
+
+
+
+ + ♻ ☆ DINAR: Diffusion Inpainting of Neural Textures for One-Shot Human + Avatars + + +
+ We present DINAR, an approach for creating realistic rigged full-body avatars from single RGB images. Similarly to previous works, our method uses neural textures combined with the SMPL-X body model to achieve photo-realistic quality of avatars while keeping them easy to animate and fast to infer. To restore the texture, we use a latent diffusion model and show how such a model can be trained in the neural texture space. The use of the diffusion model allows us to realistically reconstruct large unseen regions, such as the back of a person, given only the frontal view. The models in our pipeline are trained using 2D images and videos only. In the experiments, our approach achieves state-of-the-art rendering quality and good generalization to new poses and viewpoints. In particular, the approach improves the state of the art on the SnapshotPeople public benchmark. + +
+
+
+
+
+ + ♻ ☆ EndoDepthL: Lightweight Endoscopic Monocular Depth Estimation with + CNN-Transformer + + +
+ In this study, we address the key challenges concerning the accuracy and +effectiveness of depth estimation for endoscopic imaging, with a particular +emphasis on real-time inference and the impact of light reflections. We propose +a novel lightweight solution named EndoDepthL that integrates Convolutional +Neural Networks (CNN) and Transformers to predict multi-scale depth maps. Our +approach includes optimizing the network architecture, incorporating +multi-scale dilated convolution, and a multi-channel attention mechanism. We +also introduce a statistical confidence boundary mask to minimize the impact of +reflective areas. To better evaluate the performance of monocular depth +estimation in endoscopic imaging, we propose a novel complexity evaluation +metric that considers network parameter size, floating-point operations, and +inference frames per second. We comprehensively evaluate our proposed method +and compare it with existing baseline solutions. The results demonstrate that +EndoDepthL ensures depth estimation accuracy with a lightweight structure. + +
+
+
+
+
+ + ♻ ☆ Adaptive Split-Fusion Transformer + + +
+ Neural networks for visual content understanding have recently evolved from convolutional ones (CNNs) to transformers. The former (CNN) relies on small-windowed kernels to capture regional clues, demonstrating solid local expressiveness. By contrast, the latter (transformer) establishes long-range global connections between localities for holistic learning. Inspired by this complementary nature, there is a growing interest in designing hybrid models that best utilize each technique. Current hybrids merely replace convolutions as simple approximations of linear projection or juxtapose a convolution branch with attention, without considering the importance of local/global modeling. To tackle this, we propose a new hybrid named Adaptive Split-Fusion Transformer (ASF-former) that treats the convolutional and attention branches differently with adaptive weights. Specifically, an ASF-former encoder equally splits the feature channels into halves to fit dual-path inputs. Then, the outputs of the dual paths are fused with weighting scalars calculated from visual cues. We also design the convolutional path compactly for efficiency. Extensive experiments on standard benchmarks, such as ImageNet-1K, CIFAR-10, and CIFAR-100, show that our ASF-former outperforms its CNN and transformer counterparts, as well as hybrid pilots, in terms of accuracy (83.9% on ImageNet-1K) under similar conditions (12.9G MACs/56.7M Params, without large-scale pre-training). The code is available at: https://github.com/szx503045266/ASF-former. + +
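The following PyTorch sketch illustrates one possible reading of the channel-split dual-path idea: half the channels pass through a convolutional path, half through an attention path, and the two outputs are fused with scalars predicted from pooled visual cues. The layer sizes and the gating design are assumptions for illustration, not the released ASF-former code.

```python
import torch
import torch.nn as nn

class SplitFusionBlock(nn.Module):
    def __init__(self, dim=64, heads=4):
        super().__init__()
        half = dim // 2
        self.conv_path = nn.Conv2d(half, half, 3, padding=1)               # local (convolutional) clues
        self.attn = nn.MultiheadAttention(half, heads, batch_first=True)   # global (attention) clues
        self.gate = nn.Sequential(nn.Linear(dim, 2), nn.Softmax(dim=-1))   # fusion weights from visual cues

    def forward(self, x):                                  # x: (B, C, H, W)
        b, c, h, w = x.shape
        xc, xa = x.chunk(2, dim=1)                         # equal channel split
        yc = self.conv_path(xc)
        seq = xa.flatten(2).transpose(1, 2)                # (B, H*W, C/2)
        ya, _ = self.attn(seq, seq, seq)
        ya = ya.transpose(1, 2).reshape(b, c // 2, h, w)
        wgt = self.gate(x.mean(dim=(2, 3)))                # two scalars per sample
        w0 = wgt[:, 0].view(b, 1, 1, 1)
        w1 = wgt[:, 1].view(b, 1, 1, 1)
        return torch.cat([w0 * yc, w1 * ya], dim=1)        # (B, C, H, W)

out = SplitFusionBlock()(torch.randn(2, 64, 16, 16))
```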
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the visual and language modalities have been shown to be strong few-shot learners. Soft prompt learning is the method of choice for few-shot downstream adaptation, aiming to bridge the modality gap caused by the distribution shift induced by the new domain. While parameter-efficient, prompt learning still requires access to the model weights and can be computationally infeasible for large models with billions of parameters. To address these shortcomings, in this work, we describe a black-box method for V-L few-shot adaptation that (a) operates on pre-computed image and text features and hence works without access to the model's weights, (b) is orders of magnitude faster at training time, (c) is amenable to both supervised and unsupervised training, and (d) can even be used to align image and text features computed from uni-modal models. To achieve this, we propose Linear Feature Alignment (LFA), a simple linear approach for V-L re-alignment in the target domain. LFA is initialized from a closed-form solution to a least-squares problem and then iteratively updated by minimizing a re-ranking loss. Despite its simplicity, our approach can even surpass soft-prompt learning methods, as shown by extensive experiments on 11 image and 2 video datasets. + +
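The closed-form least-squares initialization mentioned above can be illustrated with a few lines of numpy: fit a linear map from pre-computed image features to the corresponding text features and reuse it to re-align new features. The iterative re-ranking refinement is omitted, and the feature shapes are purely illustrative.

```python
import numpy as np

def linear_alignment_init(img_feats, txt_feats):
    """img_feats: (N, d); txt_feats: (N, d), the text feature of each image's class.
    Returns W minimizing ||img_feats @ W - txt_feats||_F^2 (closed-form least squares)."""
    W, *_ = np.linalg.lstsq(img_feats, txt_feats, rcond=None)
    return W

rng = np.random.default_rng(0)
img = rng.normal(size=(80, 512)); img /= np.linalg.norm(img, axis=1, keepdims=True)
txt = rng.normal(size=(80, 512)); txt /= np.linalg.norm(txt, axis=1, keepdims=True)

W = linear_alignment_init(img, txt)
aligned = img @ W          # re-aligned image features in the text space
scores = aligned @ txt.T   # similarity scores usable for few-shot classification
```

Because only pre-computed features and a small linear system are involved, this kind of alignment needs no access to the model weights, which is the black-box property the abstract emphasizes.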
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ HGCN-GJS: Hierarchical Graph Convolutional Network with Groupwise Joint + Sampling for Trajectory Prediction IROS 2022 + + +
+ Accurate pedestrian trajectory prediction is of great importance for +downstream tasks such as autonomous driving and mobile robot navigation. Fully +investigating the social interactions within the crowd is crucial for accurate +pedestrian trajectory prediction. However, most existing methods do not capture +group level interactions well, focusing only on pairwise interactions and +neglecting group-wise interactions. In this work, we propose a hierarchical +graph convolutional network, HGCN-GJS, for trajectory prediction which well +leverages group level interactions within the crowd. Furthermore, we introduce +a novel joint sampling scheme for modeling the joint distribution of multiple +pedestrians in the future trajectories. Based on the group information, this +scheme associates the trajectory of one person with the trajectory of other +people in the group, but maintains the independence of the trajectories of +outsiders. We demonstrate the performance of our network on several trajectory +prediction datasets, achieving state-of-the-art results on all datasets +considered. + +
+
+ comment: 8 pages, 8 figures, accepted by IROS 2022 +
+
+
+
+
+ + ♻ ☆ Cross Contrasting Feature Perturbation for Domain Generalization + + +
+ Domain generalization (DG) aims to learn a robust model from source domains that generalizes well to unseen target domains. Recent studies focus on generating novel domain samples or features to diversify distributions complementary to the source domains. Yet, these approaches can hardly avoid the problem that samples synthesized for various domains may cause semantic distortion. In this paper, we propose an online one-stage Cross Contrasting Feature Perturbation (CCFP) framework to simulate domain shift by generating perturbed features in the latent space while regularizing the model prediction against domain shift. Different from previous fixed synthesizing strategies, we design modules with learnable feature perturbations and semantic consistency constraints. In contrast to prior work, our method does not use any generative models or domain labels. We conduct extensive experiments on a standard DomainBed benchmark with a strict evaluation protocol for a fair comparison. Comprehensive experiments show that our method outperforms the previous state of the art, and quantitative analyses illustrate that our approach can alleviate the domain shift problem in out-of-distribution (OOD) scenarios. + +
+
+
+
+
+ + ♻ ☆ EfficientTrain: Exploring Generalized Curriculum Learning for Training + Visual Backbones ICCV 2023 + + +
+ The superior performance of modern deep networks usually comes with a costly +training procedure. This paper presents a new curriculum learning approach for +the efficient training of visual backbones (e.g., vision Transformers). Our +work is inspired by the inherent learning dynamics of deep networks: we +experimentally show that at an earlier training stage, the model mainly learns +to recognize some 'easier-to-learn' discriminative patterns within each +example, e.g., the lower-frequency components of images and the original +information before data augmentation. Driven by this phenomenon, we propose a +curriculum where the model always leverages all the training data at each +epoch, while the curriculum starts with only exposing the 'easier-to-learn' +patterns of each example, and introduces gradually more difficult patterns. To +implement this idea, we 1) introduce a cropping operation in the Fourier +spectrum of the inputs, which enables the model to learn from only the +lower-frequency components efficiently, 2) demonstrate that exposing the +features of original images amounts to adopting weaker data augmentation, and +3) integrate 1) and 2) and design a curriculum learning schedule with a +greedy-search algorithm. The resulting approach, EfficientTrain, is simple, +general, yet surprisingly effective. As an off-the-shelf method, it reduces the +wall-time training cost of a wide variety of popular models (e.g., ResNet, +ConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without +sacrificing accuracy. It is also effective for self-supervised learning (e.g., +MAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain. + +
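A small numpy sketch of the Fourier-spectrum cropping idea, assuming the 'easier-to-learn' low-frequency view is obtained by keeping only the central window of the shifted spectrum; the paper's exact cropping schedule and implementation details may differ.

```python
import numpy as np

def low_freq_crop(img, keep=0.5):
    """img: (H, W) or (H, W, C); keep: fraction of the central spectrum to retain."""
    H, W = img.shape[:2]
    h, w = round(H * keep), round(W * keep)
    spec = np.fft.fftshift(np.fft.fft2(img, axes=(0, 1)), axes=(0, 1))
    top, left = (H - h) // 2, (W - w) // 2
    cropped = spec[top:top + h, left:left + w]            # central (low-frequency) window
    out = np.fft.ifft2(np.fft.ifftshift(cropped, axes=(0, 1)), axes=(0, 1)).real
    return out * (h * w) / (H * W)                        # keep intensity scale roughly comparable

img = np.random.rand(224, 224, 3)
small = low_freq_crop(img, keep=160 / 224)                # low-pass view at reduced resolution
```

The cropped view is both low-pass filtered and smaller, which is why early epochs trained on it can be substantially cheaper while later epochs gradually return to the full-resolution input.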
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DiffIR: Efficient Diffusion Model for Image Restoration ICCV2023 + + +
+ The diffusion model (DM) has achieved SOTA performance by modeling the image synthesis process as a sequential application of a denoising network. However, different from image synthesis, image restoration (IR) has a strong constraint to generate results in accordance with the ground truth. Thus, for IR, having traditional DMs run massive iterations on a large model to estimate whole images or feature maps is inefficient. To address this issue, we propose an efficient DM for IR (DiffIR), which consists of a compact IR prior extraction network (CPEN), a dynamic IR transformer (DIRformer), and a denoising network. Specifically, DiffIR has two training stages: pretraining and training the DM. In pretraining, we input ground-truth images into CPEN$_{S1}$ to capture a compact IR prior representation (IPR) to guide the DIRformer. In the second stage, we train the DM to directly estimate the same IPR as the pretrained CPEN$_{S1}$ using only LQ images. We observe that since the IPR is only a compact vector, DiffIR can use fewer iterations than traditional DMs to obtain accurate estimations and generate more stable and realistic results. Since the iterations are few, our DiffIR can adopt a joint optimization of CPEN$_{S2}$, the DIRformer, and the denoising network, which can further reduce the influence of estimation error. We conduct extensive experiments on several IR tasks and achieve SOTA performance while incurring lower computational costs. Code is available at \url{https://github.com/Zj-BinXia/DiffIR}. + +
+
+ comment: This paper is accepted by ICCV2023. Codes and models are available at + https://github.com/Zj-BinXia/DiffIR +
+
+
+
+
+ + ♻ ☆ MixCycle: Mixup Assisted Semi-Supervised 3D Single Object Tracking with + Cycle Consistency ICCV23 + + +
+ 3D single object tracking (SOT) is an indispensable part of automated +driving. Existing approaches rely heavily on large, densely labeled datasets. +However, annotating point clouds is both costly and time-consuming. Inspired by +the great success of cycle tracking in unsupervised 2D SOT, we introduce the +first semi-supervised approach to 3D SOT. Specifically, we introduce two +cycle-consistency strategies for supervision: 1) Self tracking cycles, which +leverage labels to help the model converge better in the early stages of +training; 2) forward-backward cycles, which strengthen the tracker's robustness +to motion variations and the template noise caused by the template update +strategy. Furthermore, we propose a data augmentation strategy named SOTMixup +to improve the tracker's robustness to point cloud diversity. SOTMixup +generates training samples by sampling points in two point clouds with a mixing +rate and assigns a reasonable loss weight for training according to the mixing +rate. The resulting MixCycle approach generalizes to appearance matching-based +trackers. On the KITTI benchmark, based on the P2B tracker, MixCycle trained +with $\textbf{10\%}$ labels outperforms P2B trained with $\textbf{100\%}$ +labels, and achieves a $\textbf{28.4\%}$ precision improvement when using +$\textbf{1\%}$ labels. Our code will be released at +\url{https://github.com/Mumuqiao/MixCycle}. + +
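One plausible reading of SOTMixup is sketched below: sample points from two point clouds according to a mixing rate and derive a loss weight from that rate. The sampling and weighting rules here are assumptions for illustration, not the authors' exact formulation.

```python
import numpy as np

def sot_mixup(pc_a, pc_b, lam, n_points=1024, rng=np.random.default_rng(0)):
    """pc_a, pc_b: (Na, 3) and (Nb, 3) point clouds; lam in [0, 1] is the mixing rate."""
    n_a = int(round(lam * n_points))
    idx_a = rng.choice(len(pc_a), n_a, replace=len(pc_a) < n_a)
    idx_b = rng.choice(len(pc_b), n_points - n_a, replace=len(pc_b) < (n_points - n_a))
    mixed = np.concatenate([pc_a[idx_a], pc_b[idx_b]], axis=0)
    loss_weight = lam   # one plausible choice: trust the sample in proportion to its pc_a content
    return mixed, loss_weight

a = np.random.randn(2048, 3)   # e.g. template object points
b = np.random.randn(1500, 3)   # e.g. distractor / other object points
mixed, w = sot_mixup(a, b, lam=0.8)   # 80% of the mixed sample comes from the template
```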
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+ + ♻ ☆ Adaptive Segmentation Network for Scene Text Detection + + +
+ Inspired by deep convolutional segmentation algorithms, scene text detectors have steadily broken the performance ceilings of datasets. However, these methods often encounter threshold-selection bottlenecks and perform poorly on text instances with extreme aspect ratios. In this paper, we propose to automatically learn a discriminative segmentation threshold, which distinguishes text pixels from background pixels for segmentation-based scene text detectors and further removes the need for time-consuming manual parameter adjustment. Besides, we design a Global-information Enhanced Feature Pyramid Network (GE-FPN) for capturing text instances with macro size and extreme aspect ratios. Following the GE-FPN, we introduce a cascade optimization structure to further refine the text instances. Finally, together with the proposed threshold learning strategy and text detection structure, we design an Adaptive Segmentation Network (ASNet) for scene text detection. Extensive experiments are carried out to demonstrate that the proposed ASNet can achieve state-of-the-art performance on four text detection benchmarks, i.e., ICDAR 2015, MSRA-TD500, ICDAR 2017 MLT and CTW1500. The ablation experiments also verify the effectiveness of our contributions. + +
+
+
+
+
+ + ♻ ☆ End-to-end Remote Sensing Change Detection of Unregistered Bi-temporal + Images for Natural Disasters + + +
+ Change detection based on remote sensing images has been a prominent area of +interest in the field of remote sensing. Deep networks have demonstrated +significant success in detecting changes in bi-temporal remote sensing images +and have found applications in various fields. Given the degradation of natural +environments and the frequent occurrence of natural disasters, accurately and +swiftly identifying damaged buildings in disaster-stricken areas through remote +sensing images holds immense significance. This paper aims to investigate +change detection specifically for natural disasters. Considering that existing +public datasets used in change detection research are registered, which does +not align with the practical scenario where bi-temporal images are not matched, +this paper introduces an unregistered end-to-end change detection synthetic +dataset called xBD-E2ECD. Furthermore, we propose an end-to-end change +detection network named E2ECDNet, which takes an unregistered bi-temporal image +pair as input and simultaneously generates the flow field prediction result and +the change detection prediction result. It is worth noting that our E2ECDNet +also supports change detection for registered image pairs, as registration can +be seen as a special case of non-registration. Additionally, this paper +redefines the criteria for correctly predicting a positive case and introduces +neighborhood-based change detection evaluation metrics. The experimental +results have demonstrated significant improvements. + +
+
+
+
+
+ + ♻ ☆ LiDAR Meta Depth Completion IROS 2023 + + +
+ Depth estimation is one of the essential tasks to be addressed when creating mobile autonomous systems. While monocular depth estimation methods have improved in recent times, depth completion provides more accurate and reliable depth maps by additionally using sparse depth information from other sensors such as LiDAR. However, current methods are specifically trained for a single LiDAR sensor. As the scanning pattern differs between sensors, every new sensor would require re-training a specialized depth completion model, which is computationally inefficient and not flexible. Therefore, we propose to dynamically adapt the depth completion model to the sensor in use, enabling LiDAR-adaptive depth completion. Specifically, we propose a meta depth completion network that uses patterns derived from the input data to learn a task network that alters the weights of the main depth completion network, so that a given depth completion task is solved effectively. The method demonstrates a strong capability to work on multiple LiDAR scanning patterns and can also generalize to scanning patterns that are unseen during training. While using a single model, our method yields significantly better results than a non-adaptive baseline trained on different LiDAR patterns, and it outperforms LiDAR-specific expert models in very sparse cases. These advantages allow flexible deployment of a single depth completion model on different sensors, which could also prove valuable for processing the input of nascent LiDAR technologies with adaptive instead of fixed scanning patterns. + +
+
+ comment: Accepted at IROS 2023, v2 has updated author list and fixed a figure + caption +
+
+
+
+
+ + ♻ ☆ HFGD: High-level Feature Guided Decoder for Semantic Segmentation + + +
+ Existing pyramid-based upsamplers (e.g. SemanticFPN), although efficient, usually produce less accurate results compared to dilation-based models when using the same backbone. This is partially caused by the contaminated high-level features, since they are fused and fine-tuned with noisy low-level features on limited data. To address this issue, we propose to use powerful pretrained high-level features as guidance (HFG) when learning to upsample the fine-grained low-level features. Specifically, the class tokens are trained along with only the high-level features from the backbone. These class tokens are reused by the upsampler for classification, guiding the upsampler features toward more discriminative backbone features. One key design of the HFG is to protect the high-level features from being contaminated, using proper stop-gradient operations so that the backbone does not update according to the gradient from the upsampler. To push the upper limit of HFG, we introduce a context augmentation encoder (CAE) that efficiently and effectively operates on the low-resolution high-level features, resulting in improved representations and thus better guidance. We evaluate the proposed method on three benchmarks: Pascal Context, COCOStuff164k, and Cityscapes. Our method achieves state-of-the-art results among methods that do not use extra training data, demonstrating its effectiveness and generalization ability. The complete code will be released. + +
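The stop-gradient protection described above can be illustrated with a toy PyTorch module: the classifier is driven only by the high-level features, while the upsampler consumes a detached copy so that its gradient cannot flow back into (and contaminate) the backbone features. Channel sizes and the layer layout here are hypothetical, not the HFGD architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyHFG(nn.Module):
    def __init__(self, c_low=64, c_high=256, n_cls=19):
        super().__init__()
        self.cls_head = nn.Linear(c_high, n_cls)              # trained with high-level features only
        self.upsampler = nn.Conv2d(c_low + c_high, n_cls, 1)  # learns to upsample with guidance

    def forward(self, low, high):
        logits_high = self.cls_head(high.mean(dim=(2, 3)))
        guide = high.detach()   # stop-gradient: the upsampler cannot alter the guiding features
        guide = F.interpolate(guide, size=low.shape[2:], mode="bilinear", align_corners=False)
        seg = self.upsampler(torch.cat([low, guide], dim=1))
        return logits_high, seg

m = TinyHFG()
low, high = torch.randn(2, 64, 64, 64), torch.randn(2, 256, 16, 16)
cls_logits, seg_logits = m(low, high)   # (2, 19) and (2, 19, 64, 64)
```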
+
+ comment: Revised version, refactored presentation and added more experiments +
+
+
+
+
+ + ♻ ☆ SEMI-DiffusionInst: A Diffusion Model Based Approach for Semiconductor + Defect Classification and Segmentation + + +
+ With the continuous progression of Moore's Law, integrated circuit (IC) device complexity is also increasing. Scanning Electron Microscope (SEM) image based extensive defect inspection and accurate metrology extraction are two main challenges in advanced node (2 nm and beyond) technology. Deep learning (DL) algorithm based computer vision approaches have gained popularity in semiconductor defect inspection over the last few years. In this research work, a new semiconductor defect inspection framework "SEMI-DiffusionInst" is investigated and compared to previous frameworks. To the best of the authors' knowledge, this work is the first demonstration to accurately detect and precisely segment semiconductor defect patterns by using a diffusion model. Different feature extractor networks as backbones and data sampling strategies are investigated towards achieving a balanced trade-off between precision and computing efficiency. Our proposed approach outperforms previous work on overall mAP and performs comparably or better for almost all defect classes (per-class APs). The bounding box and segmentation mAPs achieved by the proposed SEMI-DiffusionInst model are improved by 3.83% and 2.10%, respectively. Among individual defect types, detection precision on line collapse and thin bridge defects is improved by approximately 15%. It has also been shown that by tuning inference hyperparameters, inference time can be improved significantly without compromising model precision. Finally, certain limitations and a future work strategy to overcome them are discussed. + +
+
+ comment: 6 pages, 5 figures, To be published by IEEE in the proceedings of the + 2023 ELMAR conference +
+
+
+
+
+ + ♻ ☆ Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes + From Text-To-Image Models + + +
+ State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\cdot$2 +are revolutionizing how people generate visual content. At the same time, +society has serious concerns about how adversaries can exploit such models to +generate unsafe images. In this work, we focus on demystifying the generation +of unsafe images and hateful memes from Text-to-Image models. We first +construct a typology of unsafe images consisting of five categories (sexually +explicit, violent, disturbing, hateful, and political). Then, we assess the +proportion of unsafe images generated by four advanced Text-to-Image models +using four prompt datasets. We find that these models can generate a +substantial percentage of unsafe images; across four models and four prompt +datasets, 14.56% of all generated images are unsafe. When comparing the four +models, we find different risk levels, with Stable Diffusion being the most +prone to generating unsafe content (18.92% of all generated images are unsafe). +Given Stable Diffusion's tendency to generate more unsafe content, we evaluate +its potential to generate hateful meme variants if exploited by an adversary to +attack a specific individual or community. We employ three image editing +methods, DreamBooth, Textual Inversion, and SDEdit, which are supported by +Stable Diffusion. Our evaluation result shows that 24% of the generated images +using DreamBooth are hateful meme variants that present the features of the +original hateful meme and the target individual/community; these generated +images are comparable to hateful meme variants collected from the real world. +Overall, our results demonstrate that the danger of large-scale generation of +unsafe images is imminent. We discuss several mitigating measures, such as +curating training data, regulating prompts, and implementing safety filters, +and encourage better safeguard tools to be developed to prevent unsafe +generation. + +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, November 26, 2023 +
+
+
+
+
+ + ♻ ☆ 3D-aware Blending with Generative NeRFs ICCV 2023 + + +
+ Image blending aims to combine multiple images seamlessly. It remains +challenging for existing 2D-based methods, especially when input images are +misaligned due to differences in 3D camera poses and object shapes. To tackle +these issues, we propose a 3D-aware blending method using generative Neural +Radiance Fields (NeRF), including two key components: 3D-aware alignment and +3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of +the reference image with respect to generative NeRFs and then perform 3D local +alignment for each part. To further leverage 3D information of the generative +NeRF, we propose 3D-aware blending that directly blends images on the NeRF's +latent representation space, rather than raw pixel space. Collectively, our +method outperforms existing 2D baselines, as validated by extensive +quantitative and qualitative evaluations with FFHQ and AFHQ-Cat. + +
+
+ comment: ICCV 2023, Project page: https://blandocs.github.io/blendnerf +
+
+
+
+
+ + ♻ ☆ Source-free Depth for Object Pop-out ICCV 2023 + + +
+ Depth cues are known to be useful for visual perception. However, direct +measurement of depth is often impracticable. Fortunately, though, modern +learning-based methods offer promising depth maps by inference in the wild. In +this work, we adapt such depth inference models for object segmentation using +the objects' "pop-out" prior in 3D. The "pop-out" is a simple composition prior +that assumes objects reside on the background surface. Such compositional prior +allows us to reason about objects in the 3D space. More specifically, we adapt +the inferred depth maps such that objects can be localized using only 3D +information. Such separation, however, requires knowledge about contact surface +which we learn using the weak supervision of the segmentation mask. Our +intermediate representation of contact surface, and thereby reasoning about +objects purely in 3D, allows us to better transfer the depth knowledge into +semantics. The proposed adaptation method uses only the depth model without +needing the source data used for training, making the learning process +efficient and practical. Our experiments on eight datasets of two challenging +tasks, namely camouflaged object detection and salient object detection, +consistently demonstrate the benefit of our method in terms of both performance +and generalizability. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ BlindHarmony: "Blind" Harmonization for MR Images via Flow model ICCV 2023 + + +
+ In MRI, images of the same contrast (e.g., T$_1$) from the same subject can +exhibit noticeable differences when acquired using different hardware, +sequences, or scan parameters. These differences in images create a domain gap +that needs to be bridged by a step called image harmonization, to process the +images successfully using conventional or deep learning-based image analysis +(e.g., segmentation). Several methods, including deep learning-based +approaches, have been proposed to achieve image harmonization. However, they +often require datasets from multiple domains for deep learning training and may +still be unsuccessful when applied to images from unseen domains. To address +this limitation, we propose a novel concept called `Blind Harmonization', which +utilizes only target domain data for training but still has the capability to +harmonize images from unseen domains. For the implementation of blind +harmonization, we developed BlindHarmony using an unconditional flow model +trained on target domain data. The harmonized image is optimized to have a +correlation with the input source domain image while ensuring that the latent +vector of the flow model is close to the center of the Gaussian distribution. +BlindHarmony was evaluated on both simulated and real datasets and compared to +conventional methods. BlindHarmony demonstrated noticeable performance on both +datasets, highlighting its potential for future use in clinical settings. The +source code is available at: https://github.com/SNU-LIST/BlindHarmony + +
+
+ comment: Accepted to ICCV 2023. 9 pages and 5 figures in the manuscript; supplementary material included +
+
+
+
+
+ + ♻ ☆ STS-GAN: Can We Synthesize Solid Texture with High Fidelity from + Arbitrary 2D Exemplar? + + +
+ Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a +3D solid volume, exhibits advantages in computational photography. However, +existing methods generally fail to accurately learn arbitrary textures, which +may result in the failure to synthesize solid textures with high fidelity. In +this paper, we propose a novel generative adversarial nets-based framework +(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In +STS-GAN, multi-scale 2D texture discriminators evaluate the similarity between +the given 2D exemplar and slices from the generated 3D texture, promoting the +3D texture generator synthesizing realistic solid textures. Finally, +experiments demonstrate that the proposed method can generate high-fidelity +solid textures with similar visual characteristics to the 2D exemplar. + +
+
+
+
+
+ + ♻ ☆ Seeing through the Brain: Image Reconstruction of Visual Perception from + Human Brain Signals + + +
+ Seeing is believing; however, the underlying mechanism of how human visual perceptions are intertwined with our cognition is still a mystery. Thanks to recent advances in both neuroscience and artificial intelligence, we have been able to record visually evoked brain activities and mimic the visual perception ability through computational approaches. In this paper, we focus on visual stimulus reconstruction, i.e., reconstructing the observed images based on portably accessible brain signals, namely electroencephalography (EEG) data. Since EEG signals are dynamic time series and notoriously noisy, processing them and extracting useful information requires dedicated effort. We therefore propose a comprehensive pipeline, named NeuroImagen, for reconstructing visual stimulus images from EEG signals. Specifically, we incorporate a novel multi-level perceptual information decoding to draw multi-grained outputs from the given EEG data. A latent diffusion model then leverages the extracted information to reconstruct the high-resolution visual stimulus images. The experimental results illustrate the effectiveness of the image reconstruction and the superior quantitative performance of our proposed method. + +
+
+ comment: A preprint version of an ongoing work +
+
+
+
+
+ + ♻ ☆ ACTIVE: Towards Highly Transferable 3D Physical Camouflage for Universal + and Robust Vehicle Evasion ICCV 2023 + + +
+ Adversarial camouflage has garnered attention for its ability to attack +object detectors from any viewpoint by covering the entire object's surface. +However, universality and robustness in existing methods often fall short as +the transferability aspect is often overlooked, thus restricting their +application only to a specific target with limited performance. To address +these challenges, we present Adversarial Camouflage for Transferable and +Intensive Vehicle Evasion (ACTIVE), a state-of-the-art physical camouflage +attack framework designed to generate universal and robust adversarial +camouflage capable of concealing any 3D vehicle from detectors. Our framework +incorporates innovative techniques to enhance universality and robustness, +including a refined texture rendering that enables common texture application +to different vehicles without being constrained to a specific texture map, a +novel stealth loss that renders the vehicle undetectable, and a smooth and +camouflage loss to enhance the naturalness of the adversarial camouflage. Our +extensive experiments on 15 different models show that ACTIVE consistently +outperforms existing works on various public detectors, including the latest +YOLOv7. Notably, our universality evaluations reveal promising transferability +to other vehicle classes, tasks (segmentation models), and the real world, not +just other vehicles. + +
+
+ comment: Accepted for ICCV 2023. Main Paper with Supplementary Material. + Project Page: https://islab-ai.github.io/active-iccv2023/ +
+
+
+
+
+ + ♻ ☆ VM-NeRF: Tackling Sparsity in NeRF with View Morphing + + +
+ NeRF aims to learn a continuous neural scene representation by using a finite +set of input images taken from various viewpoints. A well-known limitation of +NeRF methods is their reliance on data: the fewer the viewpoints, the higher +the likelihood of overfitting. This paper addresses this issue by introducing a +novel method to generate geometrically consistent image transitions between +viewpoints using View Morphing. Our VM-NeRF approach requires no prior +knowledge about the scene structure, as View Morphing is based on the +fundamental principles of projective geometry. VM-NeRF tightly integrates this +geometric view generation process during the training procedure of standard +NeRF approaches. Notably, our method significantly improves novel view +synthesis, particularly when only a few views are available. Experimental +evaluation reveals consistent improvement over current methods that handle +sparse viewpoints in NeRF models. We report an increase in PSNR of up to 1.8dB +and 1.0dB when training uses eight and four views, respectively. Source code: +\url{https://github.com/mbortolon97/VM-NeRF} + +
+
+ comment: ICIAP 2023 +
+
+
+
+
+ + ♻ ☆ Towards the extraction of robust sign embeddings for low resource sign + language recognition + + +
+ Isolated Sign Language Recognition (SLR) has mostly been applied on datasets +containing signs executed slowly and clearly by a limited group of signers. In +real-world scenarios, however, we are met with challenging visual conditions, +coarticulated signing, small datasets, and the need for signer independent +models. To tackle this difficult problem, we require a robust feature extractor +to process the sign language videos. One could expect human pose estimators to +be ideal candidates. However, due to a domain mismatch with their training sets +and challenging poses in sign language, they lack robustness on sign language +data and image-based models often still outperform keypoint-based models. +Furthermore, whereas the common practice of transfer learning with image-based +models yields even higher accuracy, keypoint-based models are typically trained +from scratch on every SLR dataset. These factors limit their usefulness for +SLR. From the existing literature, it is also not clear which, if any, pose +estimator performs best for SLR. We compare the three most popular pose +estimators for SLR: OpenPose, MMPose and MediaPipe. We show that through +keypoint normalization, missing keypoint imputation, and learning a pose +embedding, we can obtain significantly better results and enable transfer +learning. We show that keypoint-based embeddings contain cross-lingual +features: they can transfer between sign languages and achieve competitive +performance even when fine-tuning only the classifier layer of an SLR model on +a target sign language. We furthermore achieve better performance using +fine-tuned transferred embeddings than models trained only on the target sign +language. The embeddings can also be learned in a multilingual fashion. The +application of these embeddings could prove particularly useful for low +resource sign languages in the future. + +
+
+
+
+
+ + ♻ ☆ Instruct-NeuralTalker: Editing Audio-Driven Talking Radiance Fields with + Instructions + + +
+ Recent neural talking radiance field methods have shown great success in +photorealistic audio-driven talking face synthesis. In this paper, we propose a +novel interactive framework that utilizes human instructions to edit such +implicit neural representations to achieve real-time personalized talking face +generation. Given a short speech video, we first build an efficient talking +radiance field, and then apply the latest conditional diffusion model for image +editing based on the given instructions and guiding implicit representation +optimization towards the editing target. To ensure audio-lip synchronization +during the editing process, we propose an iterative dataset updating strategy +and utilize a lip-edge loss to constrain changes in the lip region. We also +introduce a lightweight refinement network for complementing image details and +achieving controllable detail generation in the final rendered image. Our +method also enables real-time rendering at up to 30FPS on consumer hardware. +Multiple metrics and user verification show that our approach provides a +significant improvement in rendering quality compared to state-of-the-art +methods. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ 3D-TOGO: Towards Text-Guided Cross-Category 3D Object Generation + + +
+ Text-guided 3D object generation aims to generate 3D objects described by user-defined captions, which paves a flexible way to visualize what we imagine. Although some works have been devoted to solving this challenging task, they either utilize explicit 3D representations (e.g., meshes), which lack texture and require post-processing for rendering photo-realistic views, or require individual time-consuming optimization for every single case. Here, we make the first attempt to achieve generic text-guided cross-category 3D object generation via a new 3D-TOGO model, which integrates a text-to-views generation module and a views-to-3D generation module. The text-to-views generation module is designed to generate different views of the target 3D object given an input caption. Prior-guidance, caption-guidance, and view contrastive learning are proposed to achieve better view consistency and caption similarity. Meanwhile, a pixelNeRF model is adopted for the views-to-3D generation module to obtain the implicit 3D neural representation from the previously generated views. Our 3D-TOGO model generates 3D objects in the form of a neural radiance field with good texture and requires no time-consuming optimization for every single caption. Besides, 3D-TOGO can control the category, color and shape of the generated 3D objects with the input caption. Extensive experiments on the largest 3D object dataset (i.e., ABO) are conducted to verify that 3D-TOGO can better generate high-quality 3D objects according to the input captions across 98 different categories, in terms of PSNR, SSIM, LPIPS and CLIP-score, compared with text-NeRF and Dreamfields. + +
+
+
+
+
+ + ♻ ☆ HyperSparse Neural Networks: Shifting Exploration to Exploitation + through Adaptive Regularization ICCV'23 + + +
+ Sparse neural networks are a key factor in developing resource-efficient +machine learning applications. We propose the novel and powerful sparse +learning method Adaptive Regularized Training (ART) to compress dense into +sparse networks. Instead of the commonly used binary mask during training to +reduce the number of model weights, we inherently shrink weights close to zero +in an iterative manner with increasing weight regularization. Our method +compresses the pre-trained model knowledge into the weights of highest +magnitude. Therefore, we introduce a novel regularization loss named +HyperSparse that exploits the highest weights while conserving the ability of +weight exploration. Extensive experiments on CIFAR and TinyImageNet show that +our method leads to notable performance gains compared to other sparsification +methods, especially in extremely high sparsity regimes up to 99.8 percent model +sparsity. Additional investigations provide new insights into the patterns that +are encoded in weights with high magnitudes. + +
+
+ comment: ICCV'23 Workshops +
+
+
+
+
+ + ♻ ☆ Social Occlusion Inference with Vectorized Representation for Autonomous + Driving + + +
+ Autonomous vehicles must be capable of handling occlusion in the environment to ensure safe and efficient driving. In urban environments, occlusion often arises due to other vehicles obscuring the perception of the ego vehicle. Since the occlusion condition can impact the trajectories of vehicles, the behavior of other vehicles is helpful for making inferences about the occlusion as a remedy for perceptual deficiencies. This paper introduces a novel social occlusion inference approach that learns a mapping from agent trajectories and scene context to an occupancy grid map (OGM) representing the view of the ego vehicle. Specifically, vectorized features are encoded through the polyline encoder to aggregate features of vectors into features of polylines. A transformer module is then utilized to model the high-order interactions of polylines. Importantly, occlusion queries are proposed to fuse polyline features and generate the OGM without any input from the visual modality. To verify the performance of the vectorized representation, we design a baseline based on a fully transformer-based encoder-decoder architecture that maps the occluded OGM and historical trajectory information to the ground-truth OGM. We evaluate our approach on an unsignalized intersection in the INTERACTION dataset, where it outperforms the state-of-the-art results. + +
+
+
+
+
+ + ♻ ☆ On the Effectiveness of Spectral Discriminators for Perceptual Quality + Improvement ICCV 2023 + + +
+ Several recent studies advocate the use of spectral discriminators, which evaluate the Fourier spectra of images, for generative modeling. However, the effectiveness of spectral discriminators is not yet well understood. We tackle this issue by examining spectral discriminators in the context of perceptual image super-resolution (i.e., GAN-based SR), as SR image quality is susceptible to spectral changes. Our analyses reveal that the spectral discriminator indeed performs better than the ordinary (a.k.a. spatial) discriminator in identifying differences in the high-frequency range; however, the spatial discriminator holds an advantage in the low-frequency range. Thus, we suggest that the spectral and spatial discriminators should be used simultaneously. Moreover, we improve the spectral discriminator by first calculating the patch-wise Fourier spectrum and then aggregating the spectra with a Transformer. We verify the effectiveness of the proposed method in two ways. On the one hand, thanks to the additional spectral discriminator, our obtained SR images have spectra better aligned with those of real images, which leads to a better perception-distortion (PD) tradeoff. On the other hand, our ensembled discriminator predicts the perceptual quality more accurately, as evidenced in the no-reference image quality assessment task. + +
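A short PyTorch sketch of the patch-wise Fourier spectrum computation described above, producing per-patch log-magnitude spectra that a Transformer encoder could aggregate. The patch size and token layout are assumptions rather than the authors' exact discriminator.

```python
import torch

def patchwise_spectrum(img, patch=16):
    """img: (B, C, H, W) -> (B, num_patches, C*patch*patch) log-magnitude spectra."""
    b, c, h, w = img.shape
    patches = img.unfold(2, patch, patch).unfold(3, patch, patch)   # (B, C, nH, nW, p, p)
    spec = torch.fft.fft2(patches)                                   # per-patch 2D FFT (last two dims)
    mag = torch.log1p(spec.abs())                                    # log magnitude is easier to learn from
    tokens = mag.permute(0, 2, 3, 1, 4, 5).reshape(b, -1, c * patch * patch)
    return tokens                                                    # ready to feed a Transformer encoder

tokens = patchwise_spectrum(torch.randn(2, 3, 128, 128))             # (2, 64, 768)
```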
+
+ comment: Accepted to ICCV 2023. Code and Models are publicly available at + https://github.com/Luciennnnnnn/DualFormer +
+
+
+
+
+ + ♻ ☆ DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using + Pre-trained Diffusion Models ICCV2023 + + +
+ Given a classifier, the inherent property of semantic Out-of-Distribution +(OOD) samples is that their contents differ from all legal classes in terms of +semantics, namely semantic mismatch. There is a recent work that directly +applies it to OOD detection, which employs a conditional Generative Adversarial +Network (cGAN) to enlarge semantic mismatch in the image space. While achieving +remarkable OOD detection performance on small datasets, it is not applicable to +ImageNet-scale datasets due to the difficulty in training cGANs with both input +images and labels as conditions. As diffusion models are much easier to train +and amenable to various conditions compared to cGANs, in this work, we propose +to directly use pre-trained diffusion models for semantic mismatch-guided OOD +detection, named DiffGuard. Specifically, given an OOD input image and the +predicted label from the classifier, we try to enlarge the semantic difference +between the reconstructed OOD image under these conditions and the original +input image. We also present several test-time techniques to further strengthen +such differences. Experimental results show that DiffGuard is effective on both +Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily +combined with existing OOD detection techniques to achieve state-of-the-art OOD +detection results. + +
+
+ comment: Accepted by ICCV2023, with supplementary materials +
+
+
+
+
+ + ♻ ☆ SAMFlow: Eliminating Any Fragmentation in Optical Flow with Segment + Anything Model + + +
+ Optical Flow Estimation aims to find the 2D dense motion field between two +frames. Due to the limitation of model structures and training datasets, +existing methods often rely too much on local clues and ignore the integrity of +objects, resulting in fragmented motion estimation. Through theoretical +analysis, we find the pre-trained large vision models are helpful in optical +flow estimation, and we notice that the recently famous Segment Anything Model +(SAM) demonstrates a strong ability to segment complete objects, which is +suitable for solving the fragmentation problem. We thus propose a solution to +embed the frozen SAM image encoder into FlowFormer to enhance object +perception. To address the challenge of in-depth utilizing SAM in +non-segmentation tasks like optical flow estimation, we propose an Optical Flow +Task-Specific Adaption scheme, including a Context Fusion Module to fuse the +SAM encoder with the optical flow context encoder, and a Context Adaption +Module to adapt the SAM features for optical flow task with Learned +Task-Specific Embedding. Our proposed SAMFlow model reaches 0.86/2.10 +clean/final EPE and 3.55/12.32 EPE/F1-all on Sintel and KITTI-15 training set, +surpassing Flowformer by 8.5%/9.9% and 13.2%/16.3%. Furthermore, our model +achieves state-of-the-art performance on the Sintel and KITTI-15 benchmarks, +ranking #1 among all two-frame methods on Sintel clean pass. + +
+
+
+
+
+ + ♻ ☆ Scalable Surface Water Mapping up to Fine-scale using Geometric Features + of Water from Topographic Airborne LiDAR Data + + +
+ Despite substantial technological advancements, the comprehensive mapping of +surface water, particularly smaller bodies (<1ha), continues to be a challenge +due to a lack of robust, scalable methods. Standard methods require either +training labels or site-specific parameter tuning, which complicates automated +mapping and introduces biases related to training data and parameters. The +reliance on water's reflectance properties, including LiDAR intensity, further +complicates the matter, as higher-resolution images inherently produce more +noise. To mitigate these difficulties, we propose a unique method that focuses +on the geometric characteristics of water instead of its variable reflectance +properties. Unlike preceding approaches, our approach relies entirely on 3D +coordinate observations from airborne LiDAR data, taking advantage of the +principle that connected surface water remains flat due to gravity. By +harnessing this natural law in conjunction with connectivity, our method can +accurately and scalably identify small water bodies, eliminating the need for +training labels or repetitive parameter tuning. Consequently, our approach +enables the creation of comprehensive 3D topographic maps that include both +water and terrain, all performed in an unsupervised manner using only airborne +laser scanning data, potentially enhancing the process of generating reliable +3D topographic maps. We validated our method across extensive and diverse +landscapes, while comparing it to highly competitive Normalized Difference +Water Index (NDWI)-based methods and assessing it using a reference surface +water map. In conclusion, our method offers a new approach to address +persistent difficulties in robust, scalable surface water mapping and 3D +topographic mapping, using solely airborne LiDAR data. + +
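+ The paper's central geometric prior (connected surface water stays flat under
+gravity) can be illustrated with a toy raster example, although the actual
+method works directly on 3D LiDAR point clouds and is considerably more
+involved. The grid, thresholds, and use of local elevation variance below are
+illustrative assumptions, not the authors' algorithm.
+
+```python
+import numpy as np
+from scipy import ndimage
+
+rng = np.random.default_rng(1)
+# toy 200x200 elevation grid: gently sloping, rough terrain with one flat "pond"
+elevation = np.add.outer(np.linspace(0, 5, 200), np.linspace(0, 5, 200))
+elevation += rng.normal(0, 0.05, elevation.shape)
+elevation[60:120, 80:150] = 2.0               # water surface: perfectly flat
+
+# a cell is "flat" if elevation barely varies in its neighborhood
+local_std = ndimage.generic_filter(elevation, np.std, size=5)
+flat_mask = local_std < 0.02
+
+# connected flat regions above a minimum area are kept as water bodies
+labels, n = ndimage.label(flat_mask)
+sizes = ndimage.sum(flat_mask, labels, index=range(1, n + 1))
+water = np.isin(labels, [i + 1 for i, s in enumerate(sizes) if s > 500])
+print("water cells found:", int(water.sum()))
+```
+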
+
+
+
+
+ + ♻ ☆ An unsupervised, open-source workflow for 2D and 3D building mapping + from airborne LiDAR data + + +
+ Despite the substantial demand for high-quality, large-area building maps, no +established open-source workflow for generating 2D and 3D maps currently +exists. This study introduces an automated, open-source workflow for +large-scale 2D and 3D building mapping utilizing airborne LiDAR data. Uniquely, +our workflow operates entirely unsupervised, eliminating the need for any +training procedures. We have integrated a specifically tailored DTM generation +algorithm into our workflow to prevent errors in complex urban landscapes, +especially around highways and overpasses. Through fine rasterization of LiDAR +point clouds, we've enhanced building-tree differentiation, reduced errors near +water bodies, and augmented computational efficiency by introducing a new +planarity calculation. Our workflow offers a practical and scalable solution +for the mass production of rasterized 2D and 3D building maps from raw airborne +LiDAR data. Also, we elaborate on the influence of parameters and potential +error sources to provide users with practical guidance. Our method's robustness +has been rigorously optimized and tested using an extensive dataset (> 550 +km$^2$), and further validated through comparison with deep learning-based and +hand-digitized products. Notably, through these unparalleled, large-scale +comparisons, we offer a valuable analysis of large-scale building maps +generated via different methodologies, providing insightful evaluations of the +effectiveness of each approach. We anticipate that our highly scalable building +mapping workflow will facilitate the production of reliable 2D and 3D building +maps, fostering advances in large-scale urban analysis. The code will be +released upon publication. + +
+
+
+
+
+ + ♻ ☆ 3D Semantic Subspace Traverser: Empowering 3D Generative Model with + Shape Editing Capability ICCV 2023 + + +
+ Shape generation is the practice of producing 3D shapes as various +representations for 3D content creation. Previous studies on 3D shape +generation have focused on shape quality and structure, without or less +considering the importance of semantic information. Consequently, such +generative models often fail to preserve the semantic consistency of shape +structure or enable manipulation of the semantic attributes of shapes during +generation. In this paper, we proposed a novel semantic generative model named +3D Semantic Subspace Traverser that utilizes semantic attributes for +category-specific 3D shape generation and editing. Our method utilizes implicit +functions as the 3D shape representation and combines a novel latent-space GAN +with a linear subspace model to discover semantic dimensions in the local +latent space of 3D shapes. Each dimension of the subspace corresponds to a +particular semantic attribute, and we can edit the attributes of generated +shapes by traversing the coefficients of those dimensions. Experimental results +demonstrate that our method can produce plausible shapes with complex +structures and enable the editing of semantic attributes. The code and trained +models are available at +https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser + +
+
+ comment: Published in ICCV 2023. Code: + https://github.com/TrepangCat/3D_Semantic_Subspace_Traverser +
+
+
+
+
+ + ♻ ☆ Swin3D: A Pretrained Transformer Backbone for 3D Indoor Scene + Understanding + + +
+ The use of pretrained backbones with fine-tuning has been successful for 2D
+vision and natural language processing tasks, showing advantages over
+task-specific networks. In this work, we introduce a pretrained 3D backbone,
+called Swin3D, for 3D indoor scene understanding. We design a 3D Swin
+transformer as our backbone network, which enables efficient self-attention on
+sparse voxels with linear memory complexity, making the backbone scalable to
+large models and datasets. We also introduce a generalized contextual relative
+positional embedding scheme to capture various irregularities of point signals
+for improved network performance. We pretrained a large Swin3D model on a
+synthetic Structured3D dataset, which is an order of magnitude larger than the
+ScanNet dataset. Our model pretrained on the synthetic dataset not only
+generalizes well to downstream segmentation and detection on real 3D point
+datasets, but also outperforms state-of-the-art methods on downstream tasks
+with +2.3 mIoU and +2.2 mIoU on S3DIS Area5 and 6-fold semantic segmentation,
++1.8 mIoU on ScanNet segmentation (val), +1.9 mAP@0.5 on ScanNet detection, and
++8.1 mAP@0.5 on S3DIS detection. A series of extensive ablation studies further
+validate the scalability, generality, and superior performance enabled by our
+approach. The code and models are available at
+https://github.com/microsoft/Swin3D .
+
+
+ comment: Project page: https://yukichiii.github.io/project/swin3D/swin3D.html +
+
+
+
+
+ + ♻ ☆ Interaction-Aware Personalized Vehicle Trajectory Prediction Using + Temporal Graph Neural Networks + + +
+ Accurate prediction of vehicle trajectories is vital for advanced driver +assistance systems and autonomous vehicles. Existing methods mainly rely on +generic trajectory predictions derived from large datasets, overlooking the +personalized driving patterns of individual drivers. To address this gap, we +propose an approach for interaction-aware personalized vehicle trajectory +prediction that incorporates temporal graph neural networks. Our method +utilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to +model the spatio-temporal interactions between target vehicles and their +surrounding traffic. To personalize the predictions, we establish a pipeline +that leverages transfer learning: the model is initially pre-trained on a +large-scale trajectory dataset and then fine-tuned for each driver using their +specific driving data. We employ human-in-the-loop simulation to collect +personalized naturalistic driving trajectories and corresponding surrounding +vehicle trajectories. Experimental results demonstrate the superior performance +of our personalized GCN-LSTM model, particularly for longer prediction +horizons, compared to its generic counterpart. Moreover, the personalized model +outperforms individual models created without pre-training, emphasizing the +significance of pre-training on a large dataset to avoid overfitting. By +incorporating personalization, our approach enhances trajectory prediction +accuracy. + +
+
+
+
+
+ + ♻ ☆ High-Performance Fine Defect Detection in Artificial Leather Using Dual + Feature Pool Object Detection + + +
+ In this study, the structural problems of the YOLOv5 model were analyzed
+in depth. Based on the characteristics of fine defects in artificial
+leather, four innovative structures, namely DFP, IFF, AMP, and EOS, were
+designed. These advancements led to the proposal of a high-performance
+artificial leather fine defect detection model named YOLOD. YOLOD demonstrated
+outstanding performance on the artificial leather defect dataset, achieving an
+impressive increase of 11.7% - 13.5% in AP_50 compared to YOLOv5, along with a
+significant reduction of 5.2% - 7.2% in the error detection rate. Moreover,
+YOLOD also exhibited remarkable performance on the general MS-COCO dataset,
+with an increase of 0.4% - 2.6% in AP compared to YOLOv5, and a rise of 2.5% -
+4.1% in AP_S compared to YOLOv5. These results demonstrate the superiority of
+YOLOD in both artificial leather defect detection and general object detection
+tasks, making it a highly efficient and effective model for real-world
+applications.
+
+
+
+
+
+ + ♻ ☆ YOLOCS: Object Detection based on Dense Channel Compression for Feature + Spatial Solidification + + +
+ In this study, we examine the associations between channel features and +convolutional kernels during the processes of feature purification and gradient +backpropagation, with a focus on the forward and backward propagation within +the network. Consequently, we propose a method called Dense Channel Compression +for Feature Spatial Solidification. Drawing upon the central concept of this +method, we introduce two innovative modules for backbone and head networks: the +Dense Channel Compression for Feature Spatial Solidification Structure (DCFS) +and the Asymmetric Multi-Level Compression Decoupled Head (ADH). When +integrated into the YOLOv5 model, these two modules demonstrate exceptional +performance, resulting in a modified model referred to as YOLOCS. Evaluated on +the MSCOCO dataset, the large, medium, and small YOLOCS models yield AP of +50.1%, 47.6%, and 42.5%, respectively. Maintaining inference speeds remarkably +similar to those of the YOLOv5 model, the large, medium, and small YOLOCS +models surpass the YOLOv5 model's AP by 1.1%, 2.3%, and 5.2%, respectively. + +
+
+
+
+
+ + ♻ ☆ A Two-Step Deep Learning Method for 3DCT-2DUS Kidney Registration During + Breathing + + +
+ This work proposes a novel deep registration pipeline for 3D CT and 2D U/S
+kidney scans acquired during free breathing, which consists of a feature
+network and a 3D-2D CNN-based registration network. The feature network has
+handcrafted texture feature layers to reduce the semantic gap. The
+registration network is an encoder-decoder structure with a
+feature-image-motion (FIM) loss, which enables hierarchical regression at the
+decoder layers and avoids multiple network concatenation. It was first
+pretrained on retrospective datasets with a training data generation strategy,
+then adapted to specific patient data under unsupervised one-cycle transfer
+learning in onsite application. The experiments used 132 U/S sequences, 39
+multiple-phase CT and 210 public single-phase CT images, and 25 pairs of CT
+and U/S sequences. The method resulted in a mean contour distance (MCD) of
+0.94 mm between kidneys on CT and U/S images and an MCD of 1.15 mm on CT and
+reference CT images. For datasets with small transformations, it resulted in
+MCDs of 0.82 and 1.02 mm, respectively. For large transformations, it resulted
+in MCDs of 1.10 and 1.28 mm, respectively. This work addresses difficulties in
+3DCT-2DUS kidney registration during free breathing via novel network
+structures and training strategies.
+
+
+ comment: 16 pages, 8 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ Convex Decomposition of Indoor Scenes + + +
+ We describe a method to parse a complex, cluttered indoor scene into +primitives which offer a parsimonious abstraction of scene structure. Our +primitives are simple convexes. Our method uses a learned regression procedure +to parse a scene into a fixed number of convexes from RGBD input, and can +optionally accept segmentations to improve the decomposition. The result is +then polished with a descent method which adjusts the convexes to produce a +very good fit, and greedily removes superfluous primitives. Because the entire +scene is parsed, we can evaluate using traditional depth, normal, and +segmentation error metrics. Our evaluation procedure demonstrates that the +error from our primitive representation is comparable to that of predicting +depth from a single image. + +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Ablating Concepts in Text-to-Image Diffusion Models ICCV 2023 + + +
+ Large-scale text-to-image diffusion models can generate high-fidelity images +with powerful compositional ability. However, these models are typically +trained on an enormous amount of Internet data, often containing copyrighted +material, licensed images, and personal photos. Furthermore, they have been +found to replicate the style of various living artists or memorize exact +training samples. How can we remove such copyrighted concepts or images without +retraining the model from scratch? To achieve this goal, we propose an +efficient method of ablating concepts in the pretrained model, i.e., preventing +the generation of a target concept. Our algorithm learns to match the image +distribution for a target style, instance, or text prompt we wish to ablate to +the distribution corresponding to an anchor concept. This prevents the model +from generating target concepts given its text condition. Extensive experiments +show that our method can successfully prevent the generation of the ablated +concept while preserving closely related concepts in the model. + +
+
+ comment: ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/ +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ A Bi-Step Grounding Paradigm for Large Language Models in Recommendation + Systems + + +
+ As the focus on Large Language Models (LLMs) in the field of recommendation
+intensifies, the optimization of LLMs for recommendation purposes (referred to
+as LLM4Rec) assumes a crucial role in augmenting their effectiveness in
+providing recommendations. However, existing approaches for LLM4Rec often
+assess performance using restricted sets of candidates, which may not
+accurately reflect the models' overall ranking capabilities. In this paper, our
+objective is to investigate the comprehensive ranking capacity of LLMs and
+propose a two-step grounding framework known as BIGRec (Bi-step Grounding
+Paradigm for Recommendation). It initially grounds LLMs to the recommendation
+space by fine-tuning them to generate meaningful tokens for items and
+subsequently identifies appropriate actual items that correspond to the
+generated tokens. By conducting extensive experiments on two datasets, we
+substantiate the superior performance, capacity for handling few-shot
+scenarios, and versatility across multiple domains exhibited by BIGRec.
+Furthermore, we observe that the marginal benefits derived from increasing the
+quantity of training samples are modest for BIGRec, implying that LLMs possess
+only a limited capability to assimilate statistical information, such as
+popularity and collaborative filtering, due to their robust semantic priors.
+These findings also underline the efficacy of integrating diverse statistical
+information into the LLM4Rec framework, thereby pointing towards a potential
+avenue for future research. Our code and data are available at
+https://github.com/SAI990323/Grounding4Rec.
+
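+ As a rough sketch of the second grounding step described above (matching
+generated tokens to actual catalog items), and not the authors' released code:
+embed the LLM's generated item text and every candidate item title with a
+shared encoder, then pick the nearest catalog items by cosine similarity.
+TF-IDF is used here only as a runnable stand-in encoder; the field names and
+toy data are assumptions.
+
+```python
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def ground_generations(generated: list[str], catalog: list[str], k: int = 5) -> list[list[int]]:
+    """Map each LLM-generated item description to the k most similar real items."""
+    encoder = TfidfVectorizer().fit(catalog + generated)
+    item_vecs = encoder.transform(catalog)
+    gen_vecs = encoder.transform(generated)
+    scores = cosine_similarity(gen_vecs, item_vecs)   # (num_generations, num_items)
+    return [list(np.argsort(-row)[:k]) for row in scores]
+
+# toy usage with a hypothetical three-item catalog
+catalog = ["The Matrix (1999)", "Inception (2010) dream heist sci-fi", "Toy Story (1995)"]
+generated = ["a mind-bending sci-fi movie about dreams"]
+print(ground_generations(generated, catalog, k=2))
+```
+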
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Knowledge-Enhanced Multi-Label Few-Shot Product Attribute-Value + Extraction CIKM 2023 + + +
+ Existing attribute-value extraction (AVE) models require large quantities of +labeled data for training. However, new products with new attribute-value pairs +enter the market every day in real-world e-Commerce. Thus, we formulate AVE in +multi-label few-shot learning (FSL), aiming to extract unseen attribute value +pairs based on a small number of training examples. We propose a +Knowledge-Enhanced Attentive Framework (KEAF) based on prototypical networks, +leveraging the generated label description and category information to learn +more discriminative prototypes. Besides, KEAF integrates with hybrid attention +to reduce noise and capture more informative semantics for each class by +calculating the label-relevant and query-related weights. To achieve +multi-label inference, KEAF further learns a dynamic threshold by integrating +the semantic information from both the support set and the query set. Extensive +experiments with ablation studies conducted on two datasets demonstrate that +KEAF outperforms other SOTA models for information extraction in FSL. The code +can be found at: https://github.com/gjiaying/KEAF + +
+
+ comment: 6 pages, 2 figures, published in CIKM 2023 +
+
+
+
+
+ + ☆ Content-based Recommendation Engine for Video Streaming Platform + + +
+ Recommendation engines suggest content, products, or services to users by
+using machine learning algorithms. This paper proposes a content-based
+recommendation engine that provides video suggestions to users based on their
+previous interests and choices. We use the TF-IDF text vectorization method to
+determine the relevance of words in a document. We then compute the similarity
+between items by calculating the cosine similarity between their vectors.
+Finally, the engine recommends videos to users based on the obtained
+similarity scores. In addition, we measure the engine's performance by
+computing the precision, recall, and F1 score of the proposed system.
+
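+ The pipeline described above maps closely onto standard scikit-learn
+components; the snippet below is a minimal sketch of that TF-IDF plus
+cosine-similarity flow. The toy catalog and column names are made up for
+illustration, and this is not the paper's code.
+
+```python
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# hypothetical catalog: one row per video with a free-text description
+videos = pd.DataFrame({
+    "title": ["Intro to Python", "Advanced Python Tricks", "Gardening Basics"],
+    "description": [
+        "learn python programming from scratch",
+        "decorators generators and other python idioms",
+        "how to grow vegetables at home",
+    ],
+})
+
+tfidf = TfidfVectorizer(stop_words="english")
+matrix = tfidf.fit_transform(videos["description"])      # (n_videos, n_terms)
+sim = cosine_similarity(matrix)                           # (n_videos, n_videos)
+
+def recommend(title: str, top_k: int = 2) -> list[str]:
+    """Return the top_k videos most similar to the one the user just watched."""
+    idx = videos.index[videos["title"] == title][0]
+    ranked = sim[idx].argsort()[::-1]
+    return [videos["title"][i] for i in ranked if i != idx][:top_k]
+
+print(recommend("Intro to Python"))
+```
+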
+
+
+
+
+ + ☆ Advancing continual lifelong learning in neural information retrieval: + definition, dataset, framework, and empirical evaluation + + +
+ Continual learning refers to the capability of a machine learning model to +learn and adapt to new information, without compromising its performance on +previously learned tasks. Although several studies have investigated continual +learning methods for information retrieval tasks, a well-defined task +formulation is still lacking, and it is unclear how typical learning strategies +perform in this context. To address this challenge, a systematic task +formulation of continual neural information retrieval is presented, along with +a multiple-topic dataset that simulates continuous information retrieval. A +comprehensive continual neural information retrieval framework consisting of +typical retrieval models and continual learning strategies is then proposed. +Empirical evaluations illustrate that the proposed framework can successfully +prevent catastrophic forgetting in neural information retrieval and enhance +performance on previously learned tasks. The results indicate that +embedding-based retrieval models experience a decline in their continual +learning performance as the topic shift distance and dataset volume of new +tasks increase. In contrast, pretraining-based models do not show any such +correlation. Adopting suitable learning strategies can mitigate the effects of +topic shift and data augmentation. + +
+
+ comment: Submitted to Information Sciences +
+
+
+
+
+ + ☆ Is Meta-Learning the Right Approach for the Cold-Start Problem in + Recommender Systems? + + +
+ Recommender systems have become fundamental building blocks of modern online
+products and services, and have a substantial impact on user experience. In the
+past few years, deep learning methods have attracted a lot of research, and are
+now heavily used in modern real-world recommender systems. Nevertheless,
+dealing with recommendations in the cold-start setting, e.g., when a user has
+had only limited interactions with the system, is a problem that remains far
+from solved. Meta-learning techniques, and in particular optimization-based
+meta-learning, have recently become the most popular approaches in the academic
+research literature for tackling the cold-start problem in deep learning models
+for recommender systems. However, current meta-learning approaches are not
+practical for real-world recommender systems, which have billions of users and
+items, and strict latency requirements. In this paper we show that it is
+possible to obtain similar, or higher, performance on commonly used
+benchmarks for the cold-start problem without using meta-learning techniques.
+In more detail, we show that, when tuned correctly, standard and widely adopted
+deep learning models perform just as well as newer meta-learning models. We
+further show that an extremely simple modular approach using common
+representation learning techniques can perform comparably to meta-learning
+techniques specifically designed for the cold-start setting while being much
+more easily deployable in real-world applications.
+
+
+
+
+
+ + ☆ Phase Retrieval with Background Information: Decreased References and + Efficient Methods + + +
+ Fourier phase retrieval (PR) is a severely ill-posed inverse problem that
+arises in various applications. To guarantee a unique solution and relieve the
+dependence on the initialization, background information can be exploited as a
+structural prior. However, the required amount of background information may
+be challenging to obtain when moving to high-resolution imaging. At the same
+time, the previously proposed projected gradient descent (PGD) method also
+demands much background information.
+ In this paper, we present an improved theoretical result on the required
+amount of background information, along with two Douglas-Rachford (DR) based
+methods. Analytically, we demonstrate that the background required to ensure a
+unique solution can be decreased by nearly $1/2$ for 2-D signals compared to
+1-D signals. By generalizing the results to $d$ dimensions, we show that
+background information longer than $(2^{\frac{d+1}{d}}-1)$ times the signal is
+sufficient to ensure uniqueness. We also analyze the stability and robustness
+of the model when measurements and background information are corrupted by
+noise. Furthermore, two methods called Background Douglas-Rachford (BDR) and
+Convex Background Douglas-Rachford (CBDR) are proposed. BDR, a non-convex
+method, is proven to have a local R-linear convergence rate under mild
+assumptions. In contrast, the CBDR method uses convexification techniques and
+can be proven to have a global convergence guarantee as long as the background
+information is sufficient. To support this, a new property called F-RIP is
+established. We test the performance of the proposed methods through
+simulations as well as real experimental measurements, and demonstrate that
+they achieve a higher recovery rate with less background information compared
+to the PGD method.
+
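+ The abstract does not spell out the algorithms; the snippet below is only a
+generic Douglas-Rachford-style sketch of 1-D Fourier phase retrieval with a
+known background region, not the paper's BDR or CBDR methods. The signal size,
+background length (chosen larger than three times the signal, in line with the
+1-D sufficiency condition quoted above), iteration count, and reflect-average
+update are illustrative assumptions.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+n, m = 64, 320                       # unknown signal length, total length with background
+background = rng.normal(size=m - n)  # known background samples appended to the signal
+x_true = rng.normal(size=n)
+full = np.concatenate([x_true, background])
+mag = np.abs(np.fft.fft(full))       # measured Fourier magnitudes (phases are lost)
+
+def proj_magnitude(y):
+    """Project onto signals whose Fourier magnitude matches the measurements."""
+    spec = np.fft.fft(y)
+    return np.real(np.fft.ifft(mag * np.exp(1j * np.angle(spec))))
+
+def proj_background(y):
+    """Project onto signals whose trailing samples equal the known background."""
+    z = y.copy()
+    z[n:] = background
+    return z
+
+y = rng.normal(size=m)               # random initialization
+for _ in range(2000):                # Douglas-Rachford (reflect-reflect-average) iteration
+    pb = proj_background(y)
+    y = y + proj_magnitude(2 * pb - y) - pb
+
+x_hat = proj_background(y)[:n]
+print("relative error:", np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true))
+```
+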
+
+
+
+
+ + ☆ Pre-training with Large Language Model-based Document Expansion for + Dense Passage Retrieval + + +
+ In this paper, we systematically study the potential of pre-training with +Large Language Model(LLM)-based document expansion for dense passage retrieval. +Concretely, we leverage the capabilities of LLMs for document expansion, i.e. +query generation, and effectively transfer expanded knowledge to retrievers +using pre-training strategies tailored for passage retrieval. These strategies +include contrastive learning and bottlenecked query generation. Furthermore, we +incorporate a curriculum learning strategy to reduce the reliance on LLM +inferences. Experimental results demonstrate that pre-training with LLM-based +document expansion significantly boosts the retrieval performance on +large-scale web-search tasks. Our work shows strong zero-shot and out-of-domain +retrieval abilities, making it more widely applicable for retrieval when +initializing with no human-labeled data. + +
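+ The contrastive pre-training strategy mentioned above typically reduces to an
+in-batch InfoNCE objective between query and passage embeddings; the sketch
+below shows that generic loss rather than the paper's exact recipe. The
+encoder outputs, embedding width, and temperature are assumptions.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def in_batch_infonce(query_emb: torch.Tensor, passage_emb: torch.Tensor,
+                     temperature: float = 0.05) -> torch.Tensor:
+    """Contrastive loss: the i-th LLM-generated query should match the i-th passage,
+    while all other passages in the batch act as negatives."""
+    q = F.normalize(query_emb, dim=-1)
+    p = F.normalize(passage_emb, dim=-1)
+    logits = q @ p.T / temperature                  # (batch, batch) similarity matrix
+    targets = torch.arange(q.size(0), device=q.device)
+    return F.cross_entropy(logits, targets)
+
+# toy usage with random tensors standing in for encoder outputs
+queries = torch.randn(8, 768)
+passages = torch.randn(8, 768)
+print(in_batch_infonce(queries, passages))
+```
+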
+
+ comment: 10 pages, 3 tables, 4 figures, under review +
+
+
+
+
+ + ☆ Uncovering User Interest from Biased and Noised Watch Time in Video + Recommendation + + +
+ In video recommendation, watch time is commonly adopted as an indicator
+of user interest. However, watch time is not only influenced by the matching of
+users' interests but also by other factors, such as duration bias and noisy
+watching. Duration bias refers to the tendency for users to spend more time on
+videos with longer durations, regardless of their actual interest level. Noisy
+watching, on the other hand, describes users taking time to determine whether
+they like a video or not, which can result in users spending time watching
+videos they do not like. Consequently, the existence of duration bias and noisy
+watching makes watch time an inadequate label for indicating user interest.
+Furthermore, current methods primarily address duration bias and ignore the
+impact of noisy watching, which may limit their effectiveness in uncovering
+user interest from watch time. In this study, we first analyze the generation
+mechanism of users' watch time from a unified causal viewpoint. Specifically,
+we consider watch time to be a mixture of the user's actual interest level,
+the duration-biased watch time, and the noisy watch time. To mitigate both the
+duration bias and noisy watching, we propose Debiased and Denoised watch time
+Correction (D$^2$Co), which can be divided into two steps: First, we employ a
+duration-wise Gaussian Mixture Model plus a frequency-weighted moving average
+to estimate the bias and noise terms; then we utilize a sensitivity-controlled
+correction function to separate the user interest from the watch time, which is
+robust to the estimation error of bias and noise terms. The experiments on two
+public video recommendation datasets and online A/B testing indicate the
+effectiveness of the proposed method.
+
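+ As a loose illustration of the first estimation step described above (a
+duration-wise Gaussian mixture over watch times), and not the authors' D$^2$Co
+implementation: bucket videos by duration, fit a small Gaussian mixture to the
+watch times in each bucket, and treat the component means as crude proxies for
+the noise and bias terms. The bucket edges, component count, and synthetic data
+are assumptions; the frequency-weighted moving average step is omitted.
+
+```python
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(42)
+durations = rng.uniform(10, 600, size=5000)                  # video length in seconds
+# synthetic watch times: a "noisy watching" mode near zero plus an interest-driven mode
+watch = np.where(rng.random(5000) < 0.3,
+                 rng.exponential(5, 5000),
+                 rng.uniform(0.2, 1.0, 5000) * durations)
+
+bucket_edges = np.quantile(durations, np.linspace(0, 1, 11))  # 10 duration buckets
+bucket_ids = np.clip(np.digitize(durations, bucket_edges) - 1, 0, 9)
+
+for b in range(10):
+    times = watch[bucket_ids == b].reshape(-1, 1)
+    gmm = GaussianMixture(n_components=2, random_state=0).fit(times)
+    means = np.sort(gmm.means_.ravel())
+    # low-mean component ~ noisy watching, high-mean component ~ duration-biased watch time
+    print(f"bucket {b}: noise ~ {means[0]:.1f}s, biased watch ~ {means[1]:.1f}s")
+```
+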
+
+ comment: Accepted by Recsys'23 +
+
+
+
+
+ + ☆ Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme + Detection ACM MM + + +
+ Hateful meme detection is a challenging multimodal task that requires +comprehension of both vision and language, as well as cross-modal interactions. +Recent studies have tried to fine-tune pre-trained vision-language models +(PVLMs) for this task. However, with increasing model sizes, it becomes +important to leverage powerful PVLMs more efficiently, rather than simply +fine-tuning them. Recently, researchers have attempted to convert meme images +into textual captions and prompt language models for predictions. This approach +has shown good performance but suffers from non-informative image captions. +Considering the two factors mentioned above, we propose a probing-based +captioning approach to leverage PVLMs in a zero-shot visual question answering +(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful +content-related questions and use the answers as image captions (which we call +Pro-Cap), so that the captions contain information critical for hateful content +detection. The good performance of models with Pro-Cap on three benchmarks +validates the effectiveness and generalization of the proposed method. + +
+
+ comment: Camera-ready for ACM MM 2023
+
+
+
+
+ + ☆ Group Identification via Transitional Hypergraph Convolution with + Cross-view Self-supervised Learning CIKM'23 + + +
+ With the proliferation of social media, a growing number of users search for +and join group activities in their daily life. This develops a need for the +study on the group identification (GI) task, i.e., recommending groups to +users. The major challenge in this task is how to predict users' preferences +for groups based on not only previous group participation of users but also +users' interests in items. Although recent developments in Graph Neural +Networks (GNNs) accomplish embedding multiple types of objects in graph-based +recommender systems, they, however, fail to address this GI problem +comprehensively. In this paper, we propose a novel framework named Group +Identification via Transitional Hypergraph Convolution with Graph +Self-supervised Learning (GTGS). We devise a novel transitional hypergraph +convolution layer to leverage users' preferences for items as prior knowledge +when seeking their group preferences. To construct comprehensive user/group +representations for GI task, we design the cross-view self-supervised learning +to encourage the intrinsic consistency between item and group preferences for +each user, and the group-based regularization to enhance the distinction among +group embeddings. Experimental results on three benchmark datasets verify the +superiority of GTGS. Additional detailed investigations are conducted to +demonstrate the effectiveness of the proposed framework. + +
+
+ comment: 11 pages. Accepted by CIKM'23 +
+
+
+
+
+ + ♻ ☆ LLM-Rec: Personalized Recommendation via Prompting Large Language Models + + +
+ We investigate various prompting strategies for enhancing personalized +recommendation performance with large language models (LLMs) through input +augmentation. Our proposed approach, termed LLM-Rec, encompasses four distinct +prompting strategies: (1) basic prompting, (2) recommendation-driven prompting, +(3) engagement-guided prompting, and (4) recommendation-driven + +engagement-guided prompting. Our empirical experiments show that incorporating +the augmented input text generated by LLM leads to improved recommendation +performance. Recommendation-driven and engagement-guided prompting strategies +are found to elicit LLM's understanding of global and local item +characteristics. This finding highlights the importance of leveraging diverse +prompts and input augmentation techniques to enhance the recommendation +capabilities with LLMs. + +
+
+
+
+
+ + ♻ ☆ Editing Language Model-based Knowledge Graph Embeddings + + +
+ Recent decades have witnessed the empirical success of framing Knowledge
+Graph (KG) embeddings via language models. However, language model-based KG
+embeddings are usually deployed as static artifacts, making them difficult to
+modify after deployment without re-training. To address this issue, we propose
+a new task of editing language model-based KG embeddings in this paper. This
+task is designed to facilitate rapid, data-efficient updates to KG embeddings
+without compromising the performance of other aspects. We build four new
+datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and evaluate several
+knowledge editing baselines, demonstrating the limited ability of previous
+models to handle the proposed challenging task. We further propose a simple yet
+strong baseline dubbed KGEditor, which utilizes additional parametric layers of
+a hypernetwork to edit/add facts. Our comprehensive experimental results reveal
+that KGEditor excels in updating specific facts without impacting the overall
+performance, even when faced with limited training resources. Code and datasets
+are available at https://github.com/zjunlp/PromptKG/tree/main/deltaKG.
+
+
+ comment: Work in progress and the project website is + https://zjunlp.github.io/project/KGE_Editing/ +
+
+
+
+
+ + ♻ ☆ A Survey on Point-of-Interest Recommendations Leveraging Heterogeneous + Data + + +
+ Tourism is an important application domain for recommender systems. In this +domain, recommender systems are for example tasked with providing personalized +recommendations for transportation, accommodation, points-of-interest (POIs), +or tourism services. Among these tasks, in particular the problem of +recommending POIs that are of likely interest to individual tourists has gained +growing attention in recent years. Providing POI recommendations to tourists +\emph{during their trip} can however be especially challenging due to the +variability of the users' context. With the rapid development of the Web and +today's multitude of online services, vast amounts of data from various sources +have become available, and these heterogeneous data sources represent a huge +potential to better address the challenges of in-trip POI recommendation +problems. In this work, we provide a comprehensive survey of published research +on POI recommendation between 2017 and 2022 from the perspective of +heterogeneous data sources. Specifically, we investigate which types of data +are used in the literature and which technical approaches and evaluation +methods are predominant. Among other aspects, we find that today's research +works often focus on a narrow range of data sources, leaving great potential +for future works that better utilize heterogeneous data sources and diverse +data types for improved in-trip recommendations. + +
+
+ comment: 35 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ SPM: Structured Pretraining and Matching Architectures for Relevance + Modeling in Meituan Search CIKM '23 + + +
+ In e-commerce search, relevance between query and documents is an essential +requirement for satisfying user experience. Different from traditional +e-commerce platforms that offer products, users search on life service +platforms such as Meituan mainly for product providers, which usually have +abundant structured information, e.g. name, address, category, thousands of +products. Modeling search relevance with these rich structured contents is +challenging due to the following issues: (1) there is language distribution +discrepancy among different fields of structured document, making it difficult +to directly adopt off-the-shelf pretrained language model based methods like +BERT. (2) different fields usually have different importance and their length +vary greatly, making it difficult to extract document information helpful for +relevance matching. + To tackle these issues, in this paper we propose a novel two-stage +pretraining and matching architecture for relevance matching with rich +structured documents. At pretraining stage, we propose an effective pretraining +method that employs both query and multiple fields of document as inputs, +including an effective information compression method for lengthy fields. At +relevance matching stage, a novel matching method is proposed by leveraging +domain knowledge in search query to generate more effective document +representations for relevance scoring. Extensive offline experiments and online +A/B tests on millions of users verify that the proposed architectures +effectively improve the performance of relevance modeling. The model has +already been deployed online, serving the search traffic of Meituan for over a +year. + +
+
+ comment: Accepted by CIKM '23 +
+
+
+
+
+ + ♻ ☆ Beyond Semantics: Learning a Behavior Augmented Relevance Model with + Self-supervised Learning + + +
+ Relevance modeling aims to locate desirable items for corresponding queries, +which is crucial for search engines to ensure user experience. Although most +conventional approaches address this problem by assessing the semantic +similarity between the query and item, pure semantic matching is not +everything. + +
+
+ comment: Partial content +
+
+
+
+
+
+
+
+ + Machine Learning 114 + +
+
+
+ + ☆ Proprioceptive Learning with Soft Polyhedral Networks + + +
+ Proprioception is the "sixth sense" that detects limb postures with motor +neurons. It requires a natural integration between the musculoskeletal systems +and sensory receptors, which is challenging among modern robots that aim for +lightweight, adaptive, and sensitive designs at a low cost. Here, we present +the Soft Polyhedral Network with an embedded vision for physical interactions, +capable of adaptive kinesthesia and viscoelastic proprioception by learning +kinetic features. This design enables passive adaptations to omni-directional +interactions, visually captured by a miniature high-speed motion tracking +system embedded inside for proprioceptive learning. The results show that the +soft network can infer real-time 6D forces and torques with accuracies of +0.25/0.24/0.35 N and 0.025/0.034/0.006 Nm in dynamic interactions. We also +incorporate viscoelasticity in proprioception during static adaptation by +adding a creep and relaxation modifier to refine the predicted results. The +proposed soft network combines simplicity in design, omni-adaptation, and +proprioceptive sensing with high accuracy, making it a versatile solution for +robotics at a low cost with more than 1 million use cycles for tasks such as +sensitive and competitive grasping, and touch-based geometry reconstruction. +This study offers new insights into vision-based proprioception for soft robots +in adaptive grasping, soft manipulation, and human-robot interaction. + +
+
+ comment: 20 pages, 10 figures, 2 tables, submitted to the International + Journal of Robotics Research for review +
+
+
+
+
+ + ☆ Can Transformers Learn Optimal Filtering for Unknown Systems? + + +
+ Transformers have demonstrated remarkable success in natural language
+processing; however, their potential remains mostly unexplored for problems
+arising in dynamical systems. In this work, we investigate the optimal output
+estimation problem using transformers, which generate output predictions using
+all past outputs. We train the transformer using various systems drawn from a
+prior distribution and then evaluate its performance on previously unseen
+systems from the same distribution. As a result, the obtained transformer acts
+like a prediction algorithm that learns in-context and quickly adapts to and
+predicts well for different systems - thus we call it meta-output-predictor
+(MOP). MOP matches the performance of the optimal output estimator, based on
+the Kalman filter, for most linear dynamical systems even though it does not
+have access to a model. We observe via extensive numerical experiments that MOP
+also performs well in challenging scenarios with non-i.i.d. noise, time-varying
+dynamics, and nonlinear dynamics like a quadrotor system with unknown
+parameters. To further support this observation, in the second part of the
+paper, we provide statistical guarantees on the performance of MOP and quantify
+the required amount of training to achieve a desired excess risk at test time.
+Finally, we point out some limitations of MOP by identifying two classes of
+problems on which MOP fails to perform well, highlighting the need for
+caution when using transformers for control and estimation.
+
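+ For context on the baseline the abstract compares against, here is a textbook
+Kalman filter producing one-step-ahead output predictions for a known linear
+system; this is the classical estimator MOP is measured against, not the
+transformer model itself. The system matrices and noise covariances are
+arbitrary examples.
+
+```python
+import numpy as np
+
+# simple known linear system: x_{t+1} = A x_t + w_t,  y_t = C x_t + v_t
+A = np.array([[0.9, 0.1], [0.0, 0.8]])
+C = np.array([[1.0, 0.0]])
+Q = 0.01 * np.eye(2)     # process noise covariance
+R = 0.1 * np.eye(1)      # measurement noise covariance
+
+def kalman_predictions(ys: np.ndarray) -> np.ndarray:
+    """Return one-step-ahead output predictions y_hat_{t+1|t} for a sequence of observations."""
+    x_hat = np.zeros(2)
+    P = np.eye(2)
+    preds = []
+    for y in ys:
+        # measurement update with the current observation
+        S = C @ P @ C.T + R
+        K = P @ C.T @ np.linalg.inv(S)
+        x_hat = x_hat + K @ (y - C @ x_hat)
+        P = (np.eye(2) - K @ C) @ P
+        # time update, then predict the next output
+        x_hat = A @ x_hat
+        P = A @ P @ A.T + Q
+        preds.append((C @ x_hat).item())
+    return np.array(preds)
+
+print(kalman_predictions(np.array([[0.5], [0.4], [0.7]])))
+```
+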
+
+
+
+
+ + ☆ Painter: Teaching Auto-regressive Language Models to Draw Sketches + + +
+ Large language models (LLMs) have made tremendous progress in natural +language understanding and they have also been successfully adopted in other +domains such as computer vision, robotics, reinforcement learning, etc. In this +work, we apply LLMs to image generation tasks by directly generating the +virtual brush strokes to paint an image. We present Painter, an LLM that can +convert user prompts in text description format to sketches by generating the +corresponding brush strokes in an auto-regressive way. We construct Painter +based on off-the-shelf LLM that is pre-trained on a large text corpus, by +fine-tuning it on the new task while preserving language understanding +capabilities. We create a dataset of diverse multi-object sketches paired with +textual prompts that covers several object types and tasks. Painter can +generate sketches from text descriptions, remove objects from canvas, and +detect and classify objects in sketches. Although this is an unprecedented +pioneering work in using LLMs for auto-regressive image generation, the results +are very encouraging. + +
+
+
+
+
+ + ☆ Two-and-a-half Order Score-based Model for Solving 3D Ill-posed Inverse + Problems + + +
+ Computed Tomography (CT) and Magnetic Resonance Imaging (MRI) are crucial +technologies in the field of medical imaging. Score-based models have proven to +be effective in addressing different inverse problems encountered in CT and +MRI, such as sparse-view CT and fast MRI reconstruction. However, these models +face challenges in achieving accurate three dimensional (3D) volumetric +reconstruction. The existing score-based models primarily focus on +reconstructing two dimensional (2D) data distribution, leading to +inconsistencies between adjacent slices in the reconstructed 3D volumetric +images. To overcome this limitation, we propose a novel two-and-a-half order +score-based model (TOSM). During the training phase, our TOSM learns data +distributions in 2D space, which reduces the complexity of training compared to +directly working on 3D volumes. However, in the reconstruction phase, the TOSM +updates the data distribution in 3D space, utilizing complementary scores along +three directions (sagittal, coronal, and transaxial) to achieve a more precise +reconstruction. The development of TOSM is built on robust theoretical +principles, ensuring its reliability and efficacy. Through extensive +experimentation on large-scale sparse-view CT and fast MRI datasets, our method +demonstrates remarkable advancements and attains state-of-the-art results in +solving 3D ill-posed inverse problems. Notably, the proposed TOSM effectively +addresses the inter-slice inconsistency issue, resulting in high-quality 3D +volumetric reconstruction. + +
+
+
+
+
+ + ☆ Autoencoding a Soft Touch to Learn Grasping from On-land to Underwater + + +
+ Robots play a critical role as the physical agent of human operators in +exploring the ocean. However, it remains challenging to grasp objects reliably +while fully submerging under a highly pressurized aquatic environment with +little visible light, mainly due to the fluidic interference on the tactile +mechanics between the finger and object surfaces. This study investigates the +transferability of grasping knowledge from on-land to underwater via a +vision-based soft robotic finger that learns 6D forces and torques (FT) using a +Supervised Variational Autoencoder (SVAE). A high-framerate camera captures the +whole-body deformations while a soft robotic finger interacts with physical +objects on-land and underwater. Results show that the trained SVAE model +learned a series of latent representations of the soft mechanics transferrable +from land to water, presenting a superior adaptation to the changing +environments against commercial FT sensors. Soft, delicate, and reactive +grasping enabled by tactile intelligence enhances the gripper's underwater +interaction with improved reliability and robustness at a much-reduced cost, +paving the path for learning-based intelligent grasping to support fundamental +scientific discoveries in environmental and ocean research. + +
+
+ comment: 17 pages, 5 figures, 1 table, submitted to Advanced Intelligent + Systems for review +
+
+
+
+
+ + ☆ ResBuilder: Automated Learning of Depth with Residual Structures + + +
+ In this work, we develop a neural architecture search algorithm, termed
+Resbuilder, that builds ResNet architectures from scratch to achieve high
+accuracy at moderate computational cost. It can also be used to modify existing
+architectures and has the capability to remove and insert ResNet blocks,
+thereby searching for suitable architectures in the space of ResNet
+architectures. In our experiments on different image classification datasets,
+Resbuilder achieves close to state-of-the-art performance while saving
+computational cost compared to off-the-shelf ResNets. Notably, we tune the
+parameters once on CIFAR10, which yields a suitable default choice for all
+other datasets. We demonstrate that this property generalizes even to
+industrial applications by applying our method with default parameters on a
+proprietary fraud detection dataset.
+
+
+
+
+
+ + ☆ Time Travel in LLMs: Tracing Data Contamination in Large Language Models + + +
+ Data contamination, i.e., the presence of test data from downstream tasks in +the training data of large language models (LLMs), is a potential major issue +in understanding LLMs' effectiveness on other tasks. We propose a +straightforward yet effective method for identifying data contamination within +LLMs. At its core, our approach starts by identifying potential contamination +in individual instances that are drawn from a small random sample; using this +information, our approach then assesses if an entire dataset partition is +contaminated. To estimate contamination of individual instances, we employ +"guided instruction:" a prompt consisting of the dataset name, partition type, +and the initial segment of a reference instance, asking the LLM to complete it. +An instance is flagged as contaminated if the LLM's output either exactly or +closely matches the latter segment of the reference. To understand if an entire +partition is contaminated, we propose two ideas. The first idea marks a dataset +partition as contaminated if the average overlap score with the reference +instances (as measured by ROUGE or BLEURT) is statistically significantly +better with the guided instruction vs. a general instruction that does not +include the dataset and partition name. The second idea marks a dataset as +contaminated if a classifier based on GPT-4 with in-context learning prompting +marks multiple instances as contaminated. Our best method achieves an accuracy +between 92% and 100% in detecting if an LLM is contaminated with seven +datasets, containing train and test/validation partitions, when contrasted with +manual evaluation by human expert. Further, our findings indicate that GPT-4 is +contaminated with AG News, WNLI, and XSum datasets. + +
+
+ comment: v1 preprint +
+
+
+
+
+ + ☆ Label Propagation Techniques for Artifact Detection in Imbalanced + Classes using Photoplethysmogram Signals + + +
+ Photoplethysmogram (PPG) signals are widely used in healthcare for monitoring
+vital signs, but they are susceptible to motion artifacts that can lead to
+inaccurate interpretations. In this study, the use of label propagation
+techniques to propagate labels among PPG samples is explored, particularly in
+imbalanced class scenarios where clean PPG samples are significantly
+outnumbered by artifact-contaminated samples. With a precision of 91%, a recall
+of 90% and an F1 score of 90% for the class without artifacts, the results
+demonstrate its effectiveness in labeling a medical dataset, even when clean
+samples are rare. For the classification of artifacts, our study compares
+supervised classifiers such as conventional classifiers and neural networks
+(MLP, Transformers, FCN) with the semi-supervised label propagation algorithm.
+With a precision of 89%, a recall of 95% and an F1 score of 92%, the KNN
+supervised model gives good results, but the semi-supervised algorithm performs
+better in detecting artifacts. The findings suggest that the semi-supervised
+label propagation algorithm holds promise for artifact detection in PPG
+signals, which can enhance the reliability of PPG-based health monitoring
+systems in real-world applications.
+
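+ A minimal scikit-learn sketch of the label propagation setup described above
+(not the authors' pipeline): unlabeled PPG windows are marked with the label -1
+and the algorithm spreads the few known clean/artifact labels through a
+k-nearest-neighbor graph over the features. The placeholder features, class
+split, and neighbor count are assumptions.
+
+```python
+import numpy as np
+from sklearn.semi_supervised import LabelPropagation
+
+rng = np.random.default_rng(7)
+# placeholder features: e.g. simple statistics extracted from each PPG window
+X = rng.normal(size=(1000, 3))
+y = np.full(1000, -1)                 # -1 marks unlabeled windows
+y[:30] = 0                            # a handful of hand-labeled clean windows
+y[30:180] = 1                         # artifact-contaminated windows (majority class)
+
+model = LabelPropagation(kernel="knn", n_neighbors=7)
+model.fit(X, y)
+
+propagated = model.transduction_      # labels inferred for every window, labeled or not
+print("clean windows found:", int((propagated == 0).sum()))
+```
+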
+
+ comment: Under preparation to submit to IEEE for possible publication
+
+
+
+
+ + ☆ LLM4TS: Two-Stage Fine-Tuning for Time-Series Forecasting with + Pre-Trained LLMs + + +
+ In this work, we leverage pre-trained Large Language Models (LLMs) to enhance +time-series forecasting. Mirroring the growing interest in unifying models for +Natural Language Processing and Computer Vision, we envision creating an +analogous model for long-term time-series forecasting. Due to limited +large-scale time-series data for building robust foundation models, our +approach LLM4TS focuses on leveraging the strengths of pre-trained LLMs. By +combining time-series patching with temporal encoding, we have enhanced the +capability of LLMs to handle time-series data effectively. Inspired by the +supervised fine-tuning in chatbot domains, we prioritize a two-stage +fine-tuning process: first conducting supervised fine-tuning to orient the LLM +towards time-series data, followed by task-specific downstream fine-tuning. +Furthermore, to unlock the flexibility of pre-trained LLMs without extensive +parameter adjustments, we adopt several Parameter-Efficient Fine-Tuning (PEFT) +techniques. Drawing on these innovations, LLM4TS has yielded state-of-the-art +results in long-term forecasting. Our model has also shown exceptional +capabilities as both a robust representation learner and an effective few-shot +learner, thanks to the knowledge transferred from the pre-trained LLM. + +
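+ As a small illustration of the time-series patching mentioned above (one
+common way to make a numeric series digestible by an LLM-style backbone), and
+not the paper's exact preprocessing: slice the series into fixed-length patches
+and project each patch to the model's embedding width. Patch length, stride,
+and dimensions below are assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+class PatchEmbedding(nn.Module):
+    """Turn a univariate series (batch, length) into patch tokens (batch, n_patches, d_model)."""
+    def __init__(self, patch_len: int = 16, stride: int = 8, d_model: int = 768):
+        super().__init__()
+        self.patch_len, self.stride = patch_len, stride
+        self.proj = nn.Linear(patch_len, d_model)
+
+    def forward(self, series: torch.Tensor) -> torch.Tensor:
+        patches = series.unfold(dimension=-1, size=self.patch_len, step=self.stride)
+        return self.proj(patches)     # each patch becomes one "token" for the frozen LLM
+
+x = torch.randn(4, 512)               # 4 series of length 512
+tokens = PatchEmbedding()(x)
+print(tokens.shape)                    # torch.Size([4, 63, 768])
+```
+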
+
+
+
+
+ + ☆ An Expert's Guide to Training Physics-informed Neural Networks + + +
+ Physics-informed neural networks (PINNs) have been popularized as a deep +learning framework that can seamlessly synthesize observational data and +partial differential equation (PDE) constraints. Their practical effectiveness +however can be hampered by training pathologies, but also oftentimes by poor +choices made by users who lack deep learning expertise. In this paper we +present a series of best practices that can significantly improve the training +efficiency and overall accuracy of PINNs. We also put forth a series of +challenging benchmark problems that highlight some of the most prominent +difficulties in training PINNs, and present comprehensive and fully +reproducible ablation studies that demonstrate how different architecture +choices and training strategies affect the test accuracy of the resulting +models. We show that the methods and guiding principles put forth in this study +lead to state-of-the-art results and provide strong baselines that future +studies should use for comparison purposes. To this end, we also release a +highly optimized library in JAX that can be used to reproduce all results +reported in this paper, enable future research studies, as well as facilitate +easy adaptation to new use-case scenarios. + +
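+ To make the discussion concrete, here is a bare-bones PINN for the 1-D
+Poisson problem u''(x) = -sin(x) with zero boundary conditions; it only
+illustrates the generic residual-plus-boundary loss that such best practices
+refine, and is not taken from the released JAX library. Network size, optimizer
+settings, and collocation sampling are assumptions.
+
+```python
+import torch
+import torch.nn as nn
+
+torch.manual_seed(0)
+net = nn.Sequential(nn.Linear(1, 64), nn.Tanh(),
+                    nn.Linear(64, 64), nn.Tanh(),
+                    nn.Linear(64, 1))
+opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+
+def pde_residual(x: torch.Tensor) -> torch.Tensor:
+    """Residual of u''(x) + sin(x) = 0, computed with autograd."""
+    x = x.requires_grad_(True)
+    u = net(x)
+    du = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
+    d2u = torch.autograd.grad(du.sum(), x, create_graph=True)[0]
+    return d2u + torch.sin(x)
+
+boundary = torch.tensor([[0.0], [torch.pi]])   # u(0) = u(pi) = 0
+for step in range(5000):
+    x_col = torch.rand(128, 1) * torch.pi      # random collocation points in (0, pi)
+    loss = pde_residual(x_col).pow(2).mean() + net(boundary).pow(2).mean()
+    opt.zero_grad()
+    loss.backward()
+    opt.step()
+
+# exact solution is u(x) = sin(x); inspect the midpoint value (should approach 1)
+print(net(torch.tensor([[torch.pi / 2]])).item())
+```
+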
+
+ comment: 36 pages, 25 figures, 13 tables +
+
+
+
+
+ + ☆ On Neural Quantum Support Vector Machines + + +
+ In \cite{simon2023algorithms} we introduced four algorithms for the training +of neural support vector machines (NSVMs) and demonstrated their feasibility. +In this note we introduce neural quantum support vector machines, that is, +NSVMs with a quantum kernel, and extend our results to this setting. + +
+
+ comment: 13 pages, 0 figures. arXiv admin note: substantial text overlap with + arXiv:2308.07204 +
+
+
+
+
+ + ☆ Hierarchical Uncertainty Estimation for Medical Image Segmentation + Networks + + +
+ Learning a medical image segmentation model is an inherently ambiguous task, +as uncertainties exist in both images (noise) and manual annotations (human +errors and bias) used for model training. To build a trustworthy image +segmentation model, it is important to not just evaluate its performance but +also estimate the uncertainty of the model prediction. Most state-of-the-art +image segmentation networks adopt a hierarchical encoder architecture, +extracting image features at multiple resolution levels from fine to coarse. In +this work, we leverage this hierarchical image representation and propose a +simple yet effective method for estimating uncertainties at multiple levels. +The multi-level uncertainties are modelled via the skip-connection module and +then sampled to generate an uncertainty map for the predicted image +segmentation. We demonstrate that a deep learning segmentation network such as +U-net, when implemented with such hierarchical uncertainty estimation module, +can achieve a high segmentation performance, while at the same time provide +meaningful uncertainty maps that can be used for out-of-distribution detection. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Accurate synthesis of Dysarthric Speech for ASR data augmentation + + +
+ Dysarthria is a motor speech disorder often characterized by reduced speech
+intelligibility through slow, uncoordinated control of speech production
+muscles. Automatic Speech Recognition (ASR) systems can help dysarthric talkers
+communicate more effectively. However, robust dysarthria-specific ASR requires
+a significant amount of training speech, which is not readily available for
+dysarthric talkers. This paper presents a new dysarthric speech synthesis
+method for the purpose of ASR training data augmentation. Differences in
+prosodic and acoustic characteristics of dysarthric spontaneous speech at
+varying severity levels are important components for dysarthric speech
+modeling, synthesis, and augmentation. For dysarthric speech synthesis, a
+modified neural multi-talker TTS is implemented by adding a dysarthria severity
+level coefficient and a pause insertion model to synthesize dysarthric speech
+for varying severity levels. To evaluate the effectiveness of the synthesized
+training data for ASR, dysarthria-specific speech recognition was used. Results
+show that a DNN-HMM model trained on additional synthetic dysarthric speech
+achieves a WER improvement of 12.2% compared to the baseline, and that the
+addition of the severity level and pause insertion controls decreases WER by
+6.5%, showing the effectiveness of adding these parameters. Overall results on
+the TORGO database demonstrate that using dysarthric synthetic speech to
+increase the amount of dysarthric-patterned speech for training has a
+significant impact on dysarthric ASR systems. In addition, we have conducted a
+subjective evaluation to evaluate the dysarthric-ness and similarity of
+synthesized speech. Our subjective evaluation shows that the perceived
+dysarthric-ness of synthesized speech is similar to that of true dysarthric
+speech, especially for higher levels of dysarthria.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2201.11571 +
+
+
+
+
+ + ☆ Eliciting Risk Aversion with Inverse Reinforcement Learning via + Interactive Questioning + + +
+ This paper proposes a novel framework for identifying an agent's risk
+aversion using interactive questioning. Our study is conducted in two
+scenarios: a one-period case and an infinite horizon case. In the one-period
+case, we assume that the agent's risk aversion is characterized by a cost
+function of the state and a distortion risk measure. In the infinite horizon
+case, we model risk aversion with an additional component, a discount factor.
+Assuming access to a finite set of candidates containing the agent's true
+risk aversion, we show that asking the agent to demonstrate her optimal
+policies in various environments, which may depend on her previous answers, is
+an effective means of identifying the agent's risk aversion. Specifically, we
+prove that the agent's risk aversion can be identified as the number of
+questions tends to infinity, provided that the questions are randomly designed.
+We also develop an algorithm for designing optimal questions and provide
+empirical evidence that our method learns risk aversion significantly faster
+than randomly designed questions in simulations. Our framework has important
+applications in robo-advising and provides a new approach for identifying an
+agent's risk preferences.
+
+
+
+
+
+ + ☆ Digital twinning of cardiac electrophysiology models from the surface + ECG: a geodesic backpropagation approach + + +
+ The eikonal equation has become an indispensable tool for modeling cardiac +electrical activation accurately and efficiently. In principle, by matching +clinically recorded and eikonal-based electrocardiograms (ECGs), it is possible +to build patient-specific models of cardiac electrophysiology in a purely +non-invasive manner. Nonetheless, the fitting procedure remains a challenging +task. The present study introduces a novel method, Geodesic-BP, to solve the +inverse eikonal problem. Geodesic-BP is well-suited for GPU-accelerated machine +learning frameworks, allowing us to optimize the parameters of the eikonal +equation to reproduce a given ECG. We show that Geodesic-BP can reconstruct a +simulated cardiac activation with high accuracy in a synthetic test case, even +in the presence of modeling inaccuracies. Furthermore, we apply our algorithm +to a publicly available dataset of a rabbit model, with very positive results. +Given the future shift towards personalized medicine, Geodesic-BP has the +potential to help in future functionalizations of cardiac models meeting +clinical time constraints while maintaining the physiological accuracy of +state-of-the-art cardiac models. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Explainable AI for clinical risk prediction: a survey of concepts, + methods, and modalities + + +
+ Recent advancements in AI applications to healthcare have shown incredible +promise in surpassing human performance in diagnosis and disease prognosis. +With the increasing complexity of AI models, however, concerns have arisen regarding their +opacity, potential biases, and the need for interpretability. To ensure trust +and reliability in AI systems, especially in clinical risk prediction models, +explainability becomes crucial. Explainability usually refers to an AI +system's ability to provide a robust interpretation of its decision-making +logic or the decisions themselves to human stakeholders. In clinical risk +prediction, other aspects of explainability like fairness, bias, trust, and +transparency also represent important concepts beyond just interpretability. In +this review, we address the relationship between these concepts as they are +often used together or interchangeably. This review also discusses recent +progress in developing explainable models for clinical risk prediction, +highlighting the importance of quantitative and clinical evaluation and +validation across multiple common modalities in clinical practice. It +emphasizes the need for external validation and the combination of diverse +interpretability methods to enhance trust and fairness. Adopting rigorous +testing, such as using synthetic datasets with known generative factors, can +further improve the reliability of explainability methods. Open access and +code-sharing resources are essential for transparency and reproducibility, +enabling the growth and trustworthiness of explainable research. While +challenges exist, an end-to-end approach to explainability in clinical risk +prediction, incorporating stakeholders from clinicians to developers, is +essential for success. + +
+
+
+
+
+ + ☆ Content-based Recommendation Engine for Video Streaming Platform + + +
+ Recommendation engines suggest content, products, or services to the user by +using machine learning algorithms. This paper proposes a content-based +recommendation engine for providing video suggestions to the user based on their +previous interests and choices. We use the TF-IDF text vectorization method to +determine the relevance of words in a document. We then find the +similarity between items by calculating the cosine similarity between them. +Finally, the engine recommends videos to the users based on the obtained +similarity score value. In addition, we measure the engine's performance +by computing the precision, recall, and F1 score of the proposed system. + +
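A minimal sketch of the pipeline described above, TF-IDF vectorization followed by cosine similarity ranking; the toy video catalogue and the field names ("title", "description") are illustrative assumptions, not taken from the paper.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

videos = [
    {"title": "Intro to Python", "description": "learn python programming basics"},
    {"title": "Advanced Python", "description": "python decorators generators tips"},
    {"title": "Cooking Pasta", "description": "how to cook italian pasta at home"},
]

corpus = [v["description"] for v in videos]
tfidf = TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(corpus)        # (n_videos, n_terms) sparse TF-IDF matrix
similarity = cosine_similarity(matrix)      # (n_videos, n_videos) pairwise scores

def recommend(video_index, top_k=2):
    # Rank all other videos by cosine similarity to the query video.
    scores = sorted(enumerate(similarity[video_index]), key=lambda s: s[1], reverse=True)
    return [videos[i]["title"] for i, _ in scores if i != video_index][:top_k]

print(recommend(0))  # videos most similar to "Intro to Python"
```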
+
+
+
+
+ + ☆ Fast Uncertainty Quantification of Spent Nuclear Fuel with Neural + Networks + + +
+ The accurate calculation and uncertainty quantification of the +characteristics of spent nuclear fuel (SNF) play a crucial role in ensuring the +safety, efficiency, and sustainability of nuclear energy production, waste +management, and nuclear safeguards. State of the art physics-based models, +while reliable, are computationally intensive and time-consuming. This paper +presents a surrogate modeling approach using neural networks (NN) to predict a +number of SNF characteristics with reduced computational costs compared to +physics-based models. An NN is trained using data generated from CASMO5 lattice +calculations. The trained NN accurately predicts decay heat and nuclide +concentrations of SNF, as a function of key input parameters, such as +enrichment, burnup, cooling time between cycles, mean boron concentration and +fuel temperature. The model is validated against physics-based decay heat +simulations and measurements of different uranium oxide fuel assemblies from +two different pressurized water reactors. In addition, the NN is used to +perform sensitivity analysis and uncertainty quantification. The results are in +very good alignment to CASMO5, while the computational costs (taking into +account the costs of generating training samples) are reduced by a factor of 10 +or more. Our findings demonstrate the feasibility of using NNs as surrogate +models for fast characterization of SNF, providing a promising avenue for +improving computational efficiency in assessing nuclear fuel behavior and +associated risks. + +
+
+
+
+
+ + ☆ Continuous Sweep: an improved, binary quantifier + + +
+ Quantification is a supervised machine learning task, focused on estimating +the class prevalence of a dataset rather than labeling its individual +observations. We introduce Continuous Sweep, a new parametric binary quantifier +inspired by the well-performing Median Sweep. Median Sweep is currently one of +the best binary quantifiers, but we have changed this quantifier on three +points, namely 1) using parametric class distributions instead of empirical +distributions, 2) optimizing decision boundaries instead of applying discrete +decision rules, and 3) calculating the mean instead of the median. We derive +analytic expressions for the bias and variance of Continuous Sweep under +general model assumptions. This is one of the first theoretical contributions +in the field of quantification learning. Moreover, these derivations enable us +to find the optimal decision boundaries. Finally, our simulation study shows +that Continuous Sweep outperforms Median Sweep in a wide range of situations. + +
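For orientation, a hedged sketch of the Median Sweep baseline the paper builds on (not the proposed Continuous Sweep itself): adjusted classify-and-count is applied at many decision thresholds, and the median of the resulting prevalence estimates is returned. The variable names and threshold grid are assumptions for illustration.

```python
import numpy as np

def adjusted_count(scores_test, threshold, tpr, fpr):
    """Adjusted classify-and-count: correct the raw positive rate using the
    classifier's true/false positive rates estimated on labelled data."""
    raw = np.mean(scores_test >= threshold)
    if tpr - fpr <= 0:
        return None                               # threshold carries no signal
    return float(np.clip((raw - fpr) / (tpr - fpr), 0.0, 1.0))

def median_sweep(scores_test, scores_val, y_val, thresholds):
    estimates = []
    for t in thresholds:
        tpr = np.mean(scores_val[y_val == 1] >= t)
        fpr = np.mean(scores_val[y_val == 0] >= t)
        est = adjusted_count(scores_test, t, tpr, fpr)
        if est is not None:
            estimates.append(est)
    return float(np.median(estimates))            # class prevalence estimate
```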
+
+
+
+
+ + ☆ Precision and Recall Reject Curves for Classification + + +
+ For some classification scenarios, it is desirable to use only those +classification instances that a trained model associates with a high certainty. +To obtain such high-certainty instances, previous work has proposed +accuracy-reject curves. Reject curves allow evaluating and comparing the +performance of different certainty measures over a range of thresholds for +accepting or rejecting classifications. However, the accuracy may not be the +most suitable evaluation metric for all applications, and instead precision or +recall may be preferable. This is the case, for example, for data with +imbalanced class distributions. We therefore propose reject curves that +evaluate precision and recall, the recall-reject curve and the precision-reject +curve. Using prototype-based classifiers from learning vector quantization, we +first validate the proposed curves on artificial benchmark data against the +accuracy reject curve as a baseline. We then show on imbalanced benchmarks and +medical, real-world data that for these scenarios, the proposed precision- and +recall-reject curves yield more accurate insights into classifier performance than +accuracy reject curves. + +
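A hedged sketch of how such a precision-reject curve can be traced: sort predictions by certainty, progressively reject the least certain fraction, and report precision on what remains. The function and argument names are illustrative assumptions.

```python
import numpy as np

def precision_reject_curve(certainty, y_pred, y_true, positive=1):
    order = np.argsort(-certainty)              # most certain predictions first
    y_pred, y_true = y_pred[order], y_true[order]
    n = len(y_true)
    rejection_rates, precisions = [], []
    for keep in range(1, n + 1):                # accept the `keep` most certain
        p, t = y_pred[:keep], y_true[:keep]
        predicted_pos = p == positive
        if predicted_pos.sum() == 0:
            continue                            # precision undefined without positives
        precisions.append(float((t[predicted_pos] == positive).mean()))
        rejection_rates.append(1.0 - keep / n)
    return np.array(rejection_rates), np.array(precisions)
```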
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ A distributed neural network architecture for dynamic sensor selection + with application to bandwidth-constrained body-sensor networks + + +
+ We propose a dynamic sensor selection approach for deep neural networks +(DNNs), which is able to derive an optimal sensor subset selection for each +specific input sample instead of a fixed selection for the entire dataset. This +dynamic selection is jointly learned with the task model in an end-to-end way, +using the Gumbel-Softmax trick to allow the discrete decisions to be learned +through standard backpropagation. We then show how we can use this dynamic +selection to increase the lifetime of a wireless sensor network (WSN) by +imposing constraints on how often each node is allowed to transmit. We further +improve performance by including a dynamic spatial filter that makes the +task-DNN more robust against the fact that it now needs to be able to handle a +multitude of possible node subsets. Finally, we explain how the selection of +the optimal channels can be distributed across the different nodes in a WSN. We +validate this method on a use case in the context of body-sensor networks, +where we use real electroencephalography (EEG) sensor data to emulate an EEG +sensor network. We analyze the resulting trade-offs between transmission load +and task accuracy. + +
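A hedged sketch of the core mechanism, learning a discrete per-sample sensor subset with the straight-through Gumbel-Softmax trick; the layer sizes, the repeated-sampling scheme for selecting k sensors, and the scorer architecture are illustrative assumptions rather than the paper's exact design.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DynamicSensorSelector(nn.Module):
    def __init__(self, n_sensors, n_select, feat_dim):
        super().__init__()
        self.scorer = nn.Linear(n_sensors * feat_dim, n_sensors)  # per-sample sensor scores
        self.n_select = n_select

    def forward(self, x, tau=1.0):
        # x: (batch, n_sensors, feat_dim)
        logits = self.scorer(x.flatten(1))                         # (batch, n_sensors)
        samples = [F.gumbel_softmax(logits, tau=tau, hard=True)    # differentiable one-hot draws
                   for _ in range(self.n_select)]
        mask = torch.clamp(torch.stack(samples).sum(0), max=1.0)   # at most n_select active sensors
        return x * mask.unsqueeze(-1)                              # zero out unselected sensors
```

The hard one-hot samples keep the forward pass discrete while gradients flow through the soft relaxation, which is what lets the selection be trained jointly with the task network via standard backpropagation.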
+
+
+
+
+ + ☆ PDPK: A Framework to Synthesise Process Data and Corresponding + Procedural Knowledge for Manufacturing + + +
+ Procedural knowledge describes how to accomplish tasks and mitigate problems. +Such knowledge is commonly held by domain experts, e.g. operators in +manufacturing who adjust parameters to achieve quality targets. To the best of +our knowledge, no real-world datasets containing process data and corresponding +procedural knowledge are publicly available, possibly due to corporate +apprehensions regarding the loss of knowledge advances. Therefore, we provide a +framework to generate synthetic datasets that can be adapted to different +domains. The design choices are inspired by two real-world datasets of +procedural knowledge we have access to. Apart from containing representations +of procedural knowledge in Resource Description Framework (RDF)-compliant +knowledge graphs, the framework simulates parametrisation processes and +provides consistent process data. We compare established embedding methods on +the resulting knowledge graphs, detailing which out-of-the-box methods have the +potential to represent procedural knowledge. This provides a baseline which can +be used to increase the comparability of future work. Furthermore, we validate +the overall characteristics of a synthesised dataset by comparing the results +to those achievable on a real-world dataset. The framework and evaluation code, +as well as the dataset used in the evaluation, are available open source. + +
+
+
+
+
+ + ☆ Dual-Branch Temperature Scaling Calibration for Long-Tailed Recognition + + +
+ The calibration of deep neural networks is currently receiving widespread +attention and research. Miscalibration usually leads to overconfidence of the +model. Moreover, under long-tailed data distributions, the +problem of miscalibration is more prominent due to the different confidence +levels of samples in minority and majority categories, resulting in +more serious overconfidence. To address this problem, some recent studies +have designed diverse temperature coefficients for different categories based +on the temperature scaling (TS) method. However, in the case of rare samples in +minority classes, the temperature coefficient is not generalizable, and there +is a large difference between the temperature coefficients of the training set +and the validation set. To solve this challenge, this paper proposes a +dual-branch temperature scaling calibration model (Dual-TS), which considers the +diversities in temperature parameters of different categories and the +non-generalizability of temperature parameters for rare samples in minority +classes simultaneously. Moreover, we noticed that the traditional calibration +evaluation metric, Expected Calibration Error (ECE), gives a higher weight to +low-confidence samples in the minority classes, which leads to inaccurate +evaluation of model calibration. Therefore, we also propose Equal Sample Bin +Expected Calibration Error (Esbin-ECE) as a new calibration evaluation metric. +Through experiments, we demonstrate that our model achieves state-of-the-art results +in both traditional ECE and Esbin-ECE metrics. + +
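For reference, a hedged sketch of the two standard building blocks the paper extends, single-temperature scaling fitted on held-out logits and equal-width-bin ECE; this is the vanilla TS/ECE setup, not the proposed dual-branch model or Esbin-ECE.

```python
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.special import softmax

def nll(T, logits, labels):
    # Negative log-likelihood of temperature-scaled probabilities.
    probs = softmax(logits / T, axis=1)
    return -np.mean(np.log(probs[np.arange(len(labels)), labels] + 1e-12))

def fit_temperature(logits, labels):
    res = minimize_scalar(nll, bounds=(0.05, 10.0), args=(logits, labels),
                          method="bounded")
    return res.x

def expected_calibration_error(probs, labels, n_bins=15):
    conf = probs.max(axis=1)
    pred = probs.argmax(axis=1)
    ece, edges = 0.0, np.linspace(0.0, 1.0, n_bins + 1)
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (conf > lo) & (conf <= hi)
        if in_bin.any():
            acc = (pred[in_bin] == labels[in_bin]).mean()
            ece += in_bin.mean() * abs(acc - conf[in_bin].mean())
    return ece
```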
+
+
+
+
+ + ☆ KernelWarehouse: Towards Parameter-Efficient Dynamic Convolution + + +
+ Dynamic convolution learns a linear mixture of $n$ static kernels weighted +with their sample-dependent attentions, demonstrating superior performance +compared to normal convolution. However, existing designs are +parameter-inefficient: they increase the number of convolutional parameters by +$n$ times. This, together with the optimization difficulty, has prevented research on +dynamic convolution from using a significantly larger value of $n$ +(e.g., $n>100$ instead of the typical setting $n<10$) to push forward the +performance boundary. In this paper, we propose $KernelWarehouse$, a more +general form of dynamic convolution, which can strike a favorable trade-off +between parameter efficiency and representation power. Its key idea is to +redefine the basic concepts of "$kernels$" and "$assembling$ $kernels$" in +dynamic convolution from the perspective of reducing kernel dimension and +increasing kernel number significantly. In principle, KernelWarehouse enhances +convolutional parameter dependencies within the same layer and across +successive layers via tactful kernel partition and warehouse sharing, yielding +a high degree of freedom to fit a desired parameter budget. We validate our +method on ImageNet and MS-COCO datasets with different ConvNet architectures, +and show that it attains state-of-the-art results. For instance, the +ResNet18|ResNet50|MobileNetV2|ConvNeXt-Tiny model trained with KernelWarehouse +on ImageNet reaches 76.05%|81.05%|75.52%|82.51% top-1 accuracy. Thanks to its +flexible design, KernelWarehouse can even reduce the model size of a ConvNet +while improving the accuracy, e.g., our ResNet18 model with 36.45%|65.10% +parameter reduction relative to the baseline shows 2.89%|2.29% absolute improvement in +top-1 accuracy. + +
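To make the baseline concrete, a hedged sketch of vanilla dynamic convolution, the parameter-inefficient design the paper improves on: n static kernels are mixed per sample by attention weights and applied via a grouped convolution. This is not the KernelWarehouse design itself, and the layer hyperparameters are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DynamicConv2d(nn.Module):
    def __init__(self, in_ch, out_ch, k=3, n_kernels=4):
        super().__init__()
        # n static kernels -> n times the parameters of a normal conv layer.
        self.kernels = nn.Parameter(torch.randn(n_kernels, out_ch, in_ch, k, k) * 0.02)
        self.attn = nn.Linear(in_ch, n_kernels)
        self.pad = k // 2

    def forward(self, x):
        b, c, h, w = x.shape
        # Sample-dependent attention over the n static kernels.
        attn = F.softmax(self.attn(x.mean(dim=(2, 3))), dim=1)            # (B, n)
        weight = torch.einsum("bn,noikl->boikl", attn, self.kernels)       # per-sample mixed kernels
        weight = weight.reshape(-1, c, *self.kernels.shape[-2:])           # (B*out_ch, in_ch, k, k)
        out = F.conv2d(x.reshape(1, b * c, h, w), weight,
                       padding=self.pad, groups=b)                         # one group per sample
        return out.reshape(b, -1, h, w)
```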
+
+ comment: This research work was completed and submitted in early May 2023. + Code and pre-trained models are available at + https://github.com/OSVAI/KernelWarehouse +
+
+
+
+
+ + ☆ Independent Distribution Regularization for Private Graph Embedding CIKM 2023 + + +
+ Learning graph embeddings is a crucial task in graph mining tasks. An +effective graph embedding model can learn low-dimensional representations from +graph-structured data for data publishing benefiting various downstream +applications such as node classification, link prediction, etc. However, recent +studies have revealed that graph embeddings are susceptible to attribute +inference attacks, which allow attackers to infer private node attributes from +the learned graph embeddings. To address these concerns, privacy-preserving +graph embedding methods have emerged, aiming to simultaneously consider primary +learning and privacy protection through adversarial learning. However, most +existing methods assume that representation models have access to all sensitive +attributes in advance during the training stage, which is not always the case +due to diverse privacy preferences. Furthermore, the commonly used adversarial +learning technique in privacy-preserving representation learning suffers from +unstable training issues. In this paper, we propose a novel approach called +Private Variational Graph AutoEncoders (PVGAE) with the aid of independent +distribution penalty as a regularization term. Specifically, we split the +original variational graph autoencoder (VGAE) to learn sensitive and +non-sensitive latent representations using two sets of encoders. Additionally, +we introduce a novel regularization to enforce the independence of the +encoders. We prove the theoretical effectiveness of regularization from the +perspective of mutual information. Experimental results on three real-world +datasets demonstrate that PVGAE outperforms other baselines in private +embedding learning regarding utility performance and privacy protection. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ Convergence of Two-Layer Regression with Nonlinear Units + + +
+ Large language models (LLMs), such as ChatGPT and GPT4, have shown +outstanding performance in many tasks of human life. Attention computation plays an +important role in training LLMs. The softmax unit and the ReLU unit are the key +structures in attention computation. Inspired by them, we put forward a softmax +ReLU regression problem. Generally speaking, our goal is to find an optimal +solution to the regression problem involving the ReLU unit. In this work, we +calculate a closed-form representation for the Hessian of the loss function. +Under certain assumptions, we prove the Lipschitz continuity and the PSDness of +the Hessian. Then, we introduce a greedy algorithm based on an approximate Newton +method, which converges in the sense of the distance to the optimal solution. Lastly, +we relax the Lipschitz condition and prove the convergence in the sense of loss +value. + +
+
+
+
+
+ + ☆ Is Meta-Learning the Right Approach for the Cold-Start Problem in + Recommender Systems? + + +
+ Recommender systems have become fundamental building blocks of modern online +products and services, and have a substantial impact on user experience. In the +past few years, deep learning methods have attracted a lot of research, and are +now heavily used in modern real-world recommender systems. Nevertheless, +dealing with recommendations in the cold-start setting, e.g., when a user has +had limited interactions with the system, is a problem that remains far from +solved. Meta-learning techniques, and in particular optimization-based +meta-learning, have recently become the most popular approaches in the academic +research literature for tackling the cold-start problem in deep learning models +for recommender systems. However, current meta-learning approaches are not +practical for real-world recommender systems, which have billions of users and +items, and strict latency requirements. In this paper we show that it is +possible to obtain similar, or higher, performance on commonly used +benchmarks for the cold-start problem without using meta-learning techniques. +In more detail, we show that, when tuned correctly, standard and widely adopted +deep learning models perform just as well as newer meta-learning models. We +further show that an extremely simple modular approach using common +representation learning techniques can perform comparably to meta-learning +techniques specifically designed for the cold-start setting while being much +more easily deployable in real-world applications. + +
+
+
+
+
+ + ☆ Graph Out-of-Distribution Generalization with Controllable Data + Augmentation + + +
+ Graph Neural Networks (GNNs) have demonstrated extraordinary performance in +classifying graph properties. However, due to the selection bias of training +and testing data (e.g., training on small graphs and testing on large graphs, +or training on dense graphs and testing on sparse graphs), distribution +deviation is widespread. More importantly, we often observe \emph{hybrid +structure distribution shift} of both scale and density, despite the one-sided +biased data partition. The spurious correlations over hybrid distribution +deviation degrade the performance of previous GNN methods and show large +instability among different datasets. To alleviate this problem, we propose +\texttt{OOD-GMixup} to jointly manipulate the training distribution with +\emph{controllable data augmentation} in metric space. Specifically, we first +extract the graph rationales to eliminate the spurious correlations due to +irrelevant information. Secondly, we generate virtual samples with perturbations +on the graph rationale representation domain to obtain potential OOD training +samples. Finally, we propose OOD calibration to measure the distribution +deviation of virtual samples by leveraging Extreme Value Theory, and further +actively control the training distribution by emphasizing the impact of virtual +OOD samples. Extensive studies on several real-world datasets on graph +classification demonstrate the superiority of our proposed method over +state-of-the-art baselines. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Learning Logic Programs by Discovering Higher-Order Abstractions + + +
+ Discovering novel abstractions is important for human-level AI. We introduce +an approach to discover higher-order abstractions, such as map, filter, and +fold. We focus on inductive logic programming, which induces logic programs +from examples and background knowledge. We introduce the higher-order +refactoring problem, where the goal is to compress a logic program by +introducing higher-order abstractions. We implement our approach in STEVIE, +which formulates the higher-order refactoring problem as a constraint +optimisation problem. Our experimental results on multiple domains, including +program synthesis and visual reasoning, show that, compared to no refactoring, +STEVIE can improve predictive accuracies by 27% and reduce learning times by +47%. We also show that STEVIE can discover abstractions that transfer to +different domains. + +
+
+
+
+
+ + ☆ Warped geometric information on the optimisation of Euclidean functions + + +
+ We consider the fundamental task of optimizing a real-valued function defined +in a potentially high-dimensional Euclidean space, such as the loss function in +many machine-learning tasks or the logarithm of the probability distribution in +statistical inference. We use warped Riemannian geometry notions to +recast the optimisation problem of a function on Euclidean space as a problem on a +Riemannian manifold with a warped metric, and then find the function's optimum +along this manifold. The warped metric chosen for the search domain induces a +computationally friendly metric tensor for which optimal search directions +associated with geodesic curves on the manifold become easier to compute. +Performing optimization along geodesics is known to be generally infeasible, +yet we show that in this specific manifold we can analytically derive Taylor +approximations up to third order. In general these approximations to the +geodesic curve will not lie on the manifold; however, we construct suitable +retraction maps to pull them back onto the manifold. Therefore, we can +efficiently optimize along the approximate geodesic curves. We cover the +related theory, describe a practical optimization algorithm and empirically +evaluate it on a collection of challenging optimisation benchmarks. Our +proposed algorithm, using a third-order approximation of geodesics, outperforms +standard Euclidean gradient-based counterparts in terms of the number of iterations +until convergence, as well as an alternative method for Hessian-based optimisation +routines. + +
+
+
+
+
+ + ☆ Robust Bayesian Satisficing + + +
+ Distributional shifts pose a significant challenge to achieving robustness in +contemporary machine learning. To overcome this challenge, robust satisficing +(RS) seeks a robust solution to an unspecified distributional shift while +achieving a utility above a desired threshold. This paper focuses on the +problem of RS in contextual Bayesian optimization when there is a discrepancy +between the true and reference distributions of the context. We propose a novel +robust Bayesian satisficing algorithm called RoBOS for noisy black-box +optimization. Our algorithm guarantees sublinear lenient regret under certain +assumptions on the amount of distribution shift. In addition, we define a +weaker notion of regret called robust satisficing regret, in which our +algorithm achieves a sublinear upper bound independent of the amount of +distribution shift. To demonstrate the effectiveness of our method, we apply it +to various learning problems and compare it to other approaches, such as +distributionally robust optimization. + +
+
+
+
+
+ + ☆ DFedADMM: Dual Constraints Controlled Model Inconsistency for + Decentralized Federated Learning + + +
+ To address the communication burden issues associated with federated learning +(FL), decentralized federated learning (DFL) discards the central server and +establishes a decentralized communication network, where each client +communicates only with neighboring clients. However, existing DFL methods still +suffer from two major challenges: local inconsistency and local heterogeneous +overfitting, which have not been fundamentally addressed. To tackle these issues, we propose novel DFL algorithms, DFedADMM and +its enhanced version DFedADMM-SAM, to enhance the performance of DFL. The +DFedADMM algorithm employs primal-dual optimization (ADMM) by utilizing dual +variables to control the model inconsistency arising from the decentralized +heterogeneous data distributions. The DFedADMM-SAM algorithm further improves +on DFedADMM by employing a Sharpness-Aware Minimization (SAM) optimizer, which +uses gradient perturbations to generate locally flat models and searches for +models with uniformly low loss values to mitigate local heterogeneous +overfitting. Theoretically, we derive convergence rates of $\small +\mathcal{O}\Big(\frac{1}{\sqrt{KT}}+\frac{1}{KT(1-\psi)^2}\Big)$ and $\small +\mathcal{O}\Big(\frac{1}{\sqrt{KT}}+\frac{1}{KT(1-\psi)^2}+ +\frac{1}{T^{3/2}K^{1/2}}\Big)$ in the non-convex setting for DFedADMM and +DFedADMM-SAM, respectively, where $1 - \psi$ represents the spectral gap of the +gossip matrix. Empirically, extensive experiments on the MNIST, CIFAR10 and +CIFAR100 datasets demonstrate that our algorithms exhibit superior performance +in terms of both generalization and convergence speed compared to existing +state-of-the-art (SOTA) optimizers in DFL. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ CARE: A Large Scale CT Image Dataset and Clinical Applicable Benchmark + Model for Rectal Cancer Segmentation + + +
+ Rectal cancer segmentation of CT image plays a crucial role in timely +clinical diagnosis, radiotherapy treatment, and follow-up. Although current +segmentation methods have shown promise in delineating cancerous tissues, they +still encounter challenges in achieving high segmentation precision. These +obstacles arise from the intricate anatomical structures of the rectum and the +difficulties in performing differential diagnosis of rectal cancer. +Additionally, a major obstacle is the lack of a large-scale, finely annotated +CT image dataset for rectal cancer segmentation. To address these issues, this +work introduces a novel large scale rectal cancer CT image dataset CARE with +pixel-level annotations for both normal and cancerous rectum, which serves as a +valuable resource for algorithm research and clinical application development. +Moreover, we propose a novel medical cancer lesion segmentation benchmark model +named U-SAM. The model is specifically designed to tackle the challenges posed +by the intricate anatomical structures of abdominal organs by incorporating +prompt information. U-SAM contains three key components: promptable information +(e.g., points) to aid in target area localization, a convolution module for +capturing low-level lesion details, and skip-connections to preserve and +recover spatial information during the encoding-decoding process. To evaluate +the effectiveness of U-SAM, we systematically compare its performance with +several popular segmentation methods on the CARE dataset. The generalization of +the model is further verified on the WORD dataset. Extensive experiments +demonstrate that the proposed U-SAM outperforms state-of-the-art methods on +these two datasets. These experiments can serve as the baseline for future +research and clinical application development. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ It Ain't That Bad: Understanding the Mysterious Performance Drop in OOD + Generalization for Generative Transformer Models + + +
+ Generative Transformer-based models have achieved remarkable proficiency on +solving diverse problems. However, their generalization ability is not fully +understood and not always satisfactory. Researchers take basic mathematical tasks +like n-digit addition or multiplication as important perspectives for +investigating their generalization behaviors. Curiously, it is observed that +when training on n-digit operations (e.g., additions) in which both input +operands are n-digit in length, models generalize successfully on unseen +n-digit inputs (in-distribution (ID) generalization), but fail miserably and +mysteriously on longer, unseen cases (out-of-distribution (OOD) +generalization). Studies try to bridge this gap with workarounds such as +modifying position embedding, fine-tuning, and priming with more extensive or +instructive data. However, without addressing the essential mechanism, there is +hardly any guarantee regarding the robustness of these solutions. We bring attention to this +unexplained performance drop and ask whether it stems purely from +random errors. Here we turn to the mechanistic line of research which has +notable successes in model interpretability. We discover that the strong ID +generalization stems from structured representations, while behind the +unsatisfying OOD performance, the models still exhibit clear learned algebraic +structures. Specifically, these models map unseen OOD inputs to outputs with +equivalence relations in the ID domain. These findings highlight the potential of the +models to carry useful information for improved generalization. + +
+
+
+
+
+ + ☆ Graph Relation Aware Continual Learning + + +
+ Continual graph learning (CGL) studies the problem of learning from an +infinite stream of graph data, consolidating historical knowledge, and +generalizing it to future tasks. At any point in time, only the current graph data are +available. Although some recent attempts have been made to handle this task, we +still face two potential challenges: 1) most existing works only operate +on the intermediate graph embedding and ignore intrinsic properties of graphs. +It is non-trivial to differentiate the transferred information across graphs. +2) recent attempts take a parameter-sharing policy to transfer knowledge across +time steps or progressively expand the architecture given a shifted graph +distribution. Learning a single model could lose discriminative information for +each graph task while the model expansion scheme suffers from high model +complexity. In this paper, we point out that latent relations behind graph +edges can be attributed as an invariant factor for the evolving graphs and the +statistical information of latent relations evolves. Motivated by this, we +design a relation-aware adaptive model, dubbed RAM-CG, that consists of a +relation-discovery module to explore latent relations behind edges and a +task-aware masking classifier to account for the shifted statistics. Extensive +experiments show that RAM-CG provides significant 2.2%, 6.9% and 6.6% accuracy +improvements over the state-of-the-art results on the CitationNet, OGBN-arxiv and +TWITCH datasets, respectively. + +
+
+
+
+
+ + ☆ Two Phases of Scaling Laws for Nearest Neighbor Classifiers + + +
+ A scaling law refers to the observation that the test performance of a model +improves as the amount of training data increases. A fast scaling law implies +that one can solve machine learning problems by simply boosting the data and +the model sizes. Yet, in many cases, the benefit of adding more data can be +negligible. In this work, we study the rate of scaling laws of nearest neighbor +classifiers. We show that a scaling law can have two phases: in the first +phase, the generalization error depends polynomially on the data dimension and +decreases fast; whereas in the second phase, the error depends exponentially on +the data dimension and decreases slowly. Our analysis highlights the complexity +of the data distribution in determining the generalization error. When the data +is distributed benignly, our result suggests that the nearest neighbor classifier can +achieve a generalization error that depends polynomially, instead of +exponentially, on the data dimension. + +
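A hedged sketch of how such a scaling curve can be traced empirically for a nearest neighbor classifier: fix a test set and measure test error as the training set grows. The synthetic data and size grid are illustrative assumptions, not the paper's experimental setup.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=20000, n_features=20, random_state=0)
X_test, y_test = X[-5000:], y[-5000:]          # held-out test set

for n in [100, 300, 1000, 3000, 10000]:        # growing training set sizes
    clf = KNeighborsClassifier(n_neighbors=1).fit(X[:n], y[:n])
    err = 1.0 - clf.score(X_test, y_test)
    print(f"n={n:>6d}  test error={err:.3f}")  # plot err vs n to see the two phases
```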
+
+
+
+
+ + ☆ The Expressive Power of Graph Neural Networks: A Survey + + +
+ Graph neural networks (GNNs) are effective machine learning models for many +graph-related applications. Despite their empirical success, many research +efforts focus on the theoretical limitations of GNNs, i.e., the expressive +power of GNNs. Early works in this domain mainly focus on studying the graph +isomorphism recognition ability of GNNs, and recent works try to leverage +properties such as subgraph counting and connectivity learning to characterize +the expressive power of GNNs, which are more practical and closer to +real-world settings. However, no survey paper or open-source repository +comprehensively summarizes and discusses models in this important direction. To +fill the gap, we conduct a first survey of models for enhancing expressive +power under different forms of definition. Concretely, the models are reviewed +based on three categories, i.e., Graph feature enhancement, Graph topology +enhancement, and GNNs architecture enhancement. + +
+
+
+
+
+ + ☆ Challenges and Opportunities of Using Transformer-Based Multi-Task + Learning in NLP Through ML Lifecycle: A Survey + + +
+ The increasing adoption of natural language processing (NLP) models across +industries has led to practitioners' need for machine learning systems to +handle these models efficiently, from training to serving them in production. +However, training, deploying, and updating multiple models can be complex, +costly, and time-consuming, mainly when using transformer-based pre-trained +language models. Multi-Task Learning (MTL) has emerged as a promising approach +to improve efficiency and performance through joint training, rather than +training separate models. Motivated by this, we first provide an overview of +transformer-based MTL approaches in NLP. Then, we discuss the challenges and +opportunities of using MTL approaches throughout typical ML lifecycle phases, +specifically focusing on the challenges related to data engineering, model +development, deployment, and monitoring phases. This survey focuses on +transformer-based MTL architectures and, to the best of our knowledge, is novel +in that it systematically analyses how transformer-based MTL in NLP fits into +ML lifecycle phases. Furthermore, we motivate research on the connection +between MTL and continual learning (CL), as this area remains unexplored. We +believe it would be practical to have a model that can handle both MTL and CL, +as this would make it easier to periodically re-train the model, update it due +to distribution shifts, and add new capabilities to meet real-world +requirements. + +
+
+
+
+
+ + ☆ SCQPTH: an efficient differentiable splitting method for convex + quadratic programming + + +
+ We present SCQPTH: a differentiable first-order splitting method for convex +quadratic programs. The SCQPTH framework is based on the alternating direction +method of multipliers (ADMM) and the software implementation is motivated by +the state-of-the-art solver OSQP: an operator splitting solver for convex +quadratic programs (QPs). The SCQPTH software is made available as an +open-source Python package and contains many similar features, including +efficient reuse of matrix factorizations, infeasibility detection, automatic +scaling and parameter selection. The forward pass algorithm performs operator +splitting in the dimension of the original problem space and is therefore +suitable for large scale QPs with $100-1000$ decision variables and thousands +of constraints. Backpropagation is performed by implicit differentiation of the +ADMM fixed-point mapping. Experiments demonstrate that for large scale QPs, +SCQPTH can provide a $1\times - 10\times$ improvement in computational +efficiency in comparison to existing differentiable QP solvers. + +
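For intuition, a hedged textbook sketch of the kind of ADMM operator splitting such a solver is built on, for a QP of the form min 0.5 x'Px + q'x subject to l <= Ax <= u; this covers only a simplified forward pass, not the package's implicit differentiation, scaling, or infeasibility detection, and the parameter choices are assumptions.

```python
import numpy as np

def admm_qp(P, q, A, l, u, rho=1.0, iters=500):
    n, m = P.shape[0], A.shape[0]
    x, z, y = np.zeros(n), np.zeros(m), np.zeros(m)
    K_inv = np.linalg.inv(P + rho * A.T @ A)       # factor once, reuse every iteration
    for _ in range(iters):
        x = K_inv @ (-q + A.T @ (rho * z - y))     # x-update: linear solve
        z = np.clip(A @ x + y / rho, l, u)         # z-update: projection onto [l, u]
        y = y + rho * (A @ x - z)                  # dual (scaled multiplier) update
    return x
```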
+
+
+
+
+ + ☆ Exploring Winograd Convolution for Cost-effective Neural Network Fault + Tolerance + + +
+ Winograd is generally utilized to optimize convolution performance and +computational efficiency because of the reduced multiplication operations, but +the reliability issues brought by Winograd are usually overlooked. In this +work, we observe the great potential of Winograd convolution in improving +neural network (NN) fault tolerance. Based on the observation, we evaluate +Winograd convolution fault tolerance comprehensively from different +granularities ranging from models, layers, and operation types for the first +time. Then, we explore the use of the inherent fault tolerance of Winograd +convolution for cost-effective NN protection against soft errors. Specifically, +we mainly investigate how Winograd convolution can be effectively incorporated +with classical fault-tolerant design approaches including triple modular +redundancy (TMR), fault-aware retraining, and constrained activation functions. +According to our experiments, Winograd convolution can reduce the +fault-tolerant design overhead by 55.77\% on average without any accuracy loss +compared to standard convolution, and further reduce the computing overhead by +17.24\% when the inherent fault tolerance of Winograd convolution is +considered. When it is applied to fault-tolerant neural networks enhanced with +fault-aware retraining and constrained activation functions, the resulting +model accuracy generally shows significant improvement in the presence of various +faults. + +
+
+
+
+
+ + ☆ Inherent Redundancy in Spiking Neural Networks ICCV2023 + + +
+ Spiking Neural Networks (SNNs) are well known as a promising energy-efficient +alternative to conventional artificial neural networks. Owing to the +preconceived impression that SNNs fire sparsely, the analysis and +optimization of inherent redundancy in SNNs have been largely overlooked; thus, +the potential advantages of spike-based neuromorphic computing in accuracy and +energy efficiency are undermined. In this work, we pose and focus on three key +questions regarding the inherent redundancy in SNNs. We argue that the +redundancy is induced by the spatio-temporal invariance of SNNs, which enhances +the efficiency of parameter utilization but also invites many noise spikes. +Further, we analyze the effect of spatio-temporal invariance on the +spatio-temporal dynamics and spike firing of SNNs. Then, motivated by these +analyses, we propose an Advance Spatial Attention (ASA) module to harness SNNs' +redundancy, which can adaptively optimize their membrane potential distribution +by a pair of individual spatial attention sub-modules. In this way, noise spike +features are accurately regulated. Experimental results demonstrate that the +proposed method can significantly reduce spike firing while achieving better performance +than state-of-the-art SNN baselines. Our code is available at +\url{https://github.com/BICLab/ASA-SNN}. + +
+
+ comment: Accepted by ICCV2023 +
+
+
+
+
+ + ☆ How To Overcome Confirmation Bias in Semi-Supervised Image + Classification By Active Learning ECML + + +
+ Do we need active learning? The rise of strong deep semi-supervised methods +raises doubt about the usability of active learning in limited labeled data +settings. This is caused by results showing that combining semi-supervised +learning (SSL) methods with a random selection for labeling can outperform +existing active learning (AL) techniques. However, these results are obtained +from experiments on well-established benchmark datasets that can overestimate +external validity. Moreover, the literature lacks sufficient research on the +performance of active semi-supervised learning methods in realistic data +scenarios, leaving a notable gap in our understanding. We therefore present +three data challenges common in real-world applications: between-class +imbalance, within-class imbalance, and between-class similarity. These +challenges can hurt SSL performance due to confirmation bias. We conduct +experiments with SSL and AL on simulated data challenges and find that random +sampling does not mitigate confirmation bias and, in some cases, leads to worse +performance than supervised learning. In contrast, we demonstrate that AL can +overcome confirmation bias in SSL in these realistic settings. Our results +provide insights into the potential of combining active and semi-supervised +learning in the presence of common real-world challenges, which is a promising +direction for robust methods when learning with limited labeled data in +real-world applications. + +
+
+ comment: Accepted @ ECML PKDD 2023. This is the author's version of the work. + The definitive Version of Record will be published in the Proceedings of ECML + PKDD 2023 +
+
+
+
+
+ + ☆ HyperSNN: A new efficient and robust deep learning model for resource + constrained control applications + + +
+ In light of the increasing adoption of edge computing in areas such as +intelligent furniture, robotics, and smart homes, this paper introduces +HyperSNN, an innovative method for control tasks that uses spiking neural +networks (SNNs) in combination with hyperdimensional computing. HyperSNN +substitutes expensive 32-bit floating point multiplications with 8-bit integer +additions, resulting in reduced energy consumption while enhancing robustness +and potentially improving accuracy. Our model was tested on AI Gym benchmarks, +including Cartpole, Acrobot, MountainCar, and Lunar Lander. HyperSNN achieves +control accuracies that are on par with conventional machine learning methods +but with only 1.36% to 9.96% of the energy expenditure. Furthermore, our +experiments showed increased robustness when using HyperSNN. We believe that +HyperSNN is especially suitable for interactive, mobile, and wearable devices, +promoting energy-efficient and robust system design. Furthermore, it paves the +way for the practical implementation of complex algorithms like model +predictive control (MPC) in real-world industrial scenarios. + +
+
+
+
+
+ + ☆ Epicure: Distilling Sequence Model Predictions into Patterns + + +
+ Most machine learning models predict a probability distribution over concrete +outputs and struggle to accurately predict names over high entropy sequence +distributions. Here, we explore finding abstract, high-precision patterns +intrinsic to these predictions in order to make abstract predictions that +usefully capture rare sequences. In this short paper, we present Epicure, a +method that distils the predictions of a sequence model, such as the output of +beam search, into simple patterns. Epicure maps a model's predictions into a +lattice that represents increasingly more general patterns that subsume the +concrete model predictions. + On the tasks of predicting a descriptive name of a function given the source +code of its body and detecting anomalous names given a function, we show that +Epicure yields accurate naming patterns that match the ground truth more often +compared to just the highest probability model prediction. For a false alarm +rate of 10%, Epicure predicts patterns that match 61% more ground-truth names +compared to the best model prediction, making Epicure well-suited for scenarios +that require high precision. + +
+
+
+
+
+ + ☆ DeSCo: Towards Generalizable and Scalable Deep Subgraph Counting + + +
+ Subgraph counting is the problem of counting the occurrences of a given query +graph in a large target graph. Large-scale subgraph counting is useful in +various domains, such as motif counting for social network analysis and loop +counting for money laundering detection on transaction networks. Recently, to +address the exponential runtime complexity of scalable subgraph counting, +neural methods are proposed. However, existing neural counting approaches fall +short in three aspects. Firstly, the counts of the same query can vary from +zero to millions on different target graphs, posing a much larger challenge +than most graph regression tasks. Secondly, current scalable graph neural +networks have limited expressive power and fail to efficiently distinguish +graphs in count prediction. Furthermore, existing neural approaches cannot +predict the occurrence position of queries in the target graph. + Here we design DeSCo, a scalable neural deep subgraph counting pipeline, +which aims to accurately predict the query count and occurrence position on any +target graph after one-time training. Firstly, DeSCo uses a novel canonical +partition and divides the large target graph into small neighborhood graphs. +The technique greatly reduces the count variation while guaranteeing no missing +or double-counting. Secondly, neighborhood counting uses an expressive +subgraph-based heterogeneous graph neural network to accurately perform +counting in each neighborhood. Finally, gossip propagation propagates +neighborhood counts with learnable gates to harness the inductive biases of +motif counts. DeSCo is evaluated on eight real-world datasets from various +domains. It outperforms state-of-the-art neural methods with 137x improvement +in the mean squared error of count prediction, while maintaining the polynomial +runtime complexity. + +
+
+ comment: 8 pages main text, 10 pages appendix +
+
+
+
+
+ + ☆ Endogenous Macrodynamics in Algorithmic Recourse + + +
+ Existing work on Counterfactual Explanations (CE) and Algorithmic Recourse +(AR) has largely focused on single individuals in a static environment: given +some estimated model, the goal is to find valid counterfactuals for an +individual instance that fulfill various desiderata. The ability of such +counterfactuals to handle dynamics like data and model drift remains a largely +unexplored research challenge. There has also been surprisingly little work on +the related question of how the actual implementation of recourse by one +individual may affect other individuals. Through this work, we aim to close +that gap. We first show that many of the existing methodologies can be +collectively described by a generalized framework. We then argue that the +existing framework does not account for a hidden external cost of recourse, +which only reveals itself when studying the endogenous dynamics of recourse at +the group level. Through simulation experiments involving various state-of-the-art +counterfactual generators and several benchmark datasets, we generate +large numbers of counterfactuals and study the resulting domain and model +shifts. We find that the induced shifts are substantial enough to likely impede +the applicability of Algorithmic Recourse in some situations. Fortunately, we +find various strategies to mitigate these concerns. Our simulation framework +for studying recourse dynamics is fast and open-sourced. + +
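For context, a hedged sketch of a generic gradient-based counterfactual generator in the spirit of Wachter et al., not one of the specific generators benchmarked in the paper: nudge an input toward a target prediction while penalizing distance to the original instance. The loss weighting and optimizer settings are illustrative assumptions.

```python
import torch

def counterfactual(model, x, target, lam=0.1, steps=200, lr=0.05):
    # x: (1, d) original instance, target: (1,) desired class index.
    x_cf = x.clone().detach().requires_grad_(True)
    opt = torch.optim.Adam([x_cf], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        # Validity term (reach the target class) + proximity term (stay close to x).
        loss = torch.nn.functional.cross_entropy(model(x_cf), target) \
               + lam * torch.norm(x_cf - x, p=1)
        loss.backward()
        opt.step()
    return x_cf.detach()
```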
+
+ comment: 12 pages, 11 figures. Originally published at the 2023 IEEE + Conference on Secure and Trustworthy Machine Learning (SaTML). IEEE holds the + copyright +
+
+
+
+
+ + ☆ Accelerating Generic Graph Neural Networks via Architecture, Compiler, + Partition Method Co-Design + + +
+ Graph neural networks (GNNs) have shown significant accuracy improvements in +a variety of graph learning domains, sparking considerable research interest. +To translate these accuracy improvements into practical applications, it is +essential to develop high-performance and efficient hardware acceleration for +GNN models. However, designing GNN accelerators faces two fundamental +challenges: the high bandwidth requirement of GNN models and the diversity of +GNN models. Previous works have addressed the first challenge by using more +expensive memory interfaces to achieve higher bandwidth. For the second +challenge, existing works either support specific GNN models or have generic +designs with poor hardware utilization. + In this work, we tackle both challenges simultaneously. First, we identify a +new type of partition-level operator fusion, which we utilize to internally +reduce the high bandwidth requirement of GNNs. Next, we introduce +partition-level multi-threading to schedule the concurrent processing of graph +partitions, utilizing different hardware resources. To further reduce the extra +on-chip memory required by multi-threading, we propose fine-grained graph +partitioning to generate denser graph partitions. Importantly, these three +methods make no assumptions about the targeted GNN models, addressing the +challenge of model variety. We implement these methods in a framework called +SwitchBlade, consisting of a compiler, a graph partitioner, and a hardware +accelerator. Our evaluation demonstrates that SwitchBlade achieves an average +speedup of $1.85\times$ and energy savings of $19.03\times$ compared to the +NVIDIA V100 GPU. Additionally, SwitchBlade delivers performance comparable to +state-of-the-art specialized accelerators. + +
+
+
+
+
+ + ☆ Expressivity of Graph Neural Networks Through the Lens of Adversarial + Robustness + + +
+ We perform the first adversarial robustness study into Graph Neural Networks +(GNNs) that are provably more powerful than traditional Message Passing Neural +Networks (MPNNs). In particular, we use adversarial robustness as a tool to +uncover a significant gap between their theoretically possible and empirically +achieved expressive power. To do so, we focus on the ability of GNNs to count +specific subgraph patterns, which is an established measure of expressivity, +and extend the concept of adversarial robustness to this task. Based on this, +we develop efficient adversarial attacks for subgraph counting and show that +more powerful GNNs fail to generalize even to small perturbations to the +graph's structure. Expanding on this, we show that such architectures also fail +to count substructures on out-of-distribution graphs. + +
+
+ comment: Published in ${2}^{nd}$ AdvML Frontiers workshop at ${40}^{th}$ + International Conference on Machine Learning +
+
+
+
+
+ + ☆ AATCT-IDS: A Benchmark Abdominal Adipose Tissue CT Image Dataset for + Image Denoising, Semantic Segmentation, and Radiomics Evaluation + + +
+ Methods: In this study, a benchmark \emph{Abdominal Adipose Tissue CT Image +Dataset} (AATTCT-IDS) containing 300 subjects is prepared and published. +AATTCT-IDS makes public 13,732 raw CT slices, and the researchers individually +annotate the subcutaneous and visceral adipose tissue regions of 3,213 of those +slices that have the same slice distance to validate denoising methods, train +semantic segmentation models, and study radiomics. For different tasks, this +paper compares and analyzes the performance of various methods on AATTCT-IDS by +combining the visualization results and evaluation data. Thus, we verify the +research potential of this dataset in the above three types of tasks. + Results: In the comparative study of image denoising, algorithms using a +smoothing strategy suppress mixed noise at the expense of image details and +obtain better evaluation data. Methods such as BM3D preserve the original image +structure better, although the evaluation data are slightly lower. The results +show significant differences among them. In the comparative study of semantic +segmentation of abdominal adipose tissue, the segmentation results of adipose +tissue by each model show different structural characteristics. Among them, +BiSeNet obtains segmentation results only slightly inferior to U-Net with the +shortest training time and effectively separates small and isolated adipose +tissue. In addition, the radiomics study based on AATTCT-IDS reveals three +adipose distributions in the subject population. + Conclusion: AATTCT-IDS contains the ground truth of adipose tissue regions in +abdominal CT slices. This open-source dataset can attract researchers to +explore the multi-dimensional characteristics of abdominal adipose tissue and +thus help physicians and patients in clinical practice. AATCT-IDS is freely +published for non-commercial purposes at: +\url{https://figshare.com/articles/dataset/AATTCT-IDS/23807256}. + +
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ☆ A Quantum Approximation Scheme for k-Means + + +
+ We give a quantum approximation scheme (i.e., $(1 + +\varepsilon)$-approximation for every $\varepsilon > 0$) for the classical +$k$-means clustering problem in the QRAM model with a running time that has +only polylogarithmic dependence on the number of data points. More +specifically, given a dataset $V$ with $N$ points in $\mathbb{R}^d$ stored in +QRAM data structure, our quantum algorithm runs in time $\tilde{O} \left( +2^{\tilde{O}(\frac{k}{\varepsilon})} \eta^2 d\right)$ and with high probability +outputs a set $C$ of $k$ centers such that $cost(V, C) \leq (1+\varepsilon) +\cdot cost(V, C_{OPT})$. Here $C_{OPT}$ denotes the optimal $k$-centers, +$cost(.)$ denotes the standard $k$-means cost function (i.e., the sum of the +squared distance of points to the closest center), and $\eta$ is the aspect +ratio (i.e., the ratio of maximum distance to minimum distance). This is the +first quantum algorithm with a polylogarithmic running time that gives a +provable approximation guarantee of $(1+\varepsilon)$ for the $k$-means +problem. Also, unlike previous works on unsupervised learning, our quantum +algorithm does not require quantum linear algebra subroutines and has a running +time independent of parameters (e.g., condition number) that appear in such +procedures. + +
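For reference, the classical k-means cost referenced in the abstract, cost(V, C) = the sum over points of the squared distance to the closest center, written out as a plain baseline against which the quantum (1 + epsilon) guarantee is stated.

```python
import numpy as np

def kmeans_cost(V, C):
    # V: (N, d) data points, C: (k, d) candidate centers.
    d2 = ((V[:, None, :] - C[None, :, :]) ** 2).sum(-1)  # (N, k) squared distances
    return d2.min(axis=1).sum()                           # assign each point to its closest center
```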
+
+
+
+
+ + ☆ Characteristics of networks generated by kernel growing neural gas + + +
+ This research aims to develop kernel GNG, a kernelized version of the growing +neural gas (GNG) algorithm, and to investigate the features of the networks +generated by the kernel GNG. The GNG is an unsupervised artificial neural +network that can transform a dataset into an undirected graph, thereby +extracting the features of the dataset as a graph. The GNG is widely used in +vector quantization, clustering, and 3D graphics. Kernel methods are often used +to map a dataset to feature space, with support vector machines being the most +prominent application. This paper introduces the kernel GNG approach and +explores the characteristics of the networks generated by kernel GNG. Five +kernels, including Gaussian, Laplacian, Cauchy, inverse multiquadric, and log +kernels, are used in this study. + +
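A hedged sketch of the core substitution behind a kernelized GNG: squared distances in feature space are computed through the kernel trick, ||phi(x) - phi(w)||^2 = k(x, x) - 2 k(x, w) + k(w, w), so the usual GNG winner/runner-up search never needs the explicit feature map. The Gaussian kernel and function names are illustrative assumptions; the full GNG update rules are omitted.

```python
import numpy as np

def gaussian_kernel(a, b, gamma=1.0):
    return np.exp(-gamma * np.sum((a - b) ** 2))

def feature_space_dist2(x, w, kernel=gaussian_kernel):
    # Squared distance between phi(x) and phi(w) via the kernel trick.
    return kernel(x, x) - 2.0 * kernel(x, w) + kernel(w, w)

def nearest_units(x, units, kernel=gaussian_kernel):
    d = np.array([feature_space_dist2(x, w, kernel) for w in units])
    order = np.argsort(d)
    return order[0], order[1]   # winner and runner-up, as in standard GNG
```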
+
+
+
+
+ + ☆ Interpretability Benchmark for Evaluating Spatial Misalignment of + Prototypical Parts Explanations + + +
+ Prototypical parts-based networks are becoming increasingly popular due to +their faithful self-explanations. However, their similarity maps are calculated +in the penultimate network layer. Therefore, the receptive field of the +prototype activation region often depends on parts of the image outside this +region, which can lead to misleading interpretations. We name this undesired +behavior a spatial explanation misalignment and introduce an interpretability +benchmark with a set of dedicated metrics for quantifying this phenomenon. In +addition, we propose a method for misalignment compensation and apply it to +existing state-of-the-art models. We show the expressiveness of our benchmark +and the effectiveness of the proposed compensation methodology through +extensive empirical studies. + +
+
+ comment: Under review. Code will be released upon acceptance +

+
+
+
+
+ + ☆ Benchmarking Adversarial Robustness of Compressed Deep Learning Models + + +
+ The increasing size of Deep Neural Networks (DNNs) poses a pressing need for +model compression, particularly when employed on resource-constrained devices. +Concurrently, the susceptibility of DNNs to adversarial attacks presents +another significant hurdle. Despite substantial research on both model +compression and adversarial robustness, their joint examination remains +underexplored. Our study bridges this gap, seeking to understand the effect of +adversarial inputs crafted for base models on their pruned versions. To examine +this relationship, we have developed a comprehensive benchmark across diverse +adversarial attacks and popular DNN models. We uniquely focus on models not +previously exposed to adversarial training and apply pruning schemes optimized +for accuracy and performance. Our findings reveal that while the benefits of +pruning (enhanced generalizability, compression, and faster inference times) are +preserved, adversarial robustness remains comparable to that of the base model. This +suggests that model compression, while offering its unique advantages, does not +undermine adversarial robustness. + +
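A hedged sketch of the kind of setup such a benchmark compares: globally prune a trained model by weight magnitude, then evaluate it on adversarial examples. FGSM stands in here for the paper's broader attack suite, and the pruning amount and epsilon are illustrative assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

def magnitude_prune(model, amount=0.5):
    # Global L1 (magnitude) pruning over all conv and linear weights.
    params = [(m, "weight") for m in model.modules()
              if isinstance(m, (nn.Conv2d, nn.Linear))]
    prune.global_unstructured(params, pruning_method=prune.L1Unstructured,
                              amount=amount)
    return model

def fgsm(model, x, y, eps=8 / 255):
    # Single-step attack: perturb inputs along the sign of the loss gradient.
    x = x.clone().detach().requires_grad_(True)
    loss = nn.functional.cross_entropy(model(x), y)
    loss.backward()
    return (x + eps * x.grad.sign()).clamp(0, 1).detach()
```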
+
+
+
+
+ + ☆ Deep Generative Imputation Model for Missing Not At Random Data + + +
+ Data analysis usually suffers from the Missing Not At Random (MNAR)
+problem, where the cause of the missing values is not fully observed.
+Compared to the naive Missing Completely At Random (MCAR) setting, MNAR is
+closer to realistic scenarios but also more complex and challenging. Existing
+statistical methods model the MNAR mechanism through different decompositions
+of the joint distribution of the complete data and the missing mask. However,
+we empirically find that directly incorporating these statistical methods
+into deep generative models is sub-optimal: it neglects the confidence of the
+reconstructed mask during the MNAR imputation process, which leads to
+insufficient information extraction and weaker guarantees on imputation
+quality. In this paper, we revisit the MNAR problem from a novel perspective
+in which the complete data and the missing mask are two modalities of
+incomplete data on an equal footing. Along this line, we put forward a
+generative-model-specific joint probability decomposition method, the
+conjunction model, to represent the distributions of the two modalities in
+parallel and extract sufficient information from both the complete data and
+the missing mask. Taking a step further, we develop a deep generative
+imputation model, GNR, that processes the real-world missing mechanism in the
+latent space and concurrently imputes the incomplete data and reconstructs
+the missing mask. Experimental results show that GNR surpasses
+state-of-the-art MNAR baselines by significant margins (average RMSE
+improvements ranging from 9.9% to 18.8%) and consistently achieves better
+mask reconstruction accuracy, which makes the imputation more principled.
+
+
+
+
+
+ + ☆ Sarcasm Detection in a Disaster Context + + +
+ During natural disasters, people often use social media platforms such as +Twitter to ask for help, to provide information about the disaster situation, +or to express contempt about the unfolding event or public policies and +guidelines. This contempt is in some cases expressed as sarcasm or irony. +Understanding this form of speech in a disaster-centric context is essential to +improving natural language understanding of disaster-related tweets. In this +paper, we introduce HurricaneSARC, a dataset of 15,000 tweets annotated for +intended sarcasm, and provide a comprehensive investigation of sarcasm +detection using pre-trained language models. Our best model is able to obtain +as much as 0.70 F1 on our dataset. We also demonstrate that the performance on +HurricaneSARC can be improved by leveraging intermediate task transfer +learning. We release our data and code at +https://github.com/tsosea2/HurricaneSarc. + +
+
+
+
+
+ + ☆ Hierarchical Topological Ordering with Conditional Independence Test for + Limited Time Series + + +
+ Learning directed acyclic graphs (DAGs) to identify causal relations +underlying observational data is crucial but also poses significant challenges. +Recently, topology-based methods have emerged as a two-step approach to +discovering DAGs by first learning the topological ordering of variables and +then eliminating redundant edges, while ensuring that the graph remains +acyclic. However, one limitation is that these methods would generate numerous +spurious edges that require subsequent pruning. To overcome this limitation, in +this paper, we propose an improvement to topology-based methods by introducing +limited time series data, consisting of only two cross-sectional records that +need not be adjacent in time and are subject to flexible timing. By +incorporating conditional instrumental variables as exogenous interventions, we +aim to identify descendant nodes for each variable. Following this line, we +propose a hierarchical topological ordering algorithm with conditional +independence test (HT-CIT), which enables the efficient learning of sparse DAGs +with a smaller search space compared to other popular approaches. The HT-CIT +algorithm greatly reduces the number of edges that need to be pruned. Empirical +results from synthetic and real-world datasets demonstrate the superiority of +the proposed HT-CIT algorithm. + +
+
+
+
+
+ + ☆ Online Control for Linear Dynamics: A Data-Driven Approach + + +
+ This paper considers an online control problem over a linear time-invariant +system with unknown dynamics, bounded disturbance, and adversarial cost. We +propose a data-driven strategy to reduce the regret of the controller. Unlike +model-based methods, our algorithm does not identify the system model, instead, +it leverages a single noise-free trajectory to calculate the accumulation of +disturbance and makes decisions using the accumulated disturbance action +controller we design, whose parameters are updated by online gradient descent. +We prove that the regret of our algorithm is $\mathcal{O}(\sqrt{T})$ under mild +assumptions, suggesting that its performance is on par with model-based +methods. + +
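As a rough illustration of the idea above, one can picture a disturbance-action controller whose action is a linear combination of recent accumulated disturbances, with parameters updated by online gradient descent. The memory length, learning rate, and absence of a projection step in this sketch are assumptions, not details from the paper.

```python
import numpy as np

def dac_action(M, w_hist):
    """u_t = sum_i M[i] @ w_{t-1-i}: action from the last len(M) disturbances."""
    return sum(M[i] @ w_hist[-(i + 1)] for i in range(len(M)))

def ogd_update(M, grads, lr=0.01):
    """One (unprojected) online gradient descent step on the controller parameters."""
    return [Mi - lr * Gi for Mi, Gi in zip(M, grads)]
```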
+
+
+
+
+ + ☆ Microstructure-Empowered Stock Factor Extraction and Utilization + + +
+ High-frequency quantitative investment is a crucial aspect of stock
+investment. Notably, order flow data plays a critical role as it provides the
+most detailed level of information among high-frequency trading data,
+including comprehensive data from the order book and transaction records at
+the tick level. The order flow data is extremely valuable for market analysis
+as it equips traders with essential insights for making informed decisions.
+However, extracting and effectively utilizing order flow data present
+challenges due to the large volume of data involved and the limitations of
+traditional factor mining techniques, which are primarily designed for
+coarser-level stock data. To address these challenges, we propose a novel
+framework that aims to effectively extract essential factors from order flow
+data for diverse downstream tasks across different granularities and
+scenarios. Our method consists of a Context Encoder and a Factor Extractor.
+The Context Encoder learns an embedding for the current order flow data
+segment's context by considering both the expected and actual market state.
+In addition, the Factor Extractor uses unsupervised learning methods to
+select the important signals that are most distinct from the majority within
+the given context. The extracted factors are then utilized for downstream
+tasks. In empirical studies, our proposed framework efficiently handles an
+entire year of stock order flow data across diverse scenarios, offering a
+broader range of applications compared to existing tick-level approaches that
+are limited to only a few days of stock data. We demonstrate that our method
+extracts superior factors from order flow data, enabling significant
+improvements in stock trend prediction and order execution tasks at the
+second and minute levels.
+
+
+
+
+
+ + ☆ Is Self-Supervised Pretraining Good for Extrapolation in Molecular + Property Prediction? + + +
+ The prediction of material properties plays a crucial role in the
+development and discovery of materials in diverse applications, such as
+batteries, semiconductors, catalysts, and pharmaceuticals. Recently, there
+has been a growing interest in employing data-driven approaches by using
+machine learning technologies, in combination with conventional theoretical
+calculations. In materials science, the prediction of unobserved values,
+commonly referred to as extrapolation, is particularly critical for property
+prediction as it enables researchers to gain insight into materials beyond
+the limits of available data. However, even with the recent advancements in
+powerful machine learning models, accurate extrapolation is still widely
+recognized as a significantly challenging problem. On the other hand,
+self-supervised pretraining is a machine learning technique where a model is
+first trained on unlabeled data using relatively simple pretext tasks before
+being trained on labeled data for target tasks. As self-supervised
+pretraining can effectively utilize material data without observed property
+values, it has the potential to improve the model's extrapolation ability. In
+this paper, we clarify how such self-supervised pretraining can enhance
+extrapolation performance. We propose an experimental framework for the
+demonstration and empirically reveal that while models are unable to
+accurately extrapolate absolute property values, self-supervised pretraining
+enables them to learn relative tendencies of unobserved property values and
+improve extrapolation performance.
+
+
+
+
+
+ + ☆ How to Mask in Error Correction Code Transformer: Systematic and Double + Masking + + +
+ In communication and storage systems, error correction codes (ECCs) are +pivotal in ensuring data reliability. As deep learning's applicability has +broadened across diverse domains, there is a growing research focus on neural +network-based decoders that outperform traditional decoding algorithms. Among +these neural decoders, Error Correction Code Transformer (ECCT) has achieved +the state-of-the-art performance, outperforming other methods by large margins. +To further enhance the performance of ECCT, we propose two novel methods. +First, leveraging the systematic encoding technique of ECCs, we introduce a new +masking matrix for ECCT, aiming to improve the performance and reduce the +computational complexity. Second, we propose a novel transformer architecture +of ECCT called a double-masked ECCT. This architecture employs two different +mask matrices in a parallel manner to learn more diverse features of the +relationship between codeword bits in the masked self-attention blocks. +Extensive simulation results show that the proposed double-masked ECCT +outperforms the conventional ECCT, achieving the state-of-the-art decoding +performance with significant margins. + +
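The proposal revolves around how a mask matrix enters the self-attention scores. The sketch below shows only the generic mechanism (an additive mask on single-head attention); the paper's systematic and double masks, derived from the code structure, are not reproduced here.

```python
import torch

def masked_self_attention(q, k, v, mask):
    """Single-head attention with an additive mask on the score matrix.

    q, k, v: (n, d) tensors; mask: (n, n) with 0 where interaction between
    codeword bits is allowed and -inf where the mask matrix forbids it.
    """
    scores = q @ k.transpose(-2, -1) / (q.size(-1) ** 0.5)
    return torch.softmax(scores + mask, dim=-1) @ v
```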
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ S-Mixup: Structural Mixup for Graph Neural Networks CIKM 2023 + + +
+ Existing studies for applying the mixup technique on graphs mainly focus on +graph classification tasks, while the research in node classification is still +under-explored. In this paper, we propose a novel mixup augmentation for node +classification called Structural Mixup (S-Mixup). The core idea is to take into +account the structural information while mixing nodes. Specifically, S-Mixup +obtains pseudo-labels for unlabeled nodes in a graph along with their +prediction confidence via a Graph Neural Network (GNN) classifier. These serve +as the criteria for the composition of the mixup pool for both inter and +intra-class mixups. Furthermore, we utilize the edge gradient obtained from the +GNN training and propose a gradient-based edge selection strategy for selecting +edges to be attached to the nodes generated by the mixup. Through extensive +experiments on real-world benchmark datasets, we demonstrate the effectiveness +of S-Mixup evaluated on the node classification task. We observe that S-Mixup +enhances the robustness and generalization performance of GNNs, especially in +heterophilous situations. The source code of S-Mixup can be found at +\url{https://github.com/SukwonYun/S-Mixup} + +
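At its core, mixup forms convex combinations of two examples and their labels; S-Mixup applies this to nodes guided by pseudo-labels, confidence, and edge gradients. The sketch below shows only the basic combination step, with an arbitrary mixing coefficient.

```python
import numpy as np

def node_mixup(x_i, x_j, y_i, y_j, lam=0.7):
    """Convex combination of two node feature vectors and their (soft) labels."""
    x_mix = lam * x_i + (1.0 - lam) * x_j
    y_mix = lam * y_i + (1.0 - lam) * y_j
    return x_mix, y_mix
```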
+
+ comment: CIKM 2023 (Short Paper) +
+
+
+
+
+ + ☆ Safety Filter Design for Neural Network Systems via Convex Optimization + + +
+ With the increase in data availability, it has been widely demonstrated that +neural networks (NN) can capture complex system dynamics precisely in a +data-driven manner. However, the architectural complexity and nonlinearity of +the NNs make it challenging to synthesize a provably safe controller. In this +work, we propose a novel safety filter that relies on convex optimization to +ensure safety for a NN system, subject to additive disturbances that are +capable of capturing modeling errors. Our approach leverages tools from NN +verification to over-approximate NN dynamics with a set of linear bounds, +followed by an application of robust linear MPC to search for controllers that +can guarantee robust constraint satisfaction. We demonstrate the efficacy of +the proposed framework numerically on a nonlinear pendulum system. + +
+
+ comment: This paper has been accepted to the 2023 62nd IEEE Conference on + Decision and Control (CDC) +
+
+
+
+
+ + ☆ Rigid Transformations for Stabilized Lower Dimensional Space to Support + Subsurface Uncertainty Quantification and Interpretation + + +
+ Subsurface datasets inherently possess big data characteristics such as vast +volume, diverse features, and high sampling speeds, further compounded by the +curse of dimensionality from various physical, engineering, and geological +inputs. Among the existing dimensionality reduction (DR) methods, nonlinear +dimensionality reduction (NDR) methods, especially Metric-multidimensional +scaling (MDS), are preferred for subsurface datasets due to their inherent +complexity. While MDS retains intrinsic data structure and quantifies +uncertainty, its limitations include unstabilized unique solutions invariant to +Euclidean transformations and an absence of out-of-sample points (OOSP) +extension. To enhance subsurface inferential and machine learning workflows, +datasets must be transformed into stable, reduced-dimension representations +that accommodate OOSP. + Our solution employs rigid transformations for a stabilized Euclidean +invariant representation for LDS. By computing an MDS input dissimilarity +matrix, and applying rigid transformations on multiple realizations, we ensure +transformation invariance and integrate OOSP. This process leverages a convex +hull algorithm and incorporates loss function and normalized stress for +distortion quantification. We validate our approach with synthetic data, +varying distance metrics, and real-world wells from the Duvernay Formation. +Results confirm our method's efficacy in achieving consistent LDS +representations. Furthermore, our proposed "stress ratio" (SR) metric provides +insight into uncertainty, beneficial for model adjustments and inferential +analysis. Consequently, our workflow promises enhanced repeatability and +comparability in NDR for subsurface energy resource engineering and associated +big data workflows. + +
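Aligning one low-dimensional realization to a reference with a rigid transformation can be done via orthogonal Procrustes analysis; a minimal NumPy sketch follows. Whether the paper restricts to proper rotations, and how it handles out-of-sample points, the convex hull, and the stress ratio, is not shown here.

```python
import numpy as np

def rigid_align(X, X_ref):
    """Align embedding X to X_ref by a translation plus an orthogonal map."""
    Xc = X - X.mean(axis=0)
    Rc = X_ref - X_ref.mean(axis=0)
    U, _, Vt = np.linalg.svd(Xc.T @ Rc)
    Q = U @ Vt                      # orthogonal Procrustes rotation/reflection
    return Xc @ Q + X_ref.mean(axis=0)
```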
+
+ comment: 30 pages, 17 figures, Submitted to Computational Geosciences Journal +
+
+
+
+
+ + ♻ ☆ AI-Assisted Discovery of Quantitative and Formal Models in Social + Science + + +
+ In social science, formal and quantitative models, such as ones describing +economic growth and collective action, are used to formulate mechanistic +explanations, provide predictions, and uncover questions about observed +phenomena. Here, we demonstrate the use of a machine learning system to aid the +discovery of symbolic models that capture nonlinear and dynamical relationships +in social science datasets. By extending neuro-symbolic methods to find compact +functions and differential equations in noisy and longitudinal data, we show +that our system can be used to discover interpretable models from real-world +data in economics and sociology. Augmenting existing workflows with symbolic +regression can help uncover novel relationships and explore counterfactual +models during the scientific process. We propose that this AI-assisted +framework can bridge parametric and non-parametric models commonly employed in +social science research by systematically exploring the space of nonlinear +models and enabling fine-grained control over expressivity and +interpretability. + +
+
+ comment: 19 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Data Selection: A Surprisingly Effective and General Principle for + Building Small Interpretable Models + + +
+ We present convincing empirical evidence for an effective and general
+strategy for building accurate small models. Such models are attractive for
+interpretability and also find use in resource-constrained environments. The
+strategy is to learn the training distribution instead of using data from the
+test distribution. The distribution learning algorithm is not a contribution
+of this work; we highlight the broad usefulness of this simple strategy on a
+diverse set of tasks, and as such these rigorous empirical results are our
+contribution. We apply it to the tasks of (1) building cluster explanation
+trees, (2) prototype-based classification, and (3) classification using
+Random Forests, and show that it improves the accuracy of weak traditional
+baselines to the point that they are surprisingly competitive with
+specialized modern techniques.
+ This strategy is also versatile with respect to the notion of model size. In
+the first two tasks, model size is identified by the number of leaves in the
+tree and the number of prototypes, respectively. In the final task involving
+Random Forests, the strategy is shown to be effective even when model size is
+determined by more than one factor: the number of trees and their maximum
+depth.
+ Positive results using multiple datasets are presented that are shown to be
+statistically significant. These lead us to conclude that this strategy is
+both effective, i.e., leads to significant improvements, and general, i.e.,
+is applicable to different tasks and model families, and therefore merits
+further attention in domains that require small accurate models.
+
+
+
+
+
+ + ♻ ☆ Decision-Focused Learning: Foundations, State of the Art, Benchmark and + Future Opportunities + + +
+ Decision-focused learning (DFL) is an emerging paradigm in machine learning
+which trains a model to optimize decisions, integrating prediction and
+optimization in an end-to-end system. This paradigm holds the promise to
+revolutionize decision-making in many real-world applications which operate
+under uncertainty, where the estimation of unknown parameters within these
+decision models often becomes a substantial roadblock. This paper presents a
+comprehensive review of DFL. It provides an in-depth analysis of the various
+techniques devised to integrate machine learning and optimization models,
+introduces a taxonomy of DFL methods distinguished by their unique
+characteristics, and conducts an extensive empirical evaluation of these
+methods, proposing suitable benchmark datasets and tasks for DFL. Finally,
+the study provides valuable insights into current and potential future
+avenues in DFL research.
+
+
+ comment: Experimental Survey and Benchmarking +
+
+
+
+
+ + ♻ ☆ Large-Scale Traffic Congestion Prediction based on Multimodal Fusion and + Representation Mapping + + +
+ As urbanisation progresses, urban transportation systems become critical to
+the development of cities and to the quality of life of their citizens. Among
+the related tasks, judging traffic congestion by analysing congestion factors
+is one of the most important. Recently, various traditional and
+machine-learning-based models have been introduced for predicting traffic
+congestion. However, these models either aggregate massive congestion factors
+poorly or fail to make accurate predictions for every precise location in a
+large-scale space. To alleviate these problems, this paper proposes a novel
+end-to-end framework based on convolutional neural networks. Building on
+learned representations, the framework introduces a novel multimodal fusion
+module and a novel representation mapping module to achieve traffic
+congestion predictions at arbitrary query locations on a large-scale map,
+combined with various global reference information. The proposed framework
+achieves significant results and efficient inference on real-world
+large-scale datasets.
+
+
+
+
+
+ + ♻ ☆ Sensitivity-Aware Mixed-Precision Quantization and Width Optimization of + Deep Neural Networks Through Cluster-Based Tree-Structured Parzen Estimation + + +
+ As the complexity and computational demands of deep learning models rise, the +need for effective optimization methods for neural network designs becomes +paramount. This work introduces an innovative search mechanism for +automatically selecting the best bit-width and layer-width for individual +neural network layers. This leads to a marked enhancement in deep neural +network efficiency. The search domain is strategically reduced by leveraging +Hessian-based pruning, ensuring the removal of non-crucial parameters. +Subsequently, we detail the development of surrogate models for favorable and +unfavorable outcomes by employing a cluster-based tree-structured Parzen +estimator. This strategy allows for a streamlined exploration of architectural +possibilities and swift pinpointing of top-performing designs. Through rigorous +testing on well-known datasets, our method proves its distinct advantage over +existing methods. Compared to leading compression strategies, our approach +records an impressive 20% decrease in model size without compromising accuracy. +Additionally, our method boasts a 12x reduction in search time relative to the +best search-focused strategies currently available. As a result, our proposed +method represents a leap forward in neural network design optimization, paving +the way for quick model design and implementation in settings with limited +resources, thereby propelling the potential of scalable deep learning +solutions. + +
+
+
+
+
+ + ♻ ☆ Box$^2$EL: Concept and Role Box Embeddings for the Description Logic + EL++ + + +
+ Description logic (DL) ontologies extend knowledge graphs (KGs) with +conceptual information and logical background knowledge. In recent years, there +has been growing interest in inductive reasoning techniques for such +ontologies, which promise to complement classical deductive reasoning +algorithms. Similar to KG completion, several existing approaches learn +ontology embeddings in a latent space, while additionally ensuring that they +faithfully capture the logical semantics of the underlying DL. However, they +suffer from several shortcomings, mainly due to a limiting role representation. +We propose Box$^2$EL, which represents both concepts and roles as boxes (i.e., +axis-aligned hyperrectangles) and demonstrate how it overcomes the limitations +of previous methods. We theoretically prove the soundness of our model and +conduct an extensive experimental evaluation, achieving state-of-the-art +results across a variety of datasets. As part of our evaluation, we introduce a +novel benchmark for subsumption prediction involving both atomic and complex +concepts. + +
+
+
+
+
+ + ♻ ☆ Disentangled Representation Learning + + +
+ Disentangled Representation Learning (DRL) aims to learn a model capable of
+identifying and disentangling the underlying factors hidden in the observable
+data in representation form. The process of separating underlying factors of
+variation into variables with semantic meaning benefits the learning of
+explainable representations of data, imitating the meaningful understanding
+process of humans when observing an object or relation. As a general
+learning strategy, DRL has demonstrated its power in improving model
+explainability, controllability, robustness, and generalization capacity in a
+wide range of scenarios such as computer vision, natural language processing,
+and data mining. In this article, we comprehensively review DRL from various
+aspects including motivations, definitions, methodologies, evaluations,
+applications, and model designs. We discuss works on DRL based on two
+well-recognized definitions, i.e., the Intuitive Definition and the Group
+Theory Definition. We further categorize the methodologies for DRL into four
+groups, i.e., Traditional Statistical Approaches, Variational Auto-encoder
+Based Approaches, Generative Adversarial Networks Based Approaches,
+Hierarchical Approaches and Other Approaches. We also analyze principles to
+design different DRL models that may benefit different tasks in practical
+applications. Finally, we point out challenges in DRL as well as potential
+research directions deserving future investigations. We believe this work may
+provide insights for promoting DRL research in the community.
+
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Black Box Few-Shot Adaptation for Vision-Language models ICCV 2023 + + +
+ Vision-Language (V-L) models trained with contrastive learning to align the
+visual and language modalities have been shown to be strong few-shot
+learners. Soft prompt learning is the method of choice for few-shot
+downstream adaptation, aiming to bridge the modality gap caused by the
+distribution shift induced by the new domain. While parameter-efficient,
+prompt learning still requires access to the model weights and can be
+computationally infeasible for large models with billions of parameters. To
+address these shortcomings, in this work, we describe a black-box method for
+V-L few-shot adaptation that (a) operates on pre-computed image and text
+features and hence works without access to the model's weights, (b) is orders
+of magnitude faster at training time, (c) is amenable to both supervised and
+unsupervised training, and (d) can even be used to align image and text
+features computed from uni-modal models. To achieve this, we propose Linear
+Feature Alignment (LFA), a simple linear approach for V-L re-alignment in the
+target domain. LFA is initialized from a closed-form solution to a
+least-squares problem and then it is iteratively updated by minimizing a
+re-ranking loss. Despite its simplicity, our approach can even surpass
+soft-prompt learning methods as shown by extensive experiments on 11 image
+and 2 video datasets.
+
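A closed-form least-squares initialisation of a linear map between pre-computed image and text features might look like the sketch below. The mapping direction, the ridge regulariser, and the omission of the iterative re-ranking refinement are assumptions rather than the paper's exact recipe.

```python
import numpy as np

def lfa_init(img_feats, txt_feats, reg=1e-4):
    """Ridge-regularised least-squares map W with img_feats @ W ~ txt_feats."""
    d = img_feats.shape[1]
    A = img_feats.T @ img_feats + reg * np.eye(d)
    B = img_feats.T @ txt_feats
    return np.linalg.solve(A, B)
```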
+
+ comment: Published at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ A Distributionally Robust Approach to Regret Optimal Control using the + Wasserstein Distance + + +
+ This paper proposes a distributionally robust approach to regret optimal +control of discrete-time linear dynamical systems with quadratic costs subject +to a stochastic additive disturbance on the state process. The underlying +probability distribution of the disturbance process is unknown, but assumed to +lie in a given ball of distributions defined in terms of the type-2 Wasserstein +distance. In this framework, strictly causal linear disturbance feedback +controllers are designed to minimize the worst-case expected regret. The regret +incurred by a controller is defined as the difference between the cost it +incurs in response to a realization of the disturbance process and the cost +incurred by the optimal noncausal controller which has perfect knowledge of the +disturbance process realization at the outset. Building on a well-established +duality theory for optimal transport problems, we derive a reformulation of the +minimax regret optimal control problem as a tractable semidefinite program. +Using the equivalent dual reformulation, we characterize a worst-case +distribution achieving the worst-case expected regret in relation to the +distribution at the center of the Wasserstein ball. We compare the minimax +regret optimal control design method with the distributionally robust optimal +control approach using an illustrative example and numerical experiments. + +
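In symbols, with notation assumed here rather than taken from the paper, the quantities described above can be written as:

```latex
% J(\pi; w): quadratic cost of controller \pi under disturbance realisation w;
% \pi^\star(w): optimal noncausal controller;
% \mathcal{B}_{W_2}(\hat{\mathbb{P}}, \rho): type-2 Wasserstein ball of radius
% \rho around a nominal distribution \hat{\mathbb{P}}.
\[
  \mathrm{Regret}(\pi, w) = J(\pi; w) - J\bigl(\pi^\star(w); w\bigr),
  \qquad
  \min_{\pi}\;
  \sup_{\mathbb{P} \in \mathcal{B}_{W_2}(\hat{\mathbb{P}}, \rho)}\;
  \mathbb{E}_{w \sim \mathbb{P}}\bigl[\mathrm{Regret}(\pi, w)\bigr].
\]
```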
+
+ comment: 8 pages, 3 figures, to appear in the proceedings of the 2023 IEEE + Conference on Decision and Control (CDC) +
+
+
+
+
+ + ♻ ☆ RD-DPP: Rate-Distortion Theory Meets Determinantal Point Process to + Diversify Learning Data Samples + + +
+ In some practical learning tasks, such as traffic video analysis, the number +of available training samples is restricted by different factors, such as +limited communication bandwidth and computation power. Determinantal Point +Process (DPP) is a common method for selecting the most diverse samples to +enhance learning quality. However, the number of selected samples is restricted +to the rank of the kernel matrix implied by the dimensionality of data samples. +Secondly, it is not easily customizable to different learning tasks. In this +paper, we propose a new way of measuring task-oriented diversity based on the +Rate-Distortion (RD) theory, appropriate for multi-level classification. To +this end, we establish a fundamental relationship between DPP and RD theory. We +observe that the upper bound of the diversity of data selected by DPP has a +universal trend of $\textit{phase transition}$, which suggests that DPP is +beneficial only at the beginning of sample accumulation. This led to the design +of a bi-modal method, where RD-DPP is used in the first mode to select initial +data samples, then classification inconsistency (as an uncertainty measure) is +used to select the subsequent samples in the second mode. This phase transition +solves the limitation to the rank of the similarity matrix. Applying our method +to six different datasets and five benchmark models suggests that our method +consistently outperforms random selection, DPP-based methods, and alternatives +like uncertainty-based and coreset methods under all sampling budgets, while +exhibiting high generalizability to different learning tasks. + +
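DPP-style selection is often approximated greedily by adding, at each step, the item that most increases the log-determinant of the selected kernel submatrix. The sketch below shows that generic greedy step only; the rate-distortion weighting and the uncertainty-driven second mode of the bi-modal method are not included.

```python
import numpy as np

def greedy_logdet_selection(K, budget):
    """Greedily pick items maximising log det of the selected kernel block."""
    selected, remaining = [], list(range(K.shape[0]))
    for _ in range(min(budget, len(remaining))):
        gains = []
        for i in remaining:
            idx = selected + [i]
            sign, logdet = np.linalg.slogdet(K[np.ix_(idx, idx)])
            gains.append(logdet if sign > 0 else -np.inf)
        best = remaining[int(np.argmax(gains))]
        selected.append(best)
        remaining.remove(best)
    return selected
```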
+
+
+
+
+ + ♻ ☆ EfficientTrain: Exploring Generalized Curriculum Learning for Training + Visual Backbones ICCV 2023 + + +
+ The superior performance of modern deep networks usually comes with a costly +training procedure. This paper presents a new curriculum learning approach for +the efficient training of visual backbones (e.g., vision Transformers). Our +work is inspired by the inherent learning dynamics of deep networks: we +experimentally show that at an earlier training stage, the model mainly learns +to recognize some 'easier-to-learn' discriminative patterns within each +example, e.g., the lower-frequency components of images and the original +information before data augmentation. Driven by this phenomenon, we propose a +curriculum where the model always leverages all the training data at each +epoch, while the curriculum starts with only exposing the 'easier-to-learn' +patterns of each example, and introduces gradually more difficult patterns. To +implement this idea, we 1) introduce a cropping operation in the Fourier +spectrum of the inputs, which enables the model to learn from only the +lower-frequency components efficiently, 2) demonstrate that exposing the +features of original images amounts to adopting weaker data augmentation, and +3) integrate 1) and 2) and design a curriculum learning schedule with a +greedy-search algorithm. The resulting approach, EfficientTrain, is simple, +general, yet surprisingly effective. As an off-the-shelf method, it reduces the +wall-time training cost of a wide variety of popular models (e.g., ResNet, +ConvNeXt, DeiT, PVT, Swin, and CSWin) by >1.5x on ImageNet-1K/22K without +sacrificing accuracy. It is also effective for self-supervised learning (e.g., +MAE). Code is available at https://github.com/LeapLabTHU/EfficientTrain. + +
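The low-frequency part of the curriculum can be pictured as cropping the centered 2D Fourier spectrum of each image and transforming back; a hedged PyTorch sketch follows. Crop sizes, schedules, and the rescaling choice are illustrative assumptions.

```python
import torch

def low_frequency_crop(img, keep):
    """Keep a centered keep x keep block of the shifted spectrum of img (..., H, W)."""
    spec = torch.fft.fftshift(torch.fft.fft2(img), dim=(-2, -1))
    h, w = spec.shape[-2:]
    top, left = (h - keep) // 2, (w - keep) // 2
    cropped = spec[..., top:top + keep, left:left + keep]
    out = torch.fft.ifft2(torch.fft.ifftshift(cropped, dim=(-2, -1))).real
    return out * (keep * keep) / (h * w)   # keep intensities on a comparable scale
```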
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ QBSD: Quartile-Based Seasonality Decomposition for Cost-Effective Time + Series Forecasting + + +
+ In the telecom domain, precise forecasting of time series patterns, such as
+cell key performance indicators (KPIs), plays a pivotal role in enhancing
+service quality and operational efficiency. State-of-the-art forecasting
+approaches prioritize forecasting accuracy at the expense of computational
+performance, rendering them less suitable for data-intensive applications
+encompassing systems with a multitude of time series variables. To address
+this issue, we introduce QBSD, a live forecasting approach tailored to
+optimize the trade-off between accuracy and computational complexity. We have
+evaluated the performance of QBSD against state-of-the-art forecasting
+approaches on publicly available datasets. We have also extended this
+investigation to our curated network KPI dataset, now publicly accessible, to
+showcase the effect of dynamic operating ranges that vary with time. The
+results demonstrate that the proposed method excels in runtime efficiency
+compared to the leading algorithms available while maintaining competitive
+forecast accuracy.
+
+
+
+
+
+ + ♻ ☆ Latent Dynamical Implicit Diffusion Processes + + +
+ Latent dynamical models are commonly used to learn the distribution of a +latent dynamical process that represents a sequence of noisy data samples. +However, producing samples from such models with high fidelity is challenging +due to the complexity and variability of latent and observation dynamics. +Recent advances in diffusion-based generative models, such as DDPM and NCSN, +have shown promising alternatives to state-of-the-art latent generative models, +such as Neural ODEs, RNNs, and Normalizing flow networks, for generating +high-quality sequential samples from a prior distribution. However, their +application in modeling sequential data with latent dynamical models is yet to +be explored. Here, we propose a novel latent variable model named latent +dynamical implicit diffusion processes (LDIDPs), which utilizes implicit +diffusion processes to sample from dynamical latent processes and generate +sequential observation samples accordingly. We tested LDIDPs on synthetic and +simulated neural decoding problems. We demonstrate that LDIDPs can accurately +learn the dynamics over latent dimensions. Furthermore, the implicit sampling +method allows for the computationally efficient generation of high-quality +sequential data samples from the latent and observation spaces. + +
+
+
    comment: I request a withdrawal because there are no experiments with
  real-world datasets and also the method section requires major changes to
  look mathematically sound
  
+
+
+
+
+ + ♻ ☆ LLM Cognitive Judgements Differ From Human + + +
+ Large Language Models (LLMs) have lately been in the spotlight for
+researchers, businesses, and consumers alike. While the linguistic
+capabilities of such models have been studied extensively, there is growing
+interest in investigating them as cognitive subjects. In the present work I
+examine the capabilities of GPT-3 and ChatGPT on a limited-data inductive
+reasoning task from the cognitive science literature. The results suggest
+that these models' cognitive judgements are not human-like.
+
+
+ comment: 7 pages, 1 figure. License changed to CC BY-NC-SA +
+
+
+
+
+ + ♻ ☆ Bluetooth and WiFi Dataset for Real World RF Fingerprinting of + Commercial Devices + + +
+ RF fingerprinting is emerging as a physical layer security scheme to
+identify illegitimate and/or unauthorized emitters sharing the RF spectrum.
+However, due to the lack of publicly accessible real-world datasets, most
+research focuses on generating synthetic waveforms with software-defined
+radios (SDRs), which are not suited for practical deployment settings. On the
+other hand, the limited datasets that are available cover only chipsets that
+generate one kind of waveform. Commercial off-the-shelf (COTS) combo chipsets
+that support two wireless standards (for example, WiFi and Bluetooth) over a
+shared dual-band antenna, such as those found in laptops, adapters, wireless
+chargers, and Raspberry Pis, are becoming ubiquitous in the IoT realm. Hence,
+to keep up with the modern IoT environment, there is a pressing need for
+real-world open datasets capturing emissions from these combo chipsets
+transmitting heterogeneous communication protocols. To this end, we capture
+the first known emissions from COTS IoT chipsets transmitting WiFi and
+Bluetooth under two different time frames. The different time frames are
+essential to rigorously evaluate the generalization capability of the models.
+To ensure widespread use, each capture within the comprehensive 72 GB dataset
+is long enough (40 MSamples) to support diverse input tensor lengths and
+formats. Finally, the dataset also comprises emissions at varying signal
+powers to account for the feeble-to-high signal strength emissions
+encountered in a real-world setting.
+
+
+ comment: Revision Under Review +
+
+
+
+
+ + ♻ ☆ Deep Unrolling Networks with Recurrent Momentum Acceleration for + Nonlinear Inverse Problems + + +
+ Combining the strengths of model-based iterative algorithms and data-driven +deep learning solutions, deep unrolling networks (DuNets) have become a popular +tool to solve inverse imaging problems. While DuNets have been successfully +applied to many linear inverse problems, nonlinear problems tend to impair the +performance of the method. Inspired by momentum acceleration techniques that +are often used in optimization algorithms, we propose a recurrent momentum +acceleration (RMA) framework that uses a long short-term memory recurrent +neural network (LSTM-RNN) to simulate the momentum acceleration process. The +RMA module leverages the ability of the LSTM-RNN to learn and retain knowledge +from the previous gradients. We apply RMA to two popular DuNets -- the learned +proximal gradient descent (LPGD) and the learned primal-dual (LPD) methods, +resulting in LPGD-RMA and LPD-RMA respectively. We provide experimental results +on two nonlinear inverse problems: a nonlinear deconvolution problem, and an +electrical impedance tomography problem with limited boundary measurements. In +the first experiment we have observed that the improvement due to RMA largely +increases with respect to the nonlinearity of the problem. The results of the +second example further demonstrate that the RMA schemes can significantly +improve the performance of DuNets in strongly ill-posed problems. + +
+
+
+
+
+ + ♻ ☆ Echoes: Unsupervised Debiasing via Pseudo-bias Labeling in an Echo + Chamber + + +
+ Neural networks often learn spurious correlations when exposed to biased +training data, leading to poor performance on out-of-distribution data. A +biased dataset can be divided, according to biased features, into bias-aligned +samples (i.e., with biased features) and bias-conflicting samples (i.e., +without biased features). Recent debiasing works typically assume that no bias +label is available during the training phase, as obtaining such information is +challenging and labor-intensive. Following this unsupervised assumption, +existing methods usually train two models: a biased model specialized to learn +biased features and a target model that uses information from the biased model +for debiasing. This paper first presents experimental analyses revealing that +the existing biased models overfit to bias-conflicting samples in the training +data, which negatively impacts the debiasing performance of the target models. +To address this issue, we propose a straightforward and effective method called +Echoes, which trains a biased model and a target model with a different +strategy. We construct an "echo chamber" environment by reducing the weights of +samples which are misclassified by the biased model, to ensure the biased model +fully learns the biased features without overfitting to the bias-conflicting +samples. The biased model then assigns lower weights on the bias-conflicting +samples. Subsequently, we use the inverse of the sample weights of the biased +model for training the target model. Experiments show that our approach +achieves superior debiasing results compared to the existing baselines on both +synthetic and real-world datasets. Our code is available at +https://github.com/isruihu/Echoes. + +
+
+ comment: Accepted by ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ SpecInfer: Accelerating Generative Large Language Model Serving with + Speculative Inference and Token Tree Verification + + +
+ The high computational and memory requirements of generative large language
+models (LLMs) make it challenging to serve them quickly and cheaply. This
+paper introduces SpecInfer, an LLM serving system that accelerates generative
+LLM inference with speculative inference and token tree verification. A key
+insight behind SpecInfer is to combine various collectively boost-tuned small
+language models to jointly predict the LLM's outputs; the predictions are
+organized as a token tree, whose nodes each represent a candidate token
+sequence. The correctness of all candidate token sequences represented by a
+token tree is verified against the LLM in parallel using a novel tree-based
+parallel decoding mechanism. SpecInfer uses an LLM as a token tree verifier
+instead of an incremental decoder, which significantly reduces the end-to-end
+latency and computational requirement for serving generative LLMs while
+provably preserving model quality. Our evaluation shows that SpecInfer
+outperforms existing LLM serving systems by 1.3-2.4x for distributed LLM
+inference and by 2.6-3.5x for offloading-based LLM inference, while
+preserving the same generative performance. SpecInfer is publicly available
+at https://github.com/flexflow/FlexFlow/tree/inference.
+
+
+
+
+
+ + ♻ ☆ How does over-squashing affect the power of GNNs? + + +
+ Graph Neural Networks (GNNs) are the state-of-the-art model for machine +learning on graph-structured data. The most popular class of GNNs operate by +exchanging information between adjacent nodes, and are known as Message Passing +Neural Networks (MPNNs). Given their widespread use, understanding the +expressive power of MPNNs is a key question. However, existing results +typically consider settings with uninformative node features. In this paper, we +provide a rigorous analysis to determine which function classes of node +features can be learned by an MPNN of a given capacity. We do so by measuring +the level of pairwise interactions between nodes that MPNNs allow for. This +measure provides a novel quantitative characterization of the so-called +over-squashing effect, which is observed to occur when a large volume of +messages is aggregated into fixed-size vectors. Using our measure, we prove +that, to guarantee sufficient communication between pairs of nodes, the +capacity of the MPNN must be large enough, depending on properties of the input +graph structure, such as commute times. For many relevant scenarios, our +analysis results in impossibility statements in practice, showing that +over-squashing hinders the expressive power of MPNNs. We validate our +theoretical findings through extensive controlled experiments and ablation +studies. + +
+
+ comment: 37 pages +
+
+
+
+
+ + ♻ ☆ Bi-level Contrastive Learning for Knowledge-Enhanced Molecule + Representations + + +
+ Molecule representation learning underpins diverse downstream applications
+such as molecular property and side effect understanding and prediction. In
+this paper, we recognize the two-level structure of an individual molecule:
+it has an intrinsic graph structure and is also a node in a large molecular
+knowledge graph. We present GODE, a new approach that seamlessly integrates
+graph representations of individual molecules with multi-domain biomedical
+data from knowledge graphs. By pre-training two graph neural networks (GNNs)
+on different graph structures, combined with contrastive learning, GODE
+adeptly fuses molecular structures with their corresponding knowledge graph
+substructures. This fusion results in a more robust and informative
+representation, enhancing molecular property prediction by harnessing both
+chemical and biological information. Finetuned on 11 chemical property tasks,
+our model surpasses benchmarks, achieving average ROC-AUC improvements of
+14.5%, 9.8%, and 7.3% on the BBBP, SIDER, and Tox21 datasets. In regression
+tasks on the ESOL and QM7 datasets, we achieve average improvements of 21.0%
+and 29.6% in RMSE and MAE, setting a new field benchmark.
+
+
+
+
+
+ + ♻ ☆ Text-only domain adaptation for end-to-end ASR using integrated + text-to-mel-spectrogram generator INTERSPEECH 2023 + + +
+ We propose an end-to-end Automatic Speech Recognition (ASR) system that can +be trained on transcribed speech data, text-only data, or a mixture of both. +The proposed model uses an integrated auxiliary block for text-based training. +This block combines a non-autoregressive multi-speaker text-to-mel-spectrogram +generator with a GAN-based enhancer to improve the spectrogram quality. The +proposed system can generate a mel-spectrogram dynamically during training. It +can be used to adapt the ASR model to a new domain by using text-only data from +this domain. We demonstrate that the proposed training method significantly +improves ASR accuracy compared to the system trained on transcribed speech +only. It also surpasses cascade TTS systems with the vocoder in the adaptation +quality and training speed. + +
+
+ comment: Accepted to INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ An ensemble of VisNet, Transformer-M, and pretraining models for + molecular property prediction in OGB Large-Scale Challenge @ NeurIPS 2022 + + +
+ In this technical report, we provide our solution for the OGB-LSC 2022
+Graph Regression Task. The target of this task is to predict the quantum
+chemical property, the HOMO-LUMO gap, for a given molecule on the PCQM4Mv2
+dataset. In the competition, we designed two kinds of models:
+Transformer-M-ViSNet, a geometry-enhanced graph neural network for fully
+connected molecular graphs, and Pretrained-3D-ViSNet, a ViSNet pretrained by
+distilling geometric information from optimized structures. With an ensemble
+of 22 models, the ViSNet Team achieved an MAE of 0.0723 eV on the
+test-challenge set, dramatically reducing the error by 39.75% compared with
+the best method in last year's competition.
+
+
+
+
+
+ + ♻ ☆ Unsafe Diffusion: On the Generation of Unsafe Images and Hateful Memes + From Text-To-Image Models + + +
+ State-of-the-art Text-to-Image models like Stable Diffusion and DALLE$\cdot$2 +are revolutionizing how people generate visual content. At the same time, +society has serious concerns about how adversaries can exploit such models to +generate unsafe images. In this work, we focus on demystifying the generation +of unsafe images and hateful memes from Text-to-Image models. We first +construct a typology of unsafe images consisting of five categories (sexually +explicit, violent, disturbing, hateful, and political). Then, we assess the +proportion of unsafe images generated by four advanced Text-to-Image models +using four prompt datasets. We find that these models can generate a +substantial percentage of unsafe images; across four models and four prompt +datasets, 14.56% of all generated images are unsafe. When comparing the four +models, we find different risk levels, with Stable Diffusion being the most +prone to generating unsafe content (18.92% of all generated images are unsafe). +Given Stable Diffusion's tendency to generate more unsafe content, we evaluate +its potential to generate hateful meme variants if exploited by an adversary to +attack a specific individual or community. We employ three image editing +methods, DreamBooth, Textual Inversion, and SDEdit, which are supported by +Stable Diffusion. Our evaluation result shows that 24% of the generated images +using DreamBooth are hateful meme variants that present the features of the +original hateful meme and the target individual/community; these generated +images are comparable to hateful meme variants collected from the real world. +Overall, our results demonstrate that the danger of large-scale generation of +unsafe images is imminent. We discuss several mitigating measures, such as +curating training data, regulating prompts, and implementing safety filters, +and encourage better safeguard tools to be developed to prevent unsafe +generation. + +
+
+ comment: To Appear in the ACM Conference on Computer and Communications + Security, November 26, 2023 +
+
+
+
+
+ + ♻ ☆ 3D-aware Blending with Generative NeRFs ICCV 2023 + + +
+ Image blending aims to combine multiple images seamlessly. It remains +challenging for existing 2D-based methods, especially when input images are +misaligned due to differences in 3D camera poses and object shapes. To tackle +these issues, we propose a 3D-aware blending method using generative Neural +Radiance Fields (NeRF), including two key components: 3D-aware alignment and +3D-aware blending. For 3D-aware alignment, we first estimate the camera pose of +the reference image with respect to generative NeRFs and then perform 3D local +alignment for each part. To further leverage 3D information of the generative +NeRF, we propose 3D-aware blending that directly blends images on the NeRF's +latent representation space, rather than raw pixel space. Collectively, our +method outperforms existing 2D baselines, as validated by extensive +quantitative and qualitative evaluations with FFHQ and AFHQ-Cat. + +
+
+ comment: ICCV 2023, Project page: https://blandocs.github.io/blendnerf +
+
+
+
+
+ + ♻ ☆ Active Learning for Optimal Intervention Design in Causal Models + + +
+ Sequential experimental design to discover interventions that achieve a +desired outcome is a key problem in various domains including science, +engineering and public policy. When the space of possible interventions is +large, making an exhaustive search infeasible, experimental design strategies +are needed. In this context, encoding the causal relationships between the +variables, and thus the effect of interventions on the system, is critical for +identifying desirable interventions more efficiently. Here, we develop a causal +active learning strategy to identify interventions that are optimal, as +measured by the discrepancy between the post-interventional mean of the +distribution and a desired target mean. The approach employs a Bayesian update +for the causal model and prioritizes interventions using a carefully designed, +causally informed acquisition function. This acquisition function is evaluated +in closed form, allowing for fast optimization. The resulting algorithms are +theoretically grounded with information-theoretic bounds and provable +consistency results for linear causal models with known causal graph. We apply +our approach to both synthetic data and single-cell transcriptomic data from +Perturb-CITE-seq experiments to identify optimal perturbations that induce a +specific cell state transition. The causally informed acquisition function +generally outperforms existing criteria allowing for optimal intervention +design with fewer but carefully selected samples. + +
+
+
+
+
+ + ♻ ☆ Editing Language Model-based Knowledge Graph Embeddings + + +
+ Recent decades have witnessed the empirical success of framing Knowledge
+Graph (KG) embeddings via language models. However, language model-based KG
+embeddings are usually deployed as static artifacts, making them difficult to
+modify after deployment without re-training. To address this issue, in this
+paper we propose a new task of editing language model-based KG embeddings.
+This task is designed to facilitate rapid, data-efficient updates to KG
+embeddings without compromising the performance of other aspects. We build
+four new datasets: E-FB15k237, A-FB15k237, E-WN18RR, and A-WN18RR, and
+evaluate several knowledge editing baselines, demonstrating the limited
+ability of previous models to handle the proposed challenging task. We
+further propose a simple yet strong baseline dubbed KGEditor, which utilizes
+additional parametric layers of the hyper network to edit/add facts. Our
+comprehensive experimental results reveal that KGEditor excels at updating
+specific facts without impacting the overall performance, even when faced
+with limited training resources. Code and datasets are available at
+https://github.com/zjunlp/PromptKG/tree/main/deltaKG.
+
+
+ comment: Work in progress and the project website is + https://zjunlp.github.io/project/KGE_Editing/ +
+
+
+
+
+ + ♻ ☆ Neural radiance fields in the industrial and robotics domain: + applications, research opportunities and use cases + + +
+ The proliferation of technologies, such as extended reality (XR), has
+increased the demand for high-quality three-dimensional (3D) graphical
+representations. Industrial 3D applications encompass computer-aided design
+(CAD), finite element analysis (FEA), scanning, and robotics. However,
+current methods employed for industrial 3D representations suffer from high
+implementation costs and reliance on manual human input for accurate 3D
+modeling. To address these challenges, neural radiance fields (NeRFs) have
+emerged as a promising approach for learning 3D scene representations based
+on provided training 2D images. Despite a growing interest in NeRFs, their
+potential applications in various industrial subdomains are still
+unexplored. In this paper, we deliver a comprehensive examination of NeRF
+industrial applications while also providing direction for future research
+endeavors. We also present a series of proof-of-concept experiments that
+demonstrate the potential of NeRFs in the industrial domain. These
+experiments include NeRF-based video compression techniques and using NeRFs
+for 3D motion estimation in the context of collision avoidance. In the video
+compression experiment, our results show compression savings of up to 48\%
+and 74\% for resolutions of 1920x1080 and 300x168, respectively. The motion
+estimation experiment used a 3D animation of a robotic arm to train
+Dynamic-NeRF (D-NeRF) and achieved an average disparity-map peak
+signal-to-noise ratio (PSNR) of 23 dB and a structural similarity index
+measure (SSIM) of 0.97.
+
+
+
+
+
+ + ♻ ☆ STS-GAN: Can We Synthesize Solid Texture with High Fidelity from + Arbitrary 2D Exemplar? + + +
+ Solid texture synthesis (STS), an effective way to extend a 2D exemplar to a +3D solid volume, exhibits advantages in computational photography. However, +existing methods generally fail to accurately learn arbitrary textures, which +may result in the failure to synthesize solid textures with high fidelity. In +this paper, we propose a novel generative adversarial nets-based framework +(STS-GAN) to extend the given 2D exemplar to arbitrary 3D solid textures. In +STS-GAN, multi-scale 2D texture discriminators evaluate the similarity between +the given 2D exemplar and slices from the generated 3D texture, promoting the +3D texture generator synthesizing realistic solid textures. Finally, +experiments demonstrate that the proposed method can generate high-fidelity +solid textures with similar visual characteristics to the 2D exemplar. + +
+
+
+
+
+ + ♻ ☆ Relevant Entity Selection: Knowledge Graph Bootstrapping via Zero-Shot + Analogical Pruning + + +
+ Knowledge Graph Construction (KGC) can be seen as an iterative process +starting from a high quality nucleus that is refined by knowledge extraction +approaches in a virtuous loop. Such a nucleus can be obtained from knowledge +existing in an open KG like Wikidata. However, due to the size of such generic +KGs, integrating them as a whole may entail irrelevant content and scalability +issues. We propose an analogy-based approach that starts from seed entities of +interest in a generic KG, and keeps or prunes their neighboring entities. We +evaluate our approach on Wikidata through two manually labeled datasets that +contain either domain-homogeneous or -heterogeneous seed entities. We +empirically show that our analogy-based approach outperforms LSTM, Random +Forest, SVM, and MLP, with a drastically lower number of parameters. We also +evaluate its generalization potential in a transfer learning setting. These +results advocate for the further integration of analogy-based inference in +tasks related to the KG lifecycle. + +
+
+
+
+
+ + ♻ ☆ Beyond the Meta: Leveraging Game Design Parameters for Patch-Agnostic + Esport Analytics + + +
+ Esport games comprise a sizeable fraction of the global games market and
+are the fastest-growing segment in games. This has given rise to the domain
+of esports analytics, which uses telemetry data from games to inform players,
+coaches, broadcasters and other stakeholders. Compared to traditional sports,
+esport titles change rapidly, in terms of mechanics as well as rules. Due to
+these frequent changes to the parameters of the game, esport analytics models
+can have a short life-span, a problem which is largely ignored within the
+literature. This paper extracts information from game design (i.e., patch
+notes) and utilises clustering techniques to propose a new form of character
+representation. As a case study, a neural network model is trained to predict
+the number of kills in a Dota 2 match utilising this novel character
+representation technique. The performance of this model is then evaluated
+against two distinct baselines, including conventional techniques. Not only
+did the model significantly outperform the baselines in terms of accuracy
+(85% AUC), but it also maintained this accuracy in two newer iterations of
+the game that introduced one new character and a brand-new character type.
+Such changes to the design of the game would typically break the conventional
+techniques commonly used within the literature. Therefore, the proposed
+methodology for representing characters can increase the life-span of machine
+learning models and contribute to higher performance when compared to the
+traditional techniques typically employed within the literature.
+
+
+
+
+
+ + ♻ ☆ Beyond Individual Input for Deep Anomaly Detection on Tabular Data + + +
+ Anomaly detection is crucial in various domains, such as finance, healthcare, +and cybersecurity. In this paper, we propose a novel deep anomaly detection +method for tabular data that leverages Non-Parametric Transformers (NPTs), a +model initially proposed for supervised tasks, to capture both feature-feature +and sample-sample dependencies. In a reconstruction-based framework, we train +the NPT to reconstruct masked features of normal samples. In a non-parametric +fashion, we leverage the whole training set during inference and use the +model's ability to reconstruct the masked features to generate an +anomaly score. To the best of our knowledge, our proposed method is the first +to successfully combine feature-feature and sample-sample dependencies for +anomaly detection on tabular datasets. We evaluate our method on an extensive +benchmark of 31 tabular datasets and demonstrate that our approach outperforms +existing state-of-the-art methods based on the F1-score and AUROC by a +significant margin.
+ 
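A minimal sketch of the masked-reconstruction scoring idea, with a trivial per-feature mean imputer standing in for the NPT (our own illustration, not the paper's model); the key point is that each feature of a query row is masked, predicted from the normal training context, and the aggregated reconstruction error becomes the anomaly score.

import numpy as np

def anomaly_scores(train: np.ndarray, queries: np.ndarray) -> np.ndarray:
    """Score each query by how poorly its masked features are reconstructed.
    A real NPT would attend over both features and the training samples."""
    scores = np.zeros(len(queries))
    for j in range(train.shape[1]):
        reconstruction = train[:, j].mean()            # mask feature j, predict it
        scores += (queries[:, j] - reconstruction) ** 2
    return scores

rng = np.random.default_rng(0)
normal = rng.normal(size=(256, 8))                      # normal training rows
test = np.vstack([rng.normal(size=(4, 8)),              # in-distribution queries
                  rng.normal(5.0, 1.0, size=(4, 8))])   # shifted (anomalous) queries
print(anomaly_scores(normal, test))                     # anomalous rows score higher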
+
+
+
+
+ + ♻ ☆ Architecture-Preserving Provable Repair of Deep Neural Networks + + +
+ Deep neural networks (DNNs) are becoming increasingly important components of +software, and are considered the state-of-the-art solution for a number of +problems, such as image recognition. However, DNNs are far from infallible, and +incorrect behavior of DNNs can have disastrous real-world consequences. This +paper addresses the problem of architecture-preserving V-polytope provable +repair of DNNs. A V-polytope defines a convex bounded polytope using its vertex +representation. V-polytope provable repair guarantees that the repaired DNN +satisfies the given specification on the infinite set of points in the given +V-polytope. An architecture-preserving repair only modifies the parameters of +the DNN, without modifying its architecture. The repair has the flexibility to +modify multiple layers of the DNN, and runs in polynomial time. It supports +DNNs with activation functions that have some linear pieces, as well as +fully-connected, convolutional, pooling and residual layers. To the best of our +knowledge, this is the first provable repair approach that has all of these +features. We implement our approach in a tool called APRNN. Using MNIST, +ImageNet, and ACAS Xu DNNs, we show that it has better efficiency, scalability, +and generalization compared to PRDNN and REASSURE, prior provable repair +methods that are not architecture preserving.
+ 
+
+ comment: Accepted paper at PLDI 2023. Tool is available at + https://github.com/95616ARG/APRNN/ +
+
+
+
+
+ + ♻ ☆ Are demographically invariant models and representations in medical + imaging fair? + + +
+ Medical imaging models have been shown to encode information about patient +demographics such as age, race, and sex in their latent representation, raising +concerns about their potential for discrimination. Here, we ask whether +requiring models not to encode demographic attributes is desirable. We point +out that marginal and class-conditional representation invariance imply the +standard group fairness notions of demographic parity and equalized odds, +respectively, while additionally requiring risk distribution matching, thus +potentially equalizing away important group differences. Enforcing the +traditional fairness notions directly instead does not entail these strong +constraints. Moreover, representationally invariant models may still take +demographic attributes into account for deriving predictions. The latter can be +prevented using counterfactual notions of (individual) fairness or invariance. +We caution, however, that properly defining medical image counterfactuals with +respect to demographic attributes is highly challenging. Finally, we posit that +encoding demographic attributes may even be advantageous if it enables learning +a task-specific encoding of demographic features that does not rely on social +constructs such as 'race' and 'gender.' We conclude that demographically +invariant representations are neither necessary nor sufficient for fairness in +medical imaging. Models may need to encode demographic attributes, lending +further urgency to calls for comprehensive model fairness assessments in terms +of predictive performance across diverse patient groups. + +
+
+
+
+
+ + ♻ ☆ SciRE-Solver: Efficient Sampling of Diffusion Probabilistic Models by + Score-integrand Solver with Recursive Derivative Estimation + + +
+ Diffusion probabilistic models (DPMs) are a powerful class of generative +models known for their ability to generate high-fidelity image samples. A major +challenge in the implementation of DPMs is the slow sampling process. In this +work, we introduce a high-efficiency sampler for DPMs. Specifically, we propose a +score-based exact solution paradigm for the diffusion ODEs corresponding to the +sampling process of DPMs, which introduces a new perspective on developing +numerical algorithms for solving diffusion ODEs. To achieve an efficient +sampler, we propose a recursive derivative estimation (RDE) method to reduce +the estimation error. With our proposed solution paradigm and RDE method, we +propose the score-integrand solver with a convergence-order guarantee +(SciRE-Solver), an efficient solver for diffusion ODEs. The SciRE-Solver +attains state-of-the-art (SOTA) sampling performance with a limited number of +score function evaluations (NFE) on both discrete-time and continuous-time DPMs +in comparison to existing training-free sampling algorithms. For example, we +achieve $3.48$ FID with $12$ NFE and $2.42$ FID with $20$ NFE for +continuous-time DPMs on CIFAR10, respectively. Different from other samplers, +SciRE-Solver has the promising potential to surpass the FIDs achieved in the +original papers of some pre-trained models with a small number of NFEs. For example, we +reach SOTA values of $2.40$ FID with $100$ NFE for continuous-time DPM and +$3.15$ FID with $84$ NFE for discrete-time DPM on CIFAR-10, as well as +$2.17$ ($2.02$) FID with $18$ ($50$) NFE for discrete-time DPM on CelebA +64$\times$64.
+ 
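For context, training-free samplers of this kind numerically integrate the probability-flow ODE associated with the diffusion process. In the standard score-based formulation (our summary of the common setup, not the paper's specific score-integrand derivation), this ODE reads $\frac{\mathrm{d}\mathbf{x}_t}{\mathrm{d}t} = f(t)\,\mathbf{x}_t - \frac{1}{2} g(t)^{2}\, \nabla_{\mathbf{x}}\log p_t(\mathbf{x}_t)$, where the score $\nabla_{\mathbf{x}}\log p_t$ is approximated by the trained noise-prediction network (for instance via $-\epsilon_\theta(\mathbf{x}_t, t)/\sigma_t$), and the solver's goal is to integrate this ODE accurately with as few score evaluations (NFE) as possible.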
+
+
+
+
+ + ♻ ☆ Stochastic Constrained DRO with a Complexity Independent of Sample Size + + +
+ Distributionally Robust Optimization (DRO), as a popular method to train +robust models against distribution shift between training and test sets, has +received tremendous attention in recent years. In this paper, we propose and +analyze stochastic algorithms that apply to both non-convex and convex losses +for solving the Kullback-Leibler divergence constrained DRO problem. Compared with +existing methods solving this problem, our stochastic algorithms not only enjoy +a competitive, if not better, complexity that is independent of the sample size, but also +require only a constant batch size at every iteration, which is more practical for +broad applications. We establish a nearly optimal complexity bound for finding +an $\epsilon$-stationary solution for non-convex losses and an optimal +complexity for finding an $\epsilon$-optimal solution for convex losses. +Empirical studies demonstrate the effectiveness of the proposed algorithms for +solving non-convex and convex constrained DRO problems.
+ 
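For readers unfamiliar with the setup, the KL-constrained DRO problem referred to above is commonly written as $\min_{\mathbf{w}} \max_{\mathbf{p} \in \Delta_n,\; \mathrm{KL}(\mathbf{p}\,\|\,\mathbf{1}/n) \le \rho} \sum_{i=1}^{n} p_i\, \ell(\mathbf{w}; z_i)$, where the inner adversary reweights the $n$ training losses subject to a KL-divergence budget $\rho$. This is our paraphrase of the standard formulation and may differ from the paper's exact notation.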
+
+ comment: 37 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Non-linear Embeddings in Hilbert Simplex Geometry + + +
+ A key technique of machine learning and computer vision is to embed discrete +weighted graphs into continuous spaces for further downstream processing. +Embedding discrete hierarchical structures in hyperbolic geometry has proven +very successful since it was shown that any weighted tree can be embedded in +that geometry with arbitrarily low distortion. Various optimization methods for +hyperbolic embeddings based on common models of hyperbolic geometry have been +studied. In this paper, we consider Hilbert geometry for the standard simplex +which is isometric to a vector space equipped with the variation polytope norm. +We study the representation power of this Hilbert simplex geometry by embedding +distance matrices of graphs. Our findings demonstrate that Hilbert simplex +geometry is competitive with alternative geometries such as the Poincar\'e +hyperbolic ball or the Euclidean geometry for embedding tasks while being fast +and numerically robust.
+ 
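One commonly used closed form for the Hilbert metric on the open probability simplex is the spread (maximum minus minimum) of the coordinate-wise log-ratios, which is exactly what makes the isometry to a variation-norm vector space computationally convenient. A small Python illustration, given as our own sketch rather than the paper's embedding pipeline:

import numpy as np

def hilbert_simplex_distance(p: np.ndarray, q: np.ndarray) -> float:
    """Hilbert distance between two points of the open probability simplex:
    the max-minus-min spread of the coordinate-wise log-ratios."""
    r = np.log(p) - np.log(q)
    return float(r.max() - r.min())

p = np.array([0.2, 0.3, 0.5])
q = np.array([0.1, 0.6, 0.3])
print(hilbert_simplex_distance(p, q))   # symmetric, fast, no curvature parameter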
+
+ comment: 21 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ DiffGuard: Semantic Mismatch-Guided Out-of-Distribution Detection using + Pre-trained Diffusion Models ICCV2023 + + +
+ Given a classifier, the inherent property of semantic Out-of-Distribution +(OOD) samples is that their contents differ from all legal classes in terms of +semantics, namely semantic mismatch. There is a recent work that directly +applies it to OOD detection, which employs a conditional Generative Adversarial +Network (cGAN) to enlarge semantic mismatch in the image space. While achieving +remarkable OOD detection performance on small datasets, it is not applicable to +ImageNet-scale datasets due to the difficulty in training cGANs with both input +images and labels as conditions. As diffusion models are much easier to train +and amenable to various conditions compared to cGANs, in this work, we propose +to directly use pre-trained diffusion models for semantic mismatch-guided OOD +detection, named DiffGuard. Specifically, given an OOD input image and the +predicted label from the classifier, we try to enlarge the semantic difference +between the reconstructed OOD image under these conditions and the original +input image. We also present several test-time techniques to further strengthen +such differences. Experimental results show that DiffGuard is effective on both +Cifar-10 and hard cases of the large-scale ImageNet, and it can be easily +combined with existing OOD detection techniques to achieve state-of-the-art OOD +detection results. + +
+
+ comment: Accepted by ICCV2023, with supplementary materials +
+
+
+
+
+ + ♻ ☆ Learning Ability of Interpolating Deep Convolutional Neural Networks + + +
+ It is frequently observed that overparameterized neural networks generalize +well. Regarding such phenomena, existing theoretical work is mainly devoted to +linear settings or fully-connected neural networks. This paper studies the +learning ability of an important family of deep neural networks, deep +convolutional neural networks (DCNNs), under both underparameterized and +overparameterized settings. We establish the first learning rates of +underparameterized DCNNs without the parameter or function variable structure +restrictions presented in the literature. We also show that by adding +well-defined layers to a non-interpolating DCNN, we can obtain some +interpolating DCNNs that maintain the good learning rates of the +non-interpolating DCNN. This result is achieved by a novel network deepening +scheme designed for DCNNs. Our work provides theoretical verification of how +overfitted DCNNs generalize well.
+ 
+
+
+
+
+ + ♻ ☆ A Scalable Test Problem Generator for Sequential Transfer Optimization + + +
+ Sequential transfer optimization (STO), which aims to improve the +optimization performance on a task at hand by exploiting the knowledge captured +from several previously-solved optimization tasks stored in a database, has +been gaining increasing research attention over the years. However, despite +remarkable advances in algorithm design, the development of a systematic +benchmark suite for comprehensive comparisons of STO algorithms received far +less attention. Existing test problems are either simply generated by +assembling other benchmark functions or extended from specific practical +problems with limited variations. The relationships between the optimal +solutions of the source and target tasks in these problems are always manually +configured, limiting their ability to model different relationships presented +in real-world problems. Consequently, the good performance achieved by an +algorithm on these problems might be biased and could not be generalized to +other problems. In light of the above, in this study, we first introduce four +rudimentary concepts for characterizing STO problems (STOPs) and present an +important problem feature, namely similarity distribution, which quantitatively +delineates the relationship between the optima of the source and target tasks. +Then, we propose the general design guidelines and a problem generator with +superior scalability. Specifically, the similarity distribution of an STOP can +be easily customized, enabling a continuous spectrum of representation of the +diverse similarity relationships of real-world problems. Lastly, a benchmark +suite with 12 STOPs featured by a variety of customized similarity +relationships is developed using the proposed generator, which would serve as +an arena for STO algorithms and provide more comprehensive evaluation results. +The source code of the problem generator is available at +https://github.com/XmingHsueh/STOP-G. + +
+
+
+
+
+ + ♻ ☆ Capturing the Diffusive Behavior of the Multiscale Linear Transport + Equations by Asymptotic-Preserving Convolutional DeepONets + + +
+ In this paper, we introduce two types of novel Asymptotic-Preserving +Convolutional Deep Operator Networks (APCONs) designed to address the +multiscale time-dependent linear transport problem. We observe that the vanilla +physics-informed DeepONets with modified MLP may exhibit instability in +maintaining the desired limiting macroscopic behavior. Therefore, this +necessitates the utilization of an asymptotic-preserving loss function. Drawing +inspiration from the heat kernel in the diffusion equation, we propose a new +architecture called Convolutional Deep Operator Networks, which employ multiple +local convolution operations instead of a global heat kernel, along with +pooling and activation operations in each filter layer. Our APCON methods +possess a parameter count that is independent of the grid size and are capable +of capturing the diffusive behavior of the linear transport problem. Finally, +we validate the effectiveness of our methods through several numerical +examples. + +
+
+
+
+
+ + ♻ ☆ CUTS+: High-dimensional Causal Discovery from Irregular Time-series AAAI-24 + + +
+ Causal discovery in time-series is a fundamental problem in the machine +learning community, enabling causal reasoning and decision-making in complex +scenarios. Recently, researchers have successfully discovered causality by combining +neural networks with Granger causality, but their performance degrades considerably +when encountering high-dimensional data because of the highly redundant network +design and huge causal graphs. Moreover, the missing entries in the +observations further hamper the causal structural learning. To overcome these +limitations, we propose CUTS+, which is built on the Granger-causality-based +causal discovery method CUTS and improves scalability by introducing a +technique called Coarse-to-fine-discovery (C2FD) and leveraging a +message-passing-based graph neural network (MPGNN). Compared to previous +methods on simulated, quasi-real, and real datasets, we show that CUTS+ largely +improves the causal discovery performance on high-dimensional data with +different types of irregular sampling.
+ 
+
+ comment: Submit to AAAI-24 +
+
+
+
+
+ + ♻ ☆ ST-former for short-term passenger flow prediction during COVID-19 in + urban rail transit system + + +
+ Accurate passenger flow prediction of urban rail transit is essential for +improving the performance of intelligent transportation systems, especially +during the epidemic. How to dynamically model the complex spatiotemporal +dependencies of passenger flow is the main issue in achieving accurate +passenger flow prediction during the epidemic. To solve this issue, this paper +proposes a brand-new transformer-based architecture called ST-former under the +encoder-decoder framework specifically for COVID-19. Concretely, we develop a +modified self-attention mechanism named Causal-Convolution ProbSparse +Self-Attention (CPSA) to model the multiple temporal dependencies of passenger +flow with low computational costs. To capture the complex and dynamic spatial +dependencies, we introduce a novel Adaptive Multi-Graph Convolution Network +(AMGCN) by leveraging multiple graphs in a self-adaptive manner. Additionally, +the Multi-source Data Fusion block fuses the passenger flow data, COVID-19 +confirmed case data, and the relevant social media data to study the impact of +COVID-19 on passenger flow. Experiments on real-world passenger flow datasets +demonstrate the superiority of ST-former over eleven other state-of-the-art +methods. Several ablation studies are carried out to verify the effectiveness +and reliability of our model structure. Results can provide critical insights +for the operation of URT systems.
+ 
+
+ comment: There are some errors that might mislead readers for this version. + There is no new version right now +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Attention Fusion Network for short-term passenger flow + prediction on holidays in urban rail transit systems + + +
+ Short-term passenger flow prediction for urban rail transit systems is +of great significance for traffic operation and management. The emerging deep +learning-based models provide effective methods to improve prediction accuracy. +However, most of the existing models mainly predict the passenger flow on +general weekdays or weekends. Only a few studies focus on predicting +the passenger flow on holidays, which is a particularly challenging task for +traffic management because of its suddenness and irregularity. To this end, we +propose a deep learning-based model named Spatial Temporal Attention Fusion +Network comprising a novel Multi-Graph Attention Network, a Conv-Attention +Block, and a Feature Fusion Block for short-term passenger flow prediction on +holidays. The multi-graph attention network is applied to extract the complex +spatial dependencies of passenger flow dynamically and the conv-attention block +is applied to extract the temporal dependencies of passenger flow from global +and local perspectives. Moreover, in addition to the historical passenger flow +data, social media data, which have been shown to effectively +reflect the evolution trend of passenger flow during events, are also fused into +the feature fusion block of STAFN. The STAFN is tested on two large-scale urban +rail transit AFC datasets from China on the New Year holiday, and the +prediction performance of the model is compared with that of several +conventional prediction models. Results demonstrate its robustness and +advantages over benchmark methods, providing strong support for +practical applications of short-term passenger flow prediction on holidays.
+ 
+
+ comment: There are some errors that might mislead readers for this version. + There is no new version right now +
+
+
+
+
+ + ♻ ☆ STG-GAN: A spatiotemporal graph generative adversarial networks for + short-term passenger flow prediction in urban rail transit systems + + +
+ Short-term passenger flow prediction is an important but challenging task for +better managing urban rail transit (URT) systems. Some emerging deep learning +models provide good insights to improve short-term prediction accuracy. +However, there exist many complex spatiotemporal dependencies in URT systems. +Most previous methods only consider the absolute error between ground truth and +predictions as the optimization objective, which fails to account for spatial +and temporal constraints on the predictions. Furthermore, a large number of +existing prediction models introduce complex neural network layers to improve +accuracy while ignoring their training efficiency and memory occupancy, +decreasing their chances of being applied in the real world. To overcome these +limitations, we propose a novel deep learning-based spatiotemporal graph +generative adversarial network (STG-GAN) model with higher prediction accuracy, +higher efficiency, and lower memory occupancy to predict short-term passenger +flows of the URT network. Our model consists of two major parts, which are +optimized in an adversarial learning manner: (1) a generator network including +gated temporal convolutional networks (TCN) and weight-sharing graph convolution +networks (GCN) to capture structural spatiotemporal dependencies and generate +predictions with a relatively small computational burden; (2) a discriminator +network including a spatial discriminator and a temporal discriminator to +enhance the spatial and temporal constraints of the predictions. The STG-GAN is +evaluated on two large-scale real-world datasets from Beijing Subway. A +comparison with several state-of-the-art models illustrates its +superiority and robustness. This study can provide critical experience in +conducting short-term passenger flow predictions, especially from the +perspective of real-world applications.
+ 
+
+ comment: There are some errors that might mislead readers for this version. + There is no new version right now +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning with Multitask Episodic Memory Based on + Task-Conditioned Hypernetwork + + +
+ Deep reinforcement learning algorithms are usually impeded by sampling +inefficiency, heavily depending on multiple interactions with the environment +to acquire accurate decision-making capabilities. In contrast, humans rely on +their hippocampus to retrieve relevant information from past experiences of +relevant tasks, which guides their decision-making when learning a new task, +rather than exclusively depending on environmental interactions. Nevertheless, +designing a hippocampus-like module for an agent to incorporate past +experiences into established reinforcement learning algorithms presents two +challenges. The first challenge involves selecting the most relevant past +experiences for the current task, and the second challenge is integrating such +experiences into the decision network. To address these challenges, we propose +a novel method that utilizes a retrieval network based on a task-conditioned +hypernetwork, which adapts the retrieval network's parameters depending on the +task. At the same time, a dynamic modification mechanism enhances the +collaborative efforts between the retrieval and decision networks. We evaluate +the proposed method on the MiniGrid environment. The experimental results +demonstrate that our proposed method significantly outperforms strong +baselines.
+ 
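To make the hypernetwork idea concrete, here is a toy PyTorch sketch in which a task embedding generates the weights of a small linear retrieval head; all shapes and names are invented for illustration and do not reproduce the paper's architecture.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TaskConditionedHyperNet(nn.Module):
    """Toy hypernetwork: a task embedding generates the weight matrix and bias
    of a small linear retrieval head applied to a query representation."""
    def __init__(self, task_dim: int = 16, in_dim: int = 32, out_dim: int = 8):
        super().__init__()
        self.in_dim, self.out_dim = in_dim, out_dim
        self.weight_gen = nn.Linear(task_dim, in_dim * out_dim + out_dim)

    def forward(self, task_emb: torch.Tensor, query: torch.Tensor) -> torch.Tensor:
        params = self.weight_gen(task_emb)
        w = params[: self.in_dim * self.out_dim].view(self.out_dim, self.in_dim)
        b = params[self.in_dim * self.out_dim:]
        return F.linear(query, w, b)   # retrieval head whose weights depend on the task

net = TaskConditionedHyperNet()
task_embedding = torch.randn(16)       # one embedding per task
query_batch = torch.randn(4, 32)
print(net(task_embedding, query_batch).shape)   # torch.Size([4, 8])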
+
+
+
+
+ + ♻ ☆ Interaction-Aware Personalized Vehicle Trajectory Prediction Using + Temporal Graph Neural Networks + + +
+ Accurate prediction of vehicle trajectories is vital for advanced driver +assistance systems and autonomous vehicles. Existing methods mainly rely on +generic trajectory predictions derived from large datasets, overlooking the +personalized driving patterns of individual drivers. To address this gap, we +propose an approach for interaction-aware personalized vehicle trajectory +prediction that incorporates temporal graph neural networks. Our method +utilizes Graph Convolution Networks (GCN) and Long Short-Term Memory (LSTM) to +model the spatio-temporal interactions between target vehicles and their +surrounding traffic. To personalize the predictions, we establish a pipeline +that leverages transfer learning: the model is initially pre-trained on a +large-scale trajectory dataset and then fine-tuned for each driver using their +specific driving data. We employ human-in-the-loop simulation to collect +personalized naturalistic driving trajectories and corresponding surrounding +vehicle trajectories. Experimental results demonstrate the superior performance +of our personalized GCN-LSTM model, particularly for longer prediction +horizons, compared to its generic counterpart. Moreover, the personalized model +outperforms individual models created without pre-training, emphasizing the +significance of pre-training on a large dataset to avoid overfitting. By +incorporating personalization, our approach enhances trajectory prediction +accuracy. + +
+
+
+
+
+ + ♻ ☆ Neuro-Dynamic State Estimation for Networked Microgrids + + +
+ We devise neuro-dynamic state estimation (Neuro-DSE), a learning-based +dynamic state estimation (DSE) algorithm for networked microgrids (NMs) under +unknown subsystems. Our contributions include: 1) a data-driven Neuro-DSE +algorithm for NMs DSE with partially unidentified dynamic models, which +incorporates the neural-ordinary-differential-equations (ODE-Net) into Kalman +filters; 2) a self-refining Neuro-DSE algorithm (Neuro-DSE+) which enables +data-driven DSE under limited and noisy measurements by establishing an +automatic filtering, augmenting and correcting framework; 3) a +Neuro-KalmanNet-DSE algorithm which further integrates KalmanNet with Neuro-DSE +to relieve the model mismatch of both neural- and physics-based dynamic models; +and 4) an augmented Neuro-DSE for joint estimation of NMs states and unknown +parameters (e.g., inertia). Extensive case studies demonstrate the efficacy of +Neuro-DSE and its variants under different noise levels, control modes, power +sources, observabilities and model knowledge, respectively. + +
+
+ comment: This paper needs to be withdrawn by the author. In Section II, Part + C, there is lack of procedure to achieve parameter estimation using the + proposed model. In Section V, Part E, experiment parameter setting is missed. + Noise for estimating inertia case needs to be reset for simulation. + Additional tests need to be added. These two parts need to be rewritten +
+
+
+
+
+ + ♻ ☆ DyTed: Disentangled Representation Learning for Discrete-time Dynamic + Graph + + +
+ Unsupervised representation learning for dynamic graphs has attracted a lot +of research attention in recent years. Compared with static graphs, a dynamic +graph is a comprehensive embodiment of both the intrinsic stable +characteristics of nodes and the time-related dynamic preference. However, +existing methods generally mix these two types of information into a single +representation space, which may lead to poor explainability, reduced robustness, and +limited ability when applied to different downstream tasks. To solve the +above problems, in this paper, we propose a novel disenTangled representation +learning framework for discrete-time Dynamic graphs, namely DyTed. We specially +design a temporal-clips contrastive learning task together with a structure +contrastive learning task to effectively identify the time-invariant and +time-varying representations respectively. To further enhance the +disentanglement of these two types of representation, we propose a +disentanglement-aware discriminator under an adversarial learning framework +from the perspective of information theory. Extensive experiments on Tencent +and five commonly used public datasets demonstrate that DyTed, as a general +framework that can be applied to existing methods, achieves state-of-the-art +performance on various downstream tasks, as well as being more robust against +noise.
+ 
+
+
+
+
+ + ♻ ☆ Explainable Machine Learning for Categorical and Mixed Data with + Lossless Visualization + + +
+ Building accurate and interpretable Machine Learning (ML) models for +heterogeneous/mixed data is a long-standing challenge for algorithms designed +for numeric data. This work focuses on developing numeric coding schemes for +non-numeric attributes that allow ML algorithms to build accurate and explainable +models, on methods for lossless visualization of n-D non-numeric categorical data +with visual rule discovery in these visualizations, and on accurate and +explainable ML models for categorical data. This study proposes a +classification of mixed data types and analyzes their important role in Machine +Learning. It presents a toolkit for enforcing interpretability of all internal +operations of ML algorithms on mixed data, together with visual data exploration. +A new Sequential Rule Generation (SRG) algorithm for explainable +rule generation with categorical data is proposed and successfully evaluated in +multiple computational experiments. This work is a step toward full-scope ML +algorithms for mixed data, supported by lossless visualization of n-D +data in General Line Coordinates beyond Parallel Coordinates.
+ 
+
+ comment: 46 pages, 32 figures, 29 tables. arXiv admin note: substantial text + overlap with arXiv:2206.06476 +
+
+
+
+
+ + ♻ ☆ Prompting the Hidden Talent of Web-Scale Speech Models for Zero-Shot + Task Generalization + + +
+ We investigate the emergent abilities of the recently proposed web-scale +speech model Whisper, by adapting it to unseen tasks with prompt engineering. +We selected three tasks: audio-visual speech recognition (AVSR), code-switched +speech recognition (CS-ASR), and speech translation (ST) on unseen language +pairs. We design task-specific prompts, by either leveraging another +large-scale model, or simply manipulating the special tokens in the default +prompts. Experiments show that compared to the default prompts, our proposed +prompts improve performance by 10% to 45% on the three zero-shot tasks, and +even outperform SotA supervised models on some datasets. In addition, our +experiments reveal many interesting properties of Whisper, including its +robustness to prompts, bias on accents, and the multilingual understanding in +its latent space. Code is available at +https://github.com/jasonppy/PromptingWhisper + +
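As a concrete flavour of such prompt manipulation, a hedged sketch using the open-source openai-whisper package is given below; the parameter names follow our reading of its transcribe() interface and may differ across versions, and the audio file and prompt text are purely illustrative rather than the paper's actual prompts.

import whisper  # assumes the open-source openai-whisper package is installed

model = whisper.load_model("small")

# Default behaviour: plain transcription.
plain = model.transcribe("clip.wav")

# Prompt-engineered variants:
# 1) steer code-switched decoding with a bilingual initial prompt,
# 2) force the translation task with an explicit source language.
cs_asr = model.transcribe(
    "clip.wav",
    initial_prompt="以下是中英夹杂的句子 mixed with English words.",
)
st = model.transcribe("clip.wav", task="translate", language="de")

for name, result in [("plain", plain), ("cs-asr", cs_asr), ("st", st)]:
    print(name, result["text"])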
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ♻ ☆ Ablating Concepts in Text-to-Image Diffusion Models ICCV 2023 + + +
+ Large-scale text-to-image diffusion models can generate high-fidelity images +with powerful compositional ability. However, these models are typically +trained on an enormous amount of Internet data, often containing copyrighted +material, licensed images, and personal photos. Furthermore, they have been +found to replicate the style of various living artists or memorize exact +training samples. How can we remove such copyrighted concepts or images without +retraining the model from scratch? To achieve this goal, we propose an +efficient method of ablating concepts in the pretrained model, i.e., preventing +the generation of a target concept. Our algorithm learns to match the image +distribution for a target style, instance, or text prompt we wish to ablate to +the distribution corresponding to an anchor concept. This prevents the model +from generating target concepts given its text condition. Extensive experiments +show that our method can successfully prevent the generation of the ablated +concept while preserving closely related concepts in the model. + +
+
+ comment: ICCV 2023. Project website: https://www.cs.cmu.edu/~concept-ablation/ +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech + Separation + + +
+ The integration of different modalities, such as audio and visual +information, plays a crucial role in human perception of the surrounding +environment. Recent research has made significant progress in designing fusion +modules for audio-visual speech separation. However, they predominantly focus +on multi-modal fusion architectures situated either at the top or bottom +positions, rather than comprehensively considering multi-modal fusion at +various hierarchical positions within the network. In this paper, we propose a +novel model called self- and cross-attention network (SCANet), which leverages +the attention mechanism for efficient audio-visual feature fusion. SCANet +consists of two types of attention blocks: self-attention (SA) and +cross-attention (CA) blocks, where the CA blocks are distributed at the top +(TCA), middle (MCA) and bottom (BCA) of SCANet. These blocks maintain the +ability to learn modality-specific features and enable the extraction of +different semantics from audio-visual features. Comprehensive experiments on +three standard audio-visual separation benchmarks (LRS2, LRS3, and VoxCeleb2) +demonstrate the effectiveness of SCANet, outperforming existing +state-of-the-art (SOTA) methods while maintaining comparable inference time. + +
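A simplified stand-in for one of the cross-attention (CA) blocks described above, written in PyTorch; the dimensions, the single attention layer, and the residual plus LayerNorm arrangement are our illustrative assumptions rather than SCANet's exact design.

import torch
import torch.nn as nn

class CrossAttentionBlock(nn.Module):
    """Audio queries attend to visual keys/values, followed by a residual + LayerNorm."""
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, audio: torch.Tensor, visual: torch.Tensor) -> torch.Tensor:
        fused, _ = self.attn(query=audio, key=visual, value=visual)
        return self.norm(audio + fused)

audio_feats = torch.randn(2, 100, 256)    # (batch, audio frames, dim)
visual_feats = torch.randn(2, 25, 256)    # (batch, video frames, dim)
print(CrossAttentionBlock()(audio_feats, visual_feats).shape)  # torch.Size([2, 100, 256])

Placing such blocks at the top, middle, and bottom of the separation network, as the abstract describes, simply means fusing audio and visual streams at several depths rather than only once.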
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme + Detection ACM MM + + +
+ Hateful meme detection is a challenging multimodal task that requires +comprehension of both vision and language, as well as cross-modal interactions. +Recent studies have tried to fine-tune pre-trained vision-language models +(PVLMs) for this task. However, with increasing model sizes, it becomes +important to leverage powerful PVLMs more efficiently, rather than simply +fine-tuning them. Recently, researchers have attempted to convert meme images +into textual captions and prompt language models for predictions. This approach +has shown good performance but suffers from non-informative image captions. +Considering the two factors mentioned above, we propose a probing-based +captioning approach to leverage PVLMs in a zero-shot visual question answering +(VQA) manner. Specifically, we prompt a frozen PVLM by asking hateful +content-related questions and use the answers as image captions (which we call +Pro-Cap), so that the captions contain information critical for hateful content +detection. The good performance of models with Pro-Cap on three benchmarks +validates the effectiveness and generalization of the proposed method. + +
+
+ comment: Camera-ready for 23, ACM MM +
+
+
+
+
+ + ☆ Improving Anomaly Segmentation with Multi-Granularity Cross-Domain + Alignment + + +
+ Anomaly segmentation plays a crucial role in identifying anomalous objects +within images, which facilitates the detection of road anomalies for autonomous +driving. Although existing methods have shown impressive results in anomaly +segmentation using synthetic training data, the domain discrepancies between +synthetic training data and real test data are often neglected. To address this +issue, the Multi-Granularity Cross-Domain Alignment (MGCDA) framework is +proposed for anomaly segmentation in complex driving environments. It uniquely +combines a new Multi-source Domain Adversarial Training (MDAT) module and a +novel Cross-domain Anomaly-aware Contrastive Learning (CACL) method to boost +the generality of the model, seamlessly integrating multi-domain data at both +scene and sample levels. Multi-source domain adversarial loss and a dynamic +label smoothing strategy are integrated into the MDAT module to facilitate the +acquisition of domain-invariant features at the scene level, through +adversarial training across multiple stages. CACL aligns sample-level +representations with contrastive loss on cross-domain data, which utilizes an +anomaly-aware sampling strategy to efficiently sample hard samples and anchors. +The proposed framework is parameter-free during the +inference stage and is compatible with other anomaly segmentation networks. +Experiments conducted on the Fishyscapes and RoadAnomaly datasets demonstrate that +the proposed framework achieves state-of-the-art performance.
+ 
+
+ comment: Accepted to ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ Understanding User Behavior in Volumetric Video Watching: Dataset, + Analysis and Prediction ACM MM'23 + + +
+ Volumetric video has emerged as an attractive new video paradigm in recent years +since it provides an immersive and interactive 3D viewing experience with six +degrees of freedom (DoF). Unlike traditional 2D or panoramic videos, volumetric +videos require dense point clouds, voxels, meshes, or huge neural models to +depict volumetric scenes, which results in a prohibitively high bandwidth +burden for video delivery. Users' behavior analysis, especially the viewport +and gaze analysis, then plays a significant role in prioritizing the content +streaming within users' viewport and degrading the remaining content to +maximize user QoE with limited bandwidth. Although understanding user behavior +is crucial, to the best of our knowledge, there are no available 3D +volumetric video viewing datasets containing fine-grained user interactivity +features, not to mention further analysis and behavior prediction. In this +paper, we for the first time release a volumetric video viewing behavior +dataset, with a large scale, multiple dimensions, and diverse conditions. We +conduct an in-depth analysis to understand user behaviors when viewing +volumetric videos. Interesting findings on user viewport, gaze, and motion +preference related to different videos and users are revealed. We finally +design a transformer-based viewport prediction model that fuses the features of +both gaze and motion, which is able to achieve high accuracy under various +conditions. Our prediction model is expected to further benefit volumetric +video streaming optimization. Our dataset, along with the corresponding +visualization tools, is accessible at +https://cuhksz-inml.github.io/user-behavior-in-vv-watching/
+ 
+
+ comment: Accepted by ACM MM'23 +
+
+
+
+
+ + ♻ ☆ Seeing through the Brain: Image Reconstruction of Visual Perception from + Human Brain Signals + + +
+ Seeing is believing; however, the underlying mechanism of how human visual +perceptions are intertwined with our cognitions is still a mystery. Thanks to +the recent advances in both neuroscience and artificial intelligence, we have +been able to record the visually evoked brain activities and mimic the visual +perception ability through computational approaches. In this paper, we focus on +visual stimuli reconstruction, i.e., reconstructing the observed +images based on portably accessible brain signals, namely electroencephalography +(EEG) data. Since EEG signals are dynamic in the time-series format and are +notoriously noisy, processing and extracting useful information requires +dedicated effort. To this end, we propose a comprehensive pipeline, +named NeuroImagen, for reconstructing visual stimuli images from EEG signals. +Specifically, we incorporate a novel multi-level perceptual information +decoding to draw multi-grained outputs from the given EEG data. A latent +diffusion model then leverages the extracted information to reconstruct the +high-resolution visual stimuli images. The experimental results +illustrate the effectiveness of image reconstruction and the superior quantitative +performance of our proposed method.
+ 
+
+ comment: A preprint version of an ongoing work +
+
+
+
+
+ + ♻ ☆ VoxBlink: X-Large Speaker Verification Dataset on Camera ICASSP2023 + + +
+ In this paper, we contribute a novel and extensive dataset for speaker +verification, which contains a noisy set of 38k identities/1.45M utterances (VoxBlink) +and a relatively clean set of 18k identities/1.02M utterances (VoxBlink-Clean) for +training. First, we collect a list of 60K+ users together with their avatars and +download their short videos from YouTube. Then, an automatic pipeline is +devised to extract each target user's speech segments and videos, which is efficient +and scalable. To the best of our knowledge, the VoxBlink dataset is the largest +speaker recognition dataset. Second, we conduct a series of experiments based +on VoxBlink-Clean together with VoxCeleb2. Our findings highlight a notable +improvement in performance, ranging from 15% to 30%, across different backbone +architectures, upon integrating our dataset for training. The dataset will be +released soon.
+ 
+
+ comment: submit to ICASSP2023 +
+
+
+
+
+ + ♻ ☆ A Weakly Supervised Approach to Emotion-change Prediction and Improved + Mood Inference + + +
+ Whilst a majority of affective computing research focuses on inferring +emotions, examining mood or understanding the \textit{mood-emotion interplay} +has received significantly less attention. Building on prior work, we (a) +deduce and incorporate emotion-change ($\Delta$) information for inferring +mood, without resorting to annotated labels, and (b) attempt mood prediction +for long duration video clips, in alignment with the characterisation of mood. +We generate the emotion-change ($\Delta$) labels via metric learning from a +pre-trained Siamese Network, and use these in addition to mood labels for mood +classification. Experiments evaluating \textit{unimodal} (training only using +mood labels) vs \textit{multimodal} (training using mood plus $\Delta$ labels) +models show that mood prediction benefits from the incorporation of +emotion-change information, emphasising the importance of modelling the +mood-emotion interplay for effective mood inference. + +
+
+ comment: 9 pages, 3 figures, 6 tables, published in IEEE International + Conference on Affective Computing and Intelligent Interaction +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 
 
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/collapse all entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`